Repository: adidas/lakehouse-engine Branch: master Commit: 1487dfdcafbf Files: 1183 Total size: 3.2 MB Directory structure: gitextract_pl4w_c1i/ ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── feature_request.md │ └── pull_request_template.md ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE.txt ├── Makefile ├── README.md ├── assets/ │ └── gab/ │ ├── metadata/ │ │ ├── gab/ │ │ │ └── f_agg_dummy_sales_kpi/ │ │ │ ├── 1_article_category.sql │ │ │ └── 2_f_agg_dummy_sales_kpi.sql │ │ └── tables/ │ │ ├── dim_calendar.sql │ │ ├── dummy_sales_kpi.sql │ │ ├── gab_log_events.sql │ │ ├── gab_use_case_results.sql │ │ └── lkp_query_builder.sql │ ├── notebooks/ │ │ ├── gab.py │ │ ├── gab_dim_calendar.py │ │ ├── gab_job_manager.py │ │ └── query_builder_helper.py │ └── utils/ │ ├── databricks_job_utils.py │ └── query_builder_utils.py ├── cicd/ │ ├── .bumpversion.cfg │ ├── Dockerfile │ ├── Jenkinsfile │ ├── Jenkinsfile_deploy │ ├── bandit.yaml │ ├── code_doc/ │ │ ├── content.css │ │ ├── custom_example_macros.py │ │ ├── examples.json │ │ ├── gen_ref_nav.py │ │ ├── index.html.jinja2 │ │ ├── mkdocs.yml │ │ ├── mkdocs_macros.py │ │ ├── module.html.jinja2 │ │ ├── render_doc.py │ │ └── render_docs.py │ ├── flake8.conf │ ├── meta.yaml │ ├── requirements.txt │ ├── requirements_azure.txt │ ├── requirements_cicd.txt │ ├── requirements_dq.txt │ ├── requirements_os.txt │ ├── requirements_sftp.txt │ └── requirements_sharepoint.txt ├── lakehouse_engine/ │ ├── __init__.py │ ├── algorithms/ │ │ ├── __init__.py │ │ ├── algorithm.py │ │ ├── data_loader.py │ │ ├── dq_validator.py │ │ ├── exceptions.py │ │ ├── gab.py │ │ ├── reconciliator.py │ │ ├── sensor.py │ │ └── sensors/ │ │ ├── __init__.py │ │ ├── heartbeat.py │ │ └── sensor.py │ ├── configs/ │ │ ├── __init__.py │ │ └── engine.yaml │ ├── core/ │ │ ├── __init__.py │ │ ├── dbfs_file_manager.py │ │ ├── definitions.py │ │ ├── exec_env.py │ │ ├── executable.py │ │ ├── file_manager.py │ │ ├── gab_manager.py │ │ ├── gab_sql_generator.py │ 
│ ├── s3_file_manager.py │ │ ├── sensor_manager.py │ │ └── table_manager.py │ ├── dq_processors/ │ │ ├── __init__.py │ │ ├── custom_expectations/ │ │ │ ├── __init__.py │ │ │ ├── expect_column_pair_a_to_be_not_equal_to_b.py │ │ │ ├── expect_column_pair_a_to_be_smaller_or_equal_than_b.py │ │ │ ├── expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b.py │ │ │ ├── expect_column_values_to_be_date_not_older_than.py │ │ │ ├── expect_column_values_to_not_be_null_or_empty_string.py │ │ │ ├── expect_multicolumn_column_a_must_equal_b_or_c.py │ │ │ └── expect_queried_column_agg_value_to_be.py │ │ ├── dq_factory.py │ │ ├── exceptions.py │ │ └── validator.py │ ├── engine.py │ ├── io/ │ │ ├── __init__.py │ │ ├── exceptions.py │ │ ├── reader.py │ │ ├── reader_factory.py │ │ ├── readers/ │ │ │ ├── __init__.py │ │ │ ├── dataframe_reader.py │ │ │ ├── file_reader.py │ │ │ ├── jdbc_reader.py │ │ │ ├── kafka_reader.py │ │ │ ├── query_reader.py │ │ │ ├── sap_b4_reader.py │ │ │ ├── sap_bw_reader.py │ │ │ ├── sftp_reader.py │ │ │ ├── sharepoint_reader.py │ │ │ └── table_reader.py │ │ ├── writer.py │ │ ├── writer_factory.py │ │ └── writers/ │ │ ├── __init__.py │ │ ├── console_writer.py │ │ ├── dataframe_writer.py │ │ ├── delta_merge_writer.py │ │ ├── file_writer.py │ │ ├── jdbc_writer.py │ │ ├── kafka_writer.py │ │ ├── rest_api_writer.py │ │ ├── sharepoint_writer.py │ │ └── table_writer.py │ ├── terminators/ │ │ ├── __init__.py │ │ ├── cdf_processor.py │ │ ├── dataset_optimizer.py │ │ ├── notifier.py │ │ ├── notifier_factory.py │ │ ├── notifiers/ │ │ │ ├── __init__.py │ │ │ ├── email_notifier.py │ │ │ ├── exceptions.py │ │ │ └── notification_templates.py │ │ ├── sensor_terminator.py │ │ ├── spark_terminator.py │ │ └── terminator_factory.py │ ├── transformers/ │ │ ├── __init__.py │ │ ├── aggregators.py │ │ ├── column_creators.py │ │ ├── column_reshapers.py │ │ ├── condensers.py │ │ ├── custom_transformers.py │ │ ├── data_maskers.py │ │ ├── date_transformers.py │ │ ├── 
exceptions.py │ │ ├── filters.py │ │ ├── joiners.py │ │ ├── null_handlers.py │ │ ├── optimizers.py │ │ ├── regex_transformers.py │ │ ├── repartitioners.py │ │ ├── transformer_factory.py │ │ ├── unions.py │ │ └── watermarker.py │ └── utils/ │ ├── __init__.py │ ├── acon_utils.py │ ├── configs/ │ │ ├── __init__.py │ │ └── config_utils.py │ ├── databricks_utils.py │ ├── dq_utils.py │ ├── engine_usage_stats.py │ ├── expectations_utils.py │ ├── extraction/ │ │ ├── __init__.py │ │ ├── jdbc_extraction_utils.py │ │ ├── sap_b4_extraction_utils.py │ │ ├── sap_bw_extraction_utils.py │ │ └── sftp_extraction_utils.py │ ├── file_utils.py │ ├── gab_utils.py │ ├── logging_handler.py │ ├── rest_api.py │ ├── schema_utils.py │ ├── sharepoint_utils.py │ ├── spark_utils.py │ ├── sql_parser_utils.py │ └── storage/ │ ├── __init__.py │ ├── dbfs_storage.py │ ├── file_storage.py │ ├── file_storage_functions.py │ ├── local_fs_storage.py │ └── s3_storage.py ├── lakehouse_engine_usage/ │ ├── __init__.py │ ├── data_loader/ │ │ ├── __init__.py │ │ ├── append_load_from_jdbc_with_permissive_mode/ │ │ │ ├── __init__.py │ │ │ └── append_load_from_jdbc_with_permissive_mode.md │ │ ├── append_load_with_failfast/ │ │ │ ├── __init__.py │ │ │ └── append_load_with_failfast.md │ │ ├── batch_delta_load_init_delta_backfill_with_merge/ │ │ │ ├── __init__.py │ │ │ └── batch_delta_load_init_delta_backfill_with_merge.md │ │ ├── custom_transformer/ │ │ │ ├── __init__.py │ │ │ ├── custom_transformer.md │ │ │ └── sql_custom_transformer.md │ │ ├── custom_transformer_sql/ │ │ │ ├── __init__.py │ │ │ └── custom_transformer_sql.md │ │ ├── data_loader.md │ │ ├── extract_from_sap_b4_adso/ │ │ │ ├── __init__.py │ │ │ └── extract_from_sap_b4_adso.md │ │ ├── extract_from_sap_bw_dso/ │ │ │ ├── __init__.py │ │ │ └── extract_from_sap_bw_dso.md │ │ ├── extract_from_sftp/ │ │ │ ├── __init__.py │ │ │ └── extract_from_sftp.md │ │ ├── extract_using_jdbc_connection/ │ │ │ ├── __init__.py │ │ │ └── extract_using_jdbc_connection.md │ │ 
├── filtered_full_load/ │ │ │ ├── __init__.py │ │ │ └── filtered_full_load.md │ │ ├── filtered_full_load_with_selective_replace/ │ │ │ ├── __init__.py │ │ │ └── filtered_full_load_with_selective_replace.md │ │ ├── flatten_schema_and_explode_columns/ │ │ │ ├── __init__.py │ │ │ └── flatten_schema_and_explode_columns.md │ │ ├── full_load/ │ │ │ ├── __init__.py │ │ │ └── full_load.md │ │ ├── read_from_dataframe/ │ │ │ ├── __init__.py │ │ │ └── read_from_dataframe.md │ │ ├── read_from_sharepoint/ │ │ │ ├── __init__.py │ │ │ └── read_from_sharepoint.md │ │ ├── streaming_append_load_with_malformed/ │ │ │ ├── __init__.py │ │ │ └── streaming_append_load_with_malformed.md │ │ ├── streaming_append_load_with_terminator/ │ │ │ ├── __init__.py │ │ │ └── streaming_append_load_with_terminator.md │ │ ├── streaming_delta_load_with_group_and_rank_condensation/ │ │ │ ├── __init__.py │ │ │ └── streaming_delta_load_with_group_and_rank_condensation.md │ │ ├── streaming_delta_with_late_arriving_and_out_of_order_events/ │ │ │ ├── __init__.py │ │ │ └── streaming_delta_with_late_arriving_and_out_of_order_events.md │ │ ├── write_and_read_dataframe/ │ │ │ ├── __init__.py │ │ │ └── write_and_read_dataframe.md │ │ ├── write_to_console/ │ │ │ ├── __init__.py │ │ │ └── write_to_console.md │ │ ├── write_to_rest_api/ │ │ │ ├── __init__.py │ │ │ └── write_to_rest_api.md │ │ └── write_to_sharepoint/ │ │ ├── __init__.py │ │ └── write_to_sharepoint.md │ ├── data_quality/ │ │ ├── __init__.py │ │ ├── custom_expectations/ │ │ │ ├── __init__.py │ │ │ └── custom_expectations.md │ │ ├── data_quality.md │ │ ├── data_quality_validator/ │ │ │ ├── __init__.py │ │ │ └── data_quality_validator.md │ │ ├── minimal_example/ │ │ │ ├── __init__.py │ │ │ └── minimal_example.md │ │ ├── prisma/ │ │ │ ├── __init__.py │ │ │ └── prisma.md │ │ ├── result_sink/ │ │ │ ├── __init__.py │ │ │ └── result_sink.md │ │ ├── row_tagging/ │ │ │ ├── __init__.py │ │ │ └── row_tagging.md │ │ └── validations_failing/ │ │ ├── __init__.py │ │ 
└── validations_failing.md │ ├── gab/ │ │ ├── __init__.py │ │ ├── gab.md │ │ └── step_by_step/ │ │ ├── __init__.py │ │ └── step_by_step.md │ ├── lakehouse_engine_usage.md │ ├── managerhelper/ │ │ ├── managerhelper.md │ │ ├── operations-script.js │ │ ├── operations-styles-mkdocs.css │ │ └── styles-mkdocs.css │ ├── reconciliator/ │ │ ├── __init__.py │ │ └── reconciliator.md │ ├── sensor/ │ │ ├── __init__.py │ │ ├── delta_table/ │ │ │ ├── __init__.py │ │ │ └── delta_table.md │ │ ├── delta_upstream_sensor_table/ │ │ │ ├── __init__.py │ │ │ └── delta_upstream_sensor_table.md │ │ ├── file/ │ │ │ ├── __init__.py │ │ │ └── file.md │ │ ├── jdbc_table/ │ │ │ ├── __init__.py │ │ │ └── jdbc_table.md │ │ ├── kafka/ │ │ │ ├── __init__.py │ │ │ └── kafka.md │ │ ├── sap_bw_b4/ │ │ │ ├── __init__.py │ │ │ └── sap_bw_b4.md │ │ ├── sensor.md │ │ └── update_sensor_status/ │ │ ├── __init__.py │ │ └── update_sensor_status.md │ └── sensors/ │ ├── __init__.py │ ├── heartbeat/ │ │ ├── __init__.py │ │ ├── delta_table/ │ │ │ ├── __init__.py │ │ │ └── delta_table.md │ │ ├── heartbeat.md │ │ ├── heartbeat_sensor_data_feed/ │ │ │ ├── __init__.py │ │ │ └── heartbeat_sensor_data_feed.md │ │ ├── kafka/ │ │ │ ├── __init__.py │ │ │ └── kafka.md │ │ ├── manual_table/ │ │ │ ├── __init__.py │ │ │ └── manual_table.md │ │ ├── sap_bw_b4/ │ │ │ ├── __init__.py │ │ │ └── sap_bw_b4.md │ │ ├── trigger_file/ │ │ │ ├── __init__.py │ │ │ └── trigger_file.md │ │ └── update_heartbeat_sensor_status/ │ │ ├── __init__.py │ │ └── update_heartbeat_sensor_status.md │ ├── sensor/ │ │ ├── __init__.py │ │ ├── delta_table/ │ │ │ ├── __init__.py │ │ │ └── delta_table.md │ │ ├── delta_upstream_sensor_table/ │ │ │ ├── __init__.py │ │ │ └── delta_upstream_sensor_table.md │ │ ├── file/ │ │ │ ├── __init__.py │ │ │ └── file.md │ │ ├── jdbc_table/ │ │ │ ├── __init__.py │ │ │ └── jdbc_table.md │ │ ├── kafka/ │ │ │ ├── __init__.py │ │ │ └── kafka.md │ │ ├── sap_bw_b4/ │ │ │ ├── __init__.py │ │ │ └── sap_bw_b4.md │ │ ├── sensor.md │ │ 
└── update_sensor_status/ │ │ ├── __init__.py │ │ └── update_sensor_status.md │ └── sensors.md ├── pyproject.toml ├── samples/ │ ├── cricket_dq_tutorial.py │ └── tpch_load_and_analysis_tutorial.py └── tests/ ├── __init__.py ├── configs/ │ ├── __init__.py │ └── engine.yaml ├── conftest.py ├── feature/ │ ├── __init__.py │ ├── custom_expectations/ │ │ ├── __init__.py │ │ ├── test_custom_expectations.py │ │ └── test_expectation_validity.py │ ├── data_loader_custom_transformer/ │ │ ├── __init__.py │ │ ├── test_data_loader_custom_transformer_calculate_kpi.py │ │ ├── test_data_loader_custom_transformer_delta_load.py │ │ └── test_data_loader_custom_transformer_sql_transformation.py │ ├── delta_load/ │ │ ├── __init__.py │ │ ├── test_delta_load_group_and_rank.py │ │ ├── test_delta_load_merge_options.py │ │ └── test_delta_load_record_mode_cdc.py │ ├── test_append_load.py │ ├── test_data_quality.py │ ├── test_dq_validator.py │ ├── test_engine_usage_stats.py │ ├── test_extract_from_sap_b4.py │ ├── test_extract_from_sap_bw.py │ ├── test_file_manager.py │ ├── test_file_manager_dbfs.py │ ├── test_file_manager_s3.py │ ├── test_full_load.py │ ├── test_gab.py │ ├── test_heartbeat.py │ ├── test_jdbc_reader.py │ ├── test_materialize_cdf.py │ ├── test_notification.py │ ├── test_reconciliation.py │ ├── test_schema_evolution.py │ ├── test_sensors.py │ ├── test_sftp_reader.py │ ├── test_sharepoint_reader.py │ ├── test_sharepoint_writer.py │ ├── test_table_manager.py │ ├── test_writers.py │ └── transformations/ │ ├── __init__.py │ ├── test_chain_transformations.py │ ├── test_column_creators.py │ ├── test_column_reshapers.py │ ├── test_data_maskers.py │ ├── test_date_transformers.py │ ├── test_drop_duplicate_rows.py │ ├── test_joiners.py │ ├── test_multiple_transformations.py │ ├── test_null_handlers.py │ ├── test_optimizers.py │ ├── test_regex_transformers.py │ ├── test_unions.py │ └── test_watermarker.py ├── resources/ │ ├── feature/ │ │ ├── append_load/ │ │ │ ├── failfast/ │ │ │ │ ├── 
batch.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ ├── part-02.csv │ │ │ │ └── part-03.csv │ │ │ ├── jdbc_permissive/ │ │ │ │ ├── batch.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ ├── part-02.csv │ │ │ │ └── part-03.csv │ │ │ ├── streaming_dropmalformed/ │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ ├── part-02.csv │ │ │ │ │ └── part-03.csv │ │ │ │ └── streaming.json │ │ │ └── streaming_with_terminators/ │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ └── part-01.csv │ │ │ └── streaming.json │ │ ├── custom_expectations/ │ │ │ ├── expect_column_pair_a_to_be_not_equal_to_b/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── dq_control_success.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── dq_sales_schema.json │ │ │ │ └── streaming.json │ │ │ ├── expect_column_pair_a_to_be_smaller_or_equal_than_b/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── dq_control_success.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── dq_sales_schema.json │ │ │ │ └── streaming.json │ │ │ ├── expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── dq_control_success.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── dq_sales_schema.json │ │ │ │ └── streaming.json │ │ │ ├── expect_column_values_to_be_date_not_older_than/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── dq_control_success.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── dq_sales_schema.json │ │ │ │ └── 
streaming.json │ │ │ ├── expect_column_values_to_not_be_null_or_empty_string/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── dq_control_success.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── dq_sales_schema.json │ │ │ │ └── streaming.json │ │ │ ├── expect_multicolumn_column_a_must_equal_b_or_c/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── dq_control_success.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── dq_sales_schema.json │ │ │ │ └── streaming.json │ │ │ └── expect_queried_column_agg_value_to_be/ │ │ │ ├── batch.json │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── dq_control_success.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ └── part-02.csv │ │ │ ├── dq_sales_schema.json │ │ │ └── streaming.json │ │ ├── data_loader_custom_transformer/ │ │ │ ├── calculate_kpi/ │ │ │ │ ├── control_schema.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ └── source/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source_schema.json │ │ │ ├── delta_load/ │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ ├── part-02.csv │ │ │ │ ├── part-03.csv │ │ │ │ └── part-04.csv │ │ │ └── sql_transformation/ │ │ │ ├── control_schema.json │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ └── part-01.csv │ │ │ └── source_schema.json │ │ ├── data_quality/ │ │ │ ├── build_data_docs/ │ │ │ │ ├── with_data_docs_local_fs/ │ │ │ │ │ └── 20240410-080323-dq_success-sales_orders-checkpoint/ │ │ │ │ │ └── 20240410T080323.289170Z/ │ │ │ │ │ └── 7ba399ea28cc40bf8c79213a440aeb91.json │ │ │ │ └── without_data_docs_local_fs/ │ │ │ │ └── 20240409-143548-dq_validator-sales_source-checkpoint/ │ │ │ │ └── 20240409T143548.454043Z/ │ │ │ │ └── f0d7bd293d22bcfd3c1fec5a7d566638.json │ │ │ ├── load_with_dq_table/ │ │ │ │ ├── 
delta_with_dupl_tag_gen_fail/ │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── control/ │ │ │ │ │ │ │ ├── data_validator.json │ │ │ │ │ │ │ ├── data_validator_schema.json │ │ │ │ │ │ │ ├── sales.json │ │ │ │ │ │ │ └── sales_schema.json │ │ │ │ │ │ ├── dq_functions/ │ │ │ │ │ │ │ ├── test_db.dq_functions_source_load_with_dq_table_delta_with_dupl_tag_gen_fail_init.csv │ │ │ │ │ │ │ └── test_db.dq_functions_source_load_with_dq_table_delta_with_dupl_tag_gen_fail_new.csv │ │ │ │ │ │ └── source/ │ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ │ ├── part-02.csv │ │ │ │ │ │ ├── part-03.csv │ │ │ │ │ │ └── part-04.csv │ │ │ │ │ ├── streaming_init.json │ │ │ │ │ └── streaming_new.json │ │ │ │ ├── delta_with_duplicates_tag/ │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── control/ │ │ │ │ │ │ │ ├── data_validator.json │ │ │ │ │ │ │ ├── data_validator_schema.json │ │ │ │ │ │ │ ├── sales.json │ │ │ │ │ │ │ └── sales_schema.json │ │ │ │ │ │ ├── dq_functions/ │ │ │ │ │ │ │ ├── test_db.dq_functions_source_load_with_dq_table_delta_with_duplicates_tag_init.csv │ │ │ │ │ │ │ └── test_db.dq_functions_source_load_with_dq_table_delta_with_duplicates_tag_new.csv │ │ │ │ │ │ └── source/ │ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ │ ├── part-02.csv │ │ │ │ │ │ ├── part-03.csv │ │ │ │ │ │ └── part-04.csv │ │ │ │ │ ├── streaming_init.json │ │ │ │ │ └── streaming_new.json │ │ │ │ └── full_overwrite_tag/ │ │ │ │ ├── batch_init.json │ │ │ │ ├── batch_new.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ ├── data_validator.json │ │ │ │ │ ├── data_validator_schema.json │ │ │ │ │ ├── sales.json │ │ │ │ │ └── sales_schema.json │ │ │ │ ├── dq_functions/ │ │ │ │ │ ├── test_db.dq_functions_source_load_with_dq_table_full_overwrite_tag_init.csv │ │ │ │ │ └── test_db.dq_functions_source_load_with_dq_table_full_overwrite_tag_new.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ └── part-02.csv │ │ │ ├── load_with_dq_validator/ │ │ │ │ ├── delta_with_dupl_tag_gen_fail/ │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── control/ │ │ │ │ │ │ │ ├── 
data_validator.json │ │ │ │ │ │ │ ├── data_validator_schema.json │ │ │ │ │ │ │ ├── sales.json │ │ │ │ │ │ │ └── sales_schema.json │ │ │ │ │ │ └── source/ │ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ │ ├── part-02.csv │ │ │ │ │ │ ├── part-03.csv │ │ │ │ │ │ └── part-04.csv │ │ │ │ │ ├── streaming_init.json │ │ │ │ │ └── streaming_new.json │ │ │ │ ├── delta_with_duplicates/ │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── control/ │ │ │ │ │ │ │ ├── data_validator.json │ │ │ │ │ │ │ └── data_validator_schema.json │ │ │ │ │ │ └── source/ │ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ │ ├── part-02.csv │ │ │ │ │ │ ├── part-03.csv │ │ │ │ │ │ └── part-04.csv │ │ │ │ │ ├── streaming_init.json │ │ │ │ │ └── streaming_new.json │ │ │ │ ├── delta_with_duplicates_tag/ │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── control/ │ │ │ │ │ │ │ ├── data_validator.json │ │ │ │ │ │ │ ├── data_validator_schema.json │ │ │ │ │ │ │ ├── sales.json │ │ │ │ │ │ │ └── sales_schema.json │ │ │ │ │ │ └── source/ │ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ │ ├── part-02.csv │ │ │ │ │ │ ├── part-03.csv │ │ │ │ │ │ └── part-04.csv │ │ │ │ │ ├── streaming_init.json │ │ │ │ │ └── streaming_new.json │ │ │ │ ├── full_overwrite/ │ │ │ │ │ ├── batch_init.json │ │ │ │ │ ├── batch_new.json │ │ │ │ │ └── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── data_validator.json │ │ │ │ │ │ └── data_validator_schema.json │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── full_overwrite_tag/ │ │ │ │ │ ├── batch_init.json │ │ │ │ │ ├── batch_new.json │ │ │ │ │ └── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── data_validator.json │ │ │ │ │ │ ├── data_validator_schema.json │ │ │ │ │ │ ├── sales.json │ │ │ │ │ │ └── sales_schema.json │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ └── no_transformers/ │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── data_validator.json │ │ │ │ │ │ └── data_validator_schema.json │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ ├── part-02.csv │ │ │ │ │ 
├── part-03.csv │ │ │ │ │ └── part-04.csv │ │ │ │ ├── streaming_init.json │ │ │ │ └── streaming_new.json │ │ │ └── validator/ │ │ │ └── data/ │ │ │ ├── control/ │ │ │ │ └── data_validator.csv │ │ │ ├── dq_functions/ │ │ │ │ ├── test_db.dq_functions_source_dq_failure.csv │ │ │ │ ├── test_db.dq_functions_source_dq_failure_error_disabled.csv │ │ │ │ ├── test_db.dq_functions_source_dq_failure_max_percentage.csv │ │ │ │ └── test_db.dq_functions_source_dq_success.csv │ │ │ └── source/ │ │ │ └── part-01.csv │ │ ├── delta_load/ │ │ │ ├── group_and_rank/ │ │ │ │ ├── fail_with_duplicates_in_same_file/ │ │ │ │ │ ├── batch_delta.json │ │ │ │ │ ├── batch_init.json │ │ │ │ │ ├── control_batch_schema.json │ │ │ │ │ ├── control_streaming_schema.json │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── control/ │ │ │ │ │ │ │ ├── batch.csv │ │ │ │ │ │ │ └── streaming.csv │ │ │ │ │ │ └── source/ │ │ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv │ │ │ │ │ │ ├── WE_SO_SCL_202108111500000000.csv │ │ │ │ │ │ └── WE_SO_SCL_202108111600000000.csv │ │ │ │ │ ├── source_schema.json │ │ │ │ │ └── streaming_delta.json │ │ │ │ └── with_duplicates_in_same_file/ │ │ │ │ ├── batch_delta.json │ │ │ │ ├── batch_init.json │ │ │ │ ├── control_batch_schema.json │ │ │ │ ├── control_streaming_schema.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── batch.csv │ │ │ │ │ │ └── streaming.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv │ │ │ │ │ ├── WE_SO_SCL_202108111500000000.csv │ │ │ │ │ └── WE_SO_SCL_202108111600000000.csv │ │ │ │ ├── source_schema.json │ │ │ │ └── streaming_delta.json │ │ │ ├── merge_options/ │ │ │ │ ├── control_batch_schema.json │ │ │ │ ├── insert_column_set/ │ │ │ │ │ ├── batch_delta.json │ │ │ │ │ ├── batch_init.json │ │ │ │ │ └── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── batch.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv │ │ │ │ │ └── WE_SO_SCL_202108111500000000.csv │ │ │ │ ├── source_schema.json │ │ │ │ ├── update_all/ │ │ │ │ │ ├── 
batch_delta.json │ │ │ │ │ ├── batch_init.json │ │ │ │ │ └── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── batch.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv │ │ │ │ │ └── WE_SO_SCL_202108111500000000.csv │ │ │ │ └── update_column_set/ │ │ │ │ ├── batch_delta.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── batch.csv │ │ │ │ └── source/ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv │ │ │ │ └── WE_SO_SCL_202108111500000000.csv │ │ │ └── record_mode_cdc/ │ │ │ ├── backfill/ │ │ │ │ ├── batch_backfill.json │ │ │ │ ├── batch_delta.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ ├── part-02.csv │ │ │ │ ├── part-03.csv │ │ │ │ ├── part-04.csv │ │ │ │ └── part-05.csv │ │ │ ├── direct_silver_load/ │ │ │ │ ├── batch_delta.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ ├── part-02.csv │ │ │ │ ├── part-03.csv │ │ │ │ └── part-04.csv │ │ │ ├── late_arriving_changes/ │ │ │ │ ├── batch_delta.json │ │ │ │ ├── batch_init.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ ├── part-02.csv │ │ │ │ │ ├── part-03.csv │ │ │ │ │ └── part-04.csv │ │ │ │ └── streaming_delta.json │ │ │ ├── out_of_order_changes/ │ │ │ │ ├── batch_delta.json │ │ │ │ ├── batch_init.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ ├── part-02.csv │ │ │ │ │ ├── part-03.csv │ │ │ │ │ └── part-04.csv │ │ │ │ └── streaming_delta.json │ │ │ ├── with_deletes_additional_columns/ │ │ │ │ ├── batch_delta.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ ├── part-02.csv │ │ 
│ │ ├── part-03.csv │ │ │ │ └── part-04.csv │ │ │ ├── with_duplicates/ │ │ │ │ ├── batch_delta.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ ├── part-02.csv │ │ │ │ ├── part-03.csv │ │ │ │ └── part-04.csv │ │ │ └── with_upserts_only_removed_columns/ │ │ │ ├── batch_delta.json │ │ │ ├── batch_init.json │ │ │ └── data/ │ │ │ ├── control/ │ │ │ │ └── part-01.csv │ │ │ └── source/ │ │ │ ├── part-01.json │ │ │ ├── part-02.json │ │ │ ├── part-03.json │ │ │ └── part-04.json │ │ ├── dq_validator/ │ │ │ ├── batch.json │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ ├── data_restore_control.csv │ │ │ │ │ ├── dq_control_failure.csv │ │ │ │ │ ├── dq_control_failure_disabled.csv │ │ │ │ │ ├── dq_control_success.csv │ │ │ │ │ ├── dq_control_success_explode.csv │ │ │ │ │ └── dq_control_success_explode_disabled.csv │ │ │ │ ├── dq_functions/ │ │ │ │ │ ├── test_db.dq_functions_source_table_failure.csv │ │ │ │ │ └── test_db.dq_functions_source_table_success.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ └── part-02.csv │ │ │ ├── dq_sales_schema.json │ │ │ ├── streaming.json │ │ │ ├── streaming_dataframe_two_runs/ │ │ │ │ └── data/ │ │ │ │ └── dq_functions/ │ │ │ │ ├── test_db.dq_functions_streaming_dataframe_two_runs_first_run.csv │ │ │ │ └── test_db.dq_functions_streaming_dataframe_two_runs_second_run.csv │ │ │ ├── table_batch_dataframe_failure_disabled/ │ │ │ │ └── data/ │ │ │ │ └── dq_functions/ │ │ │ │ ├── test_db.dq_functions_source_table_failure.csv │ │ │ │ └── test_db.dq_functions_source_table_success.csv │ │ │ ├── table_batch_dataframe_success/ │ │ │ │ └── data/ │ │ │ │ └── dq_functions/ │ │ │ │ ├── test_db.dq_functions_source_table_failure.csv │ │ │ │ └── test_db.dq_functions_source_table_success.csv │ │ │ ├── table_batch_dq_rule/ │ │ │ │ └── data/ │ │ │ │ └── dq_functions/ │ │ │ │ ├── test_db.dq_table_rule_id_failure.csv │ │ │ │ └── 
test_db.dq_table_rule_id_success.csv │ │ │ ├── table_batch_failure_disabled/ │ │ │ │ └── data/ │ │ │ │ └── dq_functions/ │ │ │ │ ├── test_db.dq_functions_source_table_failure.csv │ │ │ │ └── test_db.dq_functions_source_table_success.csv │ │ │ ├── table_batch_success/ │ │ │ │ └── data/ │ │ │ │ └── dq_functions/ │ │ │ │ ├── test_db.dq_functions_source_table_failure.csv │ │ │ │ └── test_db.dq_functions_source_table_success.csv │ │ │ ├── table_streaming_dq_rule/ │ │ │ │ └── data/ │ │ │ │ └── dq_functions/ │ │ │ │ ├── test_db.dq_table_rule_id_failure.csv │ │ │ │ └── test_db.dq_table_rule_id_success.csv │ │ │ ├── table_streaming_failure_disabled/ │ │ │ │ └── data/ │ │ │ │ └── dq_functions/ │ │ │ │ ├── test_db.dq_functions_source_table_failure.csv │ │ │ │ └── test_db.dq_functions_source_table_success.csv │ │ │ └── table_streaming_success/ │ │ │ └── data/ │ │ │ └── dq_functions/ │ │ │ ├── test_db.dq_functions_source_table_failure.csv │ │ │ └── test_db.dq_functions_source_table_success.csv │ │ ├── engine_usage_stats/ │ │ │ ├── dq_validator/ │ │ │ │ └── data/ │ │ │ │ ├── control.json │ │ │ │ └── source.csv │ │ │ ├── load_custom_transf_and_df/ │ │ │ │ └── data/ │ │ │ │ ├── control.json │ │ │ │ └── source.csv │ │ │ ├── load_simple_acon/ │ │ │ │ └── data/ │ │ │ │ ├── control.json │ │ │ │ └── source.csv │ │ │ └── table_manager/ │ │ │ └── data/ │ │ │ └── control.json │ │ ├── extract_from_sap_b4/ │ │ │ ├── extract_aq_dso/ │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── dummy_table.csv │ │ │ │ │ │ ├── dummy_table_join_condition.csv │ │ │ │ │ │ └── dummy_table_schema.json │ │ │ │ │ └── source/ │ │ │ │ │ ├── dummy_table.csv │ │ │ │ │ ├── dummy_table_1.csv │ │ │ │ │ ├── dummy_table_2.csv │ │ │ │ │ └── rspmrequest.csv │ │ │ │ ├── dummy_table_schema.json │ │ │ │ └── rspmrequest_schema.json │ │ │ └── extract_cl_dso/ │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ ├── dummy_table.csv │ │ │ │ │ ├── dummy_table_join_condition.csv │ │ │ │ │ └── dummy_table_schema.json │ │ │ │ └── 
source/ │ │ │ │ ├── dummy_table.csv │ │ │ │ ├── dummy_table_cl_1.csv │ │ │ │ ├── dummy_table_cl_2.csv │ │ │ │ └── rspmrequest.csv │ │ │ ├── dummy_table_cl_schema.json │ │ │ ├── dummy_table_schema.json │ │ │ └── rspmrequest_schema.json │ │ ├── extract_from_sap_bw/ │ │ │ ├── derive_changelog_table_name/ │ │ │ │ ├── RSBASIDOC_schema.json │ │ │ │ ├── RSTSODS_schema.json │ │ │ │ └── data/ │ │ │ │ └── source/ │ │ │ │ ├── RSBASIDOC.csv │ │ │ │ └── RSTSODS.csv │ │ │ ├── extract_dso/ │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── dummy_table.csv │ │ │ │ │ │ ├── dummy_table_join_condition.csv │ │ │ │ │ │ └── dummy_table_schema.json │ │ │ │ │ └── source/ │ │ │ │ │ ├── dummy_table.csv │ │ │ │ │ ├── dummy_table_cl_1.csv │ │ │ │ │ ├── dummy_table_cl_2.csv │ │ │ │ │ └── rsodsactreq.csv │ │ │ │ ├── dummy_table_cl_schema.json │ │ │ │ ├── dummy_table_schema.json │ │ │ │ └── rsodsactreq_schema.json │ │ │ └── extract_write_optimised_dso/ │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ ├── dummy_table.csv │ │ │ │ │ ├── dummy_table_actreq_timestamp.csv │ │ │ │ │ ├── dummy_table_join_condition.csv │ │ │ │ │ └── dummy_table_schema.json │ │ │ │ └── source/ │ │ │ │ ├── dummy_table.csv │ │ │ │ ├── dummy_table_1.csv │ │ │ │ ├── dummy_table_2.csv │ │ │ │ └── rsodsactreq.csv │ │ │ ├── dummy_table_schema.json │ │ │ └── rsodsactreq_schema.json │ │ ├── file_manager/ │ │ │ ├── check_restore_status/ │ │ │ │ ├── acon_check_restore_status_directory.json │ │ │ │ └── acon_check_restore_status_single_object.json │ │ │ ├── copy_object/ │ │ │ │ ├── acon_copy_directory.json │ │ │ │ ├── acon_copy_directory_dry_run.json │ │ │ │ ├── acon_copy_single_object.json │ │ │ │ └── acon_copy_single_object_dry_run.json │ │ │ ├── delete_objects/ │ │ │ │ ├── acon_delete_objects.json │ │ │ │ └── acon_delete_objects_dry_run.json │ │ │ ├── request_restore/ │ │ │ │ ├── acon_request_restore_directory.json │ │ │ │ └── acon_request_restore_single_object.json │ │ │ └── request_restore_to_destination_and_wait/ │ │ │ ├── 
acon_request_restore_to_destination_and_wait_directory.json │ │ │ ├── acon_request_restore_to_destination_and_wait_single_object.json │ │ │ └── acon_request_restore_to_destination_and_wait_single_object_raise_error.json │ │ ├── file_manager_dbfs/ │ │ │ ├── copy_objects/ │ │ │ │ ├── acon_copy_directory.json │ │ │ │ ├── acon_copy_directory_dry_run.json │ │ │ │ └── acon_copy_single_object.json │ │ │ ├── delete_objects/ │ │ │ │ ├── acon_delete_objects.json │ │ │ │ └── acon_delete_objects_dry_run.json │ │ │ └── move_objects/ │ │ │ ├── acon_move_objects.json │ │ │ └── acon_move_objects_dry_run.json │ │ ├── file_manager_s3/ │ │ │ ├── check_restore_status/ │ │ │ │ ├── acon_check_restore_status_directory.json │ │ │ │ └── acon_check_restore_status_single_object.json │ │ │ ├── copy_objects/ │ │ │ │ ├── acon_copy_directory.json │ │ │ │ ├── acon_copy_directory_dry_run.json │ │ │ │ ├── acon_copy_single_object.json │ │ │ │ └── acon_copy_single_object_dry_run.json │ │ │ ├── delete_objects/ │ │ │ │ ├── acon_delete_objects.json │ │ │ │ └── acon_delete_objects_dry_run.json │ │ │ ├── request_restore/ │ │ │ │ ├── acon_request_restore_directory.json │ │ │ │ └── acon_request_restore_single_object.json │ │ │ └── request_restore_to_destination_and_wait/ │ │ │ ├── acon_request_restore_to_destination_and_wait_directory.json │ │ │ ├── acon_request_restore_to_destination_and_wait_single_object.json │ │ │ └── acon_request_restore_to_destination_and_wait_single_object_raise_error.json │ │ ├── full_load/ │ │ │ ├── full_overwrite/ │ │ │ │ ├── batch.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ └── part-02.csv │ │ │ ├── with_filter/ │ │ │ │ ├── batch.json │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ └── part-02.csv │ │ │ └── with_filter_partition_overwrite/ │ │ │ ├── batch.json │ │ │ ├── 
batch_init.json │ │ │ └── data/ │ │ │ ├── control/ │ │ │ │ └── part-01.csv │ │ │ └── source/ │ │ │ ├── part-01.csv │ │ │ └── part-02.csv │ │ ├── gab/ │ │ │ ├── control/ │ │ │ │ ├── data/ │ │ │ │ │ ├── vw_dummy_sales_kpi.csv │ │ │ │ │ ├── vw_nam_orders_all_snapshot.csv │ │ │ │ │ ├── vw_nam_orders_filtered_snapshot.csv │ │ │ │ │ ├── vw_negative_offset_orders_all.csv │ │ │ │ │ ├── vw_negative_offset_orders_filtered.csv │ │ │ │ │ ├── vw_orders_all.csv │ │ │ │ │ ├── vw_orders_all_snapshot.csv │ │ │ │ │ ├── vw_orders_filtered.csv │ │ │ │ │ └── vw_orders_filtered_snapshot.csv │ │ │ │ └── schema/ │ │ │ │ ├── vw_dummy_sales_kpi.json │ │ │ │ └── vw_orders.json │ │ │ ├── setup/ │ │ │ │ ├── column_list/ │ │ │ │ │ ├── calendar.json │ │ │ │ │ ├── dummy_sales_kpi.json │ │ │ │ │ ├── gab_log_events.json │ │ │ │ │ ├── gab_use_case_results.json │ │ │ │ │ ├── lkp_query_builder.json │ │ │ │ │ └── order_events.json │ │ │ │ ├── data/ │ │ │ │ │ ├── dummy_sales_kpi.csv │ │ │ │ │ ├── lkp_query_builder.csv │ │ │ │ │ └── order_events.csv │ │ │ │ └── schema/ │ │ │ │ ├── dummy_sales_kpi.json │ │ │ │ ├── lkp_query_builder.json │ │ │ │ └── order_events.json │ │ │ └── usecases/ │ │ │ ├── dummy_sales_kpi/ │ │ │ │ ├── 1_article_category.sql │ │ │ │ ├── 2_dummy_sales_kpi.sql │ │ │ │ └── scenario/ │ │ │ │ └── dummy_sales_kpi.json │ │ │ └── order_events/ │ │ │ ├── 1_order_events.sql │ │ │ └── scenario/ │ │ │ ├── order_events.json │ │ │ ├── order_events_nam.json │ │ │ ├── order_events_negative_timezone_offset.json │ │ │ ├── order_events_snapshot.json │ │ │ ├── skip_use_case_by_empty_reconciliation.json │ │ │ ├── skip_use_case_by_empty_requested_cadence.json │ │ │ ├── skip_use_case_by_not_configured_cadence.json │ │ │ └── skip_use_case_by_unexisting_cadence.json │ │ ├── heartbeat/ │ │ │ ├── control/ │ │ │ │ ├── default/ │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── ctr_heart_tbl_heartb_feed.csv │ │ │ │ │ │ ├── ctrl_heart_tbl_exec_sensor.csv │ │ │ │ │ │ ├── ctrl_heart_tbl_trigger_job.csv │ │ │ │ │ │ ├── 
ctrl_heart_tbl_updated.csv │ │ │ │ │ │ └── ctrl_sensor_tbl_upd_status.json │ │ │ │ │ └── schema/ │ │ │ │ │ ├── ctrl_heart_tbl_schema.json │ │ │ │ │ └── ctrl_heart_tbl_trig_schema.json │ │ │ │ └── heartbeat_paused_sensor_new_record/ │ │ │ │ ├── data/ │ │ │ │ │ ├── ctr_heart_tbl_heartb_feed.csv │ │ │ │ │ ├── ctrl_heart_tbl_exec_sensor.csv │ │ │ │ │ ├── ctrl_heart_tbl_trigger_job.csv │ │ │ │ │ ├── ctrl_heart_tbl_updated.csv │ │ │ │ │ └── ctrl_sensor_tbl_upd_status.json │ │ │ │ └── schema/ │ │ │ │ └── ctrl_heart_tbl_schema.json │ │ │ └── setup/ │ │ │ ├── default/ │ │ │ │ ├── column_list/ │ │ │ │ │ ├── heartbeat_sensor_control_table.json │ │ │ │ │ └── sensor_table.json │ │ │ │ ├── data/ │ │ │ │ │ ├── setup_heartbeat_data.csv │ │ │ │ │ └── setup_sensor_data.json │ │ │ │ └── schema/ │ │ │ │ └── schema_sensor_df.json │ │ │ └── heartbeat_paused_sensor_new_record/ │ │ │ ├── column_list/ │ │ │ │ ├── heartbeat_sensor_control_table.json │ │ │ │ └── sensor_table.json │ │ │ ├── data/ │ │ │ │ ├── setup_heartbeat_data.csv │ │ │ │ └── setup_sensor_data.json │ │ │ └── schema/ │ │ │ └── schema_sensor_df.json │ │ ├── jdbc_reader/ │ │ │ ├── jdbc_format/ │ │ │ │ ├── correct_arguments/ │ │ │ │ │ ├── batch_init.json │ │ │ │ │ └── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ └── source/ │ │ │ │ │ └── part-01.csv │ │ │ │ ├── predicates/ │ │ │ │ │ └── batch_init.json │ │ │ │ └── wrong_arguments/ │ │ │ │ └── batch_init.json │ │ │ └── jdbc_function/ │ │ │ ├── correct_arguments/ │ │ │ │ ├── batch_init.json │ │ │ │ └── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source/ │ │ │ │ └── part-01.csv │ │ │ └── wrong_arguments/ │ │ │ └── batch_init.json │ │ ├── materialize_cdf/ │ │ │ ├── acon_create_table.json │ │ │ ├── control_schema.json │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-01_cdf.csv │ │ │ │ ├── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ └── table/ │ │ │ │ └── streaming_with_cdf.sql │ │ │ ├── 
streaming_with_clean_and_vacuum.json │ │ │ └── streaming_without_clean_cdf.json │ │ ├── notification/ │ │ │ └── test_attachement.txt │ │ ├── reconciliation/ │ │ │ └── data/ │ │ │ ├── current.json │ │ │ ├── current_different_rows.json │ │ │ ├── current_fail.json │ │ │ ├── current_nulls_and_zeros.json │ │ │ ├── current_nulls_and_zeros_fail.json │ │ │ ├── truth.json │ │ │ ├── truth_different_rows.json │ │ │ ├── truth_empty.json │ │ │ ├── truth_nulls_and_zeros.json │ │ │ └── truth_nulls_and_zeros_fail.json │ │ ├── schema_evolution/ │ │ │ ├── append_load/ │ │ │ │ ├── batch_append_disabled.json │ │ │ │ ├── batch_append_disabled_cast.json │ │ │ │ ├── batch_append_enabled.json │ │ │ │ ├── batch_append_enabled_cast.json │ │ │ │ ├── batch_init_disabled.json │ │ │ │ ├── batch_init_enabled.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── part-02.csv │ │ │ │ │ │ ├── part-03.csv │ │ │ │ │ │ ├── part-05.csv │ │ │ │ │ │ └── part-06.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ ├── part-02.csv │ │ │ │ │ ├── part-03.csv │ │ │ │ │ ├── part-04.csv │ │ │ │ │ ├── part-05.csv │ │ │ │ │ └── part-06.csv │ │ │ │ └── schema/ │ │ │ │ ├── control/ │ │ │ │ │ ├── control_schema.json │ │ │ │ │ ├── control_schema_add_column.json │ │ │ │ │ └── control_schema_rename.json │ │ │ │ └── source/ │ │ │ │ ├── source_part-01_schema.json │ │ │ │ ├── source_part-02_schema.json │ │ │ │ ├── source_part-03_schema.json │ │ │ │ ├── source_part-04_schema.json │ │ │ │ ├── source_part-05_schema.json │ │ │ │ └── source_part-06_schema.json │ │ │ ├── delta_load/ │ │ │ │ ├── batch_delta_disabled.json │ │ │ │ ├── batch_delta_disabled_rename.json │ │ │ │ ├── batch_delta_enabled.json │ │ │ │ ├── batch_init_disabled.json │ │ │ │ ├── batch_init_enabled.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── part-02.csv │ │ │ │ │ │ ├── part-03.csv │ │ │ │ │ │ ├── part-04.csv │ │ │ │ │ │ ├── part-05.csv │ │ │ │ │ │ └── part-06.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ ├── 
part-02.csv │ │ │ │ │ ├── part-03.csv │ │ │ │ │ ├── part-04.csv │ │ │ │ │ ├── part-05.csv │ │ │ │ │ └── part-06.csv │ │ │ │ └── schema/ │ │ │ │ ├── control/ │ │ │ │ │ ├── control_schema.json │ │ │ │ │ ├── control_schema_add_column.json │ │ │ │ │ └── control_schema_rename.json │ │ │ │ └── source/ │ │ │ │ ├── source_part-01_schema.json │ │ │ │ ├── source_part-02_schema.json │ │ │ │ ├── source_part-03_schema.json │ │ │ │ ├── source_part-04_schema.json │ │ │ │ ├── source_part-05_schema.json │ │ │ │ └── source_part-06_schema.json │ │ │ └── full_load/ │ │ │ ├── batch_init.json │ │ │ ├── batch_merge_disabled.json │ │ │ ├── batch_merge_enabled.json │ │ │ ├── batch_overwrite.json │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── part-02.csv │ │ │ │ └── source/ │ │ │ │ ├── part-01.csv │ │ │ │ └── part-02.csv │ │ │ └── schema/ │ │ │ ├── control/ │ │ │ │ ├── control_schema_merge_enabled.json │ │ │ │ └── control_schema_overwrite.json │ │ │ └── source/ │ │ │ ├── source_part-01_schema.json │ │ │ └── source_part-02_schema.json │ │ ├── sftp_reader/ │ │ │ └── data/ │ │ │ ├── file.csv │ │ │ ├── file1.csv │ │ │ ├── file2.csv │ │ │ ├── file3.json │ │ │ ├── file4.xml │ │ │ └── file5.txt │ │ ├── sharepoint/ │ │ │ ├── exceptions/ │ │ │ │ ├── acons/ │ │ │ │ │ ├── drive_exception.json │ │ │ │ │ ├── endpoint_exception.json │ │ │ │ │ ├── local_path_exception.json │ │ │ │ │ ├── site_exception.json │ │ │ │ │ └── streaming_exception.json │ │ │ │ └── schemas/ │ │ │ │ └── schema.json │ │ │ ├── reader/ │ │ │ │ ├── acons/ │ │ │ │ │ ├── read_file_name_and_file_pattern_conflict_should_fail.json │ │ │ │ │ ├── read_file_name_unsupported_extension_should_fail.json │ │ │ │ │ ├── read_folder_csv_archive_enabled_success.json │ │ │ │ │ ├── read_folder_csv_archive_success_subfolder_override_success.json │ │ │ │ │ ├── read_folder_csv_no_csv_files_should_fail.json │ │ │ │ │ ├── read_folder_csv_one_file_schema_mismatch_custom_error_subfolder_should_archive_error.json │ │ │ │ │ ├── 
read_folder_csv_one_file_schema_mismatch_should_archive_error.json │ │ │ │ │ ├── read_folder_csv_pattern_matches_no_files_should_fail.json │ │ │ │ │ ├── read_folder_csv_pattern_success.json │ │ │ │ │ ├── read_folder_csv_success.json │ │ │ │ │ ├── read_folder_path_does_not_exist_should_fail.json │ │ │ │ │ ├── read_folder_relative_path_looks_like_file_unsupported_extension_should_fail.json │ │ │ │ │ ├── read_single_csv_archive_default_enabled_success.json │ │ │ │ │ ├── read_single_csv_archive_enabled_success.json │ │ │ │ │ ├── read_single_csv_archive_success_subfolder_override_success.json │ │ │ │ │ ├── read_single_csv_download_error_should_archive_error.json │ │ │ │ │ ├── read_single_csv_empty_file_should_archive_error.json │ │ │ │ │ ├── read_single_csv_full_path_success.json │ │ │ │ │ ├── read_single_csv_full_path_with_file_name_should_fail.json │ │ │ │ │ ├── read_single_csv_full_path_with_file_pattern_should_fail.json │ │ │ │ │ ├── read_single_csv_full_path_with_file_type_should_fail.json │ │ │ │ │ ├── read_single_csv_spark_load_fails_should_archive_error.json │ │ │ │ │ ├── read_single_csv_success.json │ │ │ │ │ └── read_unsupported_file_type_should_fail.json │ │ │ │ ├── data/ │ │ │ │ │ ├── bad_schema.csv │ │ │ │ │ ├── other.csv │ │ │ │ │ ├── sample_1.csv │ │ │ │ │ └── sample_2.csv │ │ │ │ └── mocks/ │ │ │ │ ├── get_drive_id.json │ │ │ │ ├── get_file_metadata.json │ │ │ │ ├── get_site_id.json │ │ │ │ └── rename_file.json │ │ │ └── writer/ │ │ │ ├── acons/ │ │ │ │ └── write_to_local_success.json │ │ │ ├── data/ │ │ │ │ ├── file_control.csv │ │ │ │ └── file_source.csv │ │ │ ├── mocks/ │ │ │ │ ├── create_upload_session.json │ │ │ │ ├── get_drive_id.json │ │ │ │ └── get_site_id.json │ │ │ └── schemas/ │ │ │ └── schema.json │ │ ├── table_manager/ │ │ │ ├── compute_table_statistics/ │ │ │ │ ├── table_stats_complex_default_scenario1.json │ │ │ │ ├── table_stats_complex_default_scenario2.json │ │ │ │ ├── table_stats_complex_different_delimiter_scenario1.json │ │ │ │ ├── 
table_stats_complex_different_delimiter_scenario2.json │ │ │ │ └── table_stats_simple_split_scenario.json │ │ │ ├── create/ │ │ │ │ ├── acon_create_table.json │ │ │ │ ├── acon_create_table_complex_default_scenario.json │ │ │ │ ├── acon_create_table_complex_different_delimiter_scenario.json │ │ │ │ ├── acon_create_table_simple_split_scenario.json │ │ │ │ ├── acon_create_view.json │ │ │ │ ├── acon_create_view_complex_default_scenario.json │ │ │ │ ├── acon_create_view_complex_different_delimiter_scenario.json │ │ │ │ ├── acon_create_view_simple_split_scenario.json │ │ │ │ ├── table/ │ │ │ │ │ ├── test_table_complex_default_scenario.sql │ │ │ │ │ ├── test_table_complex_different_delimiter_scenario.sql │ │ │ │ │ └── test_table_simple_split_scenario.sql │ │ │ │ └── view/ │ │ │ │ ├── test_view_complex_default_scenario.sql │ │ │ │ ├── test_view_complex_different_delimiter_scenario.sql │ │ │ │ └── test_view_simple_split_scenario.sql │ │ │ ├── delete/ │ │ │ │ └── acon_delete_where_table_simple_split_scenario.json │ │ │ ├── describe/ │ │ │ │ └── acon_describe_simple_split_scenario.json │ │ │ ├── drop/ │ │ │ │ ├── acon_drop_table_simple_split_scenario.json │ │ │ │ └── acon_drop_view_simple_split_scenario.json │ │ │ ├── execute_sql/ │ │ │ │ ├── acon_execute_sql_complex_default_scenario.json │ │ │ │ ├── acon_execute_sql_complex_different_delimiter_scenario.json │ │ │ │ └── acon_execute_sql_simple_split_scenario.json │ │ │ ├── get_tbl_pk/ │ │ │ │ └── get_tbl_pk_simple_split_scenario.json │ │ │ ├── optimize/ │ │ │ │ ├── optimize_location.json │ │ │ │ ├── optimize_location_simple_split_scenario.json │ │ │ │ ├── optimize_table.json │ │ │ │ └── optimize_table_simple_split_scenario.json │ │ │ ├── show_tbl_properties/ │ │ │ │ └── show_tbl_properties_simple_split_scenario.json │ │ │ └── vacuum/ │ │ │ ├── acon_vacuum_location.json │ │ │ ├── acon_vacuum_location_simple_split_scenario.json │ │ │ └── acon_vacuum_table_simple_split_scenario.json │ │ ├── transformations/ │ │ │ ├── 
chain_transformations/ │ │ │ │ ├── acons/ │ │ │ │ │ ├── batch.json │ │ │ │ │ ├── streaming.json │ │ │ │ │ ├── streaming_batch.json │ │ │ │ │ ├── write_streaming_struct_data.json │ │ │ │ │ └── write_streaming_struct_data_fail.json │ │ │ │ ├── control/ │ │ │ │ │ ├── chain_control.csv │ │ │ │ │ └── struct_data.json │ │ │ │ ├── schema/ │ │ │ │ │ ├── customer_schema.json │ │ │ │ │ ├── sales_schema.json │ │ │ │ │ └── struct_data_schema.json │ │ │ │ └── source/ │ │ │ │ ├── customers.csv │ │ │ │ ├── sales_historical.csv │ │ │ │ ├── sales_new.csv │ │ │ │ └── struct_data.csv │ │ │ ├── column_creators/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.json │ │ │ │ │ └── source/ │ │ │ │ │ └── part-01.csv │ │ │ │ ├── source_schema.json │ │ │ │ └── streaming.json │ │ │ ├── column_reshapers/ │ │ │ │ ├── explode_arrays/ │ │ │ │ │ ├── batch.json │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── control/ │ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ │ └── source/ │ │ │ │ │ │ └── part-01.json │ │ │ │ │ ├── source_schema.json │ │ │ │ │ └── streaming.json │ │ │ │ ├── flatten_and_explode_arrays_and_maps/ │ │ │ │ │ ├── batch.json │ │ │ │ │ ├── data/ │ │ │ │ │ │ ├── control/ │ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ │ └── source/ │ │ │ │ │ │ └── part-01.json │ │ │ │ │ ├── source_schema.json │ │ │ │ │ └── streaming.json │ │ │ │ └── flatten_schema/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ └── source/ │ │ │ │ │ └── part-01.json │ │ │ │ ├── source_schema.json │ │ │ │ └── streaming.json │ │ │ ├── data_maskers/ │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── drop_columns.csv │ │ │ │ │ │ └── hash_masking.csv │ │ │ │ │ └── source/ │ │ │ │ │ └── part-01.csv │ │ │ │ ├── drop_columns.json │ │ │ │ ├── drop_columns_control_schema.json │ │ │ │ ├── hash_masking.json │ │ │ │ ├── hash_masking_control_schema.json │ │ │ │ └── source_schema.json │ │ │ ├── date_transformers/ │ │ │ │ ├── control_schema.json │ │ │ │ ├── data/ 
│ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ └── source/ │ │ │ │ │ └── part-01.csv │ │ │ │ ├── source_schema.json │ │ │ │ └── streaming.json │ │ │ ├── drop_duplicate_rows/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── batch_distinct.json │ │ │ │ │ │ ├── batch_drop_duplicates.json │ │ │ │ │ │ ├── streaming_distinct.json │ │ │ │ │ │ └── streaming_drop_duplicates.json │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── source_schema.json │ │ │ │ └── streaming.json │ │ │ ├── joiners/ │ │ │ │ ├── batch.json │ │ │ │ ├── control_scenario_1_and_2_schema.json │ │ │ │ ├── control_scenario_3_schema.json │ │ │ │ ├── customer_schema.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── control_scenario_1_and_2.csv │ │ │ │ │ │ └── control_scenario_3.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── customer-part-01.csv │ │ │ │ │ ├── sales-part-01.csv │ │ │ │ │ └── sales-part-02.csv │ │ │ │ ├── sales_schema.json │ │ │ │ ├── streaming.json │ │ │ │ ├── streaming_foreachBatch.json │ │ │ │ ├── streaming_without_broadcast.json │ │ │ │ └── streaming_without_column_rename.json │ │ │ ├── multiple_transform/ │ │ │ │ ├── batch.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.json │ │ │ │ │ └── source/ │ │ │ │ │ └── part-01.csv │ │ │ │ └── source_schema.json │ │ │ ├── null_handlers/ │ │ │ │ ├── control_schema.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── replace_nulls.csv │ │ │ │ │ │ └── replace_nulls_col_subset.csv │ │ │ │ │ └── source/ │ │ │ │ │ └── part-01.csv │ │ │ │ ├── replace_nulls.json │ │ │ │ ├── replace_nulls_col_subset.json │ │ │ │ └── source_schema.json │ │ │ ├── optimizers/ │ │ │ │ └── data/ │ │ │ │ └── source/ │ │ │ │ └── part-01.csv │ │ │ ├── regex_transformers/ │ │ │ │ └── with_regex_value/ │ │ │ │ ├── batch.json │ │ │ │ ├── control_schema.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── part-01.csv │ │ │ │ │ └── source/ │ │ │ │ │ └── 
WE_SO_SCL_202108111400000029.csv │ │ │ │ └── source_schema.json │ │ │ ├── unions/ │ │ │ │ ├── batch_union.json │ │ │ │ ├── batch_unionByName.json │ │ │ │ ├── batch_unionByName_diff_schema.json │ │ │ │ ├── batch_unionByName_diff_schema_error.json │ │ │ │ ├── batch_union_diff_schema.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ ├── control_sales.csv │ │ │ │ │ │ ├── control_sales_shipment.csv │ │ │ │ │ │ ├── control_sales_shipment_streaming.csv │ │ │ │ │ │ ├── control_sales_shipment_streaming_foreachBatch.csv │ │ │ │ │ │ ├── control_sales_streaming.csv │ │ │ │ │ │ └── control_sales_streaming_foreachBatch.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── sales-historical-part-01.csv │ │ │ │ │ ├── sales-historical-part-02.csv │ │ │ │ │ ├── sales-new-part-01.csv │ │ │ │ │ ├── sales-new-part-02.csv │ │ │ │ │ ├── sales-shipment-part-01.csv │ │ │ │ │ └── sales-shipment-part-02.csv │ │ │ │ ├── sales_schema.json │ │ │ │ ├── sales_shipment_schema.json │ │ │ │ ├── streaming_union.json │ │ │ │ ├── streaming_unionByName_diff_schema.json │ │ │ │ ├── streaming_unionByName_diff_schema_foreachBatch.json │ │ │ │ └── streaming_union_foreachBatch.json │ │ │ └── watermarker/ │ │ │ ├── streaming_drop_duplicates/ │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── streaming_drop_duplicates.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── source_schema.json │ │ │ │ └── streaming_drop_duplicates.json │ │ │ ├── streaming_drop_duplicates_overall_watermark/ │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── streaming_drop_duplicates_overall_watermark.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── part-01.csv │ │ │ │ │ └── part-02.csv │ │ │ │ ├── source_schema.json │ │ │ │ └── streaming_drop_duplicates_overall_watermark.json │ │ │ ├── streaming_inner_join/ │ │ │ │ ├── customer_schema.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── streaming_inner_join.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── customer-part-01.csv │ │ │ │ │ ├── 
sales-part-01.csv │ │ │ │ │ └── sales-part-02.csv │ │ │ │ ├── sales_schema.json │ │ │ │ ├── streaming_inner_join.json │ │ │ │ └── streaming_inner_join_control_schema.json │ │ │ ├── streaming_left_outer_join/ │ │ │ │ ├── customer_schema.json │ │ │ │ ├── data/ │ │ │ │ │ ├── control/ │ │ │ │ │ │ └── streaming_left_outer_join.csv │ │ │ │ │ └── source/ │ │ │ │ │ ├── customer-part-01.csv │ │ │ │ │ ├── customer-part-02.csv │ │ │ │ │ ├── customer-part-03.csv │ │ │ │ │ ├── customer-part-04.csv │ │ │ │ │ ├── customer-part-05.csv │ │ │ │ │ ├── sales-part-01.csv │ │ │ │ │ ├── sales-part-02.csv │ │ │ │ │ ├── sales-part-03.csv │ │ │ │ │ ├── sales-part-04.csv │ │ │ │ │ └── sales-part-05.csv │ │ │ │ ├── sales_schema.json │ │ │ │ ├── streaming_left_outer_join.json │ │ │ │ └── streaming_left_outer_join_control_schema.json │ │ │ └── streaming_right_outer_join/ │ │ │ ├── customer_schema.json │ │ │ ├── data/ │ │ │ │ ├── control/ │ │ │ │ │ └── streaming_right_outer_join.csv │ │ │ │ └── source/ │ │ │ │ ├── customer-part-01.csv │ │ │ │ ├── sales-part-01.csv │ │ │ │ └── sales-part-02.csv │ │ │ ├── sales_schema.json │ │ │ ├── streaming_right_outer_join.json │ │ │ └── streaming_right_outer_join_control_schema.json │ │ └── writers/ │ │ ├── acons/ │ │ │ ├── write_batch_console.json │ │ │ ├── write_batch_dataframe.json │ │ │ ├── write_batch_files.json │ │ │ ├── write_batch_jdbc.json │ │ │ ├── write_batch_rest_api.json │ │ │ ├── write_batch_table.json │ │ │ ├── write_streaming_console.json │ │ │ ├── write_streaming_dataframe.json │ │ │ ├── write_streaming_df_with_checkpoint.json │ │ │ ├── write_streaming_files.json │ │ │ ├── write_streaming_foreachBatch_console.json │ │ │ ├── write_streaming_foreachBatch_dataframe.json │ │ │ ├── write_streaming_foreachBatch_df_with_checkpoint.json │ │ │ ├── write_streaming_foreachBatch_files.json │ │ │ ├── write_streaming_foreachBatch_jdbc.json │ │ │ ├── write_streaming_foreachBatch_table.json │ │ │ ├── write_streaming_multiple_dfs.json │ │ │ ├── 
write_streaming_rest_api.json │ │ │ └── write_streaming_table.json │ │ ├── control/ │ │ │ ├── writers_control.csv │ │ │ ├── writers_control_streaming_dataframe_1.csv │ │ │ ├── writers_control_streaming_dataframe_2.csv │ │ │ ├── writers_control_streaming_dataframe_foreachBatch_1.csv │ │ │ └── writers_control_streaming_dataframe_foreachBatch_2.csv │ │ ├── schema/ │ │ │ └── sales_schema.json │ │ └── source/ │ │ ├── sales_historical_1.csv │ │ ├── sales_historical_2.csv │ │ ├── sales_new_1.csv │ │ └── sales_new_2.csv │ └── unit/ │ ├── custom_configs/ │ │ └── custom_engine_config.yaml │ ├── heartbeat/ │ │ ├── heartbeat_acon_creation/ │ │ │ └── setup/ │ │ │ └── column_list/ │ │ │ ├── heartbeat_sensor_control_table.json │ │ │ └── sensor_table.json │ │ └── heartbeat_anchor_job/ │ │ └── setup/ │ │ └── column_list/ │ │ ├── heartbeat_sensor_control_table.json │ │ └── sensor_table.json │ └── sharepoint_reader/ │ └── data/ │ ├── sample_ok.csv │ └── sample_other_delim.csv ├── unit/ │ ├── __init__.py │ ├── test_acon_validation.py │ ├── test_custom_configs.py │ ├── test_databricks_utils.py │ ├── test_failure_notification_creation.py │ ├── test_heartbeat_acon_creation.py │ ├── test_heartbeat_anchor_job.py │ ├── test_log_filter_sensitive_data.py │ ├── test_notification_creation.py │ ├── test_notification_factory.py │ ├── test_prisma_dq_rule_id.py │ ├── test_prisma_function_definition.py │ ├── test_rest_api_functions.py │ ├── test_sensor.py │ ├── test_sensor_manager.py │ ├── test_sharepoint_csv_reader.py │ ├── test_spark_session.py │ └── test_version.py └── utils/ ├── __init__.py ├── dataframe_helpers.py ├── dq_rules_table_utils.py ├── exec_env_helpers.py ├── local_storage.py ├── mocks.py └── smtp_server.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: 
Create a Bug report to help us improve title: "[BUG] Function X is raising error Y" labels: bug assignees: jmcorreia --- **Describe the bug** A clear and concise description of what the bug is. **Environment Details** - Lakehouse Engine Version - Environment where you are using the Lakehouse Engine (Ex. Databricks 13.3LTS) **To Reproduce** Please include all the necessary details to reproduce the problem, including the full ACON or functions that are being used and at what point the problem is occurring. **Expected behavior** A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. **Additional context** Add any other context about the problem here. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: "[FEATURE] I would like to have the capability to do X" labels: enhancement assignees: jmcorreia --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context, useful links or screenshots about the feature request here. 
================================================ FILE: .github/pull_request_template.md ================================================ - [ ] Description of PR changes above includes a link to [an existing GitHub issue](https://github.com/adidas/lakehouse-engine/issues) - [ ] PR title is prefixed with one of: [BUGFIX], [FEATURE] - [ ] Appropriate tests and docs have been updated - [ ] Code is linted and tested - ``` make style make lint make test make test-security ``` For more information about contributing, see [Contribute](https://github.com/adidas/lakehouse-engine/blob/master/CONTRIBUTING.md). After you submit your PR, keep **monitoring its statuses and discuss/apply fixes for any issues or suggestions coming from the PR Reviews**. Thanks for contributing! ================================================ FILE: .gitignore ================================================ # mac os hidden files .DS_Store # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # intellij and vscode .idea/ **.iml .vscode/ # credentials **credential** # lakehouse and spark /tests/lakehouse/** *derby.log* **/metastore_db/ /metastore_db/ **/spark-warehouse/ /spark-warehouse/ /artefacts/ tmp_os/ ================================================ FILE: CONTRIBUTING.md ================================================ # How to Contribute 📖 Search algorithms, transformations and check implementation details & examples in our [documentation](https://adidas.github.io/lakehouse-engine-docs/lakehouse_engine.html). 💭 In case you have doubts, ideas, want to ask for help or want to discuss different approaches and usages, feel free to create a [discussion](https://github.com/adidas/lakehouse-engine/discussions). ⚠️ Are you facing any issues? Open an issue on [GitHub](https://github.com/adidas/lakehouse-engine/issues). 💡 Do you have ideas for new features? Open a feature request on [GitHub](https://github.com/adidas/lakehouse-engine/issues). 🚀 Want to find the available releases? Check our release notes on [GitHub](https://github.com/adidas/lakehouse-engine/releases) and [PyPI](https://pypi.org/project/lakehouse-engine/). ## Prerequisites 1. Git. 2. Your IDE of choice with a Python 3 environment (e.g., virtualenv created from the requirements_cicd.txt file). 3. Docker. **Warning:** The default spark driver memory limit for the tests is set at 2g. This limit is configurable but your testing docker setup **MUST** always have **at least** 2 * spark driver memory limit + 1 gb configured. 4. GNU make. 
## General steps for contributing 1. Fork the project. 2. Clone the forked project into your working environment. 3. Create your feature branch following the convention [feature|bugfix]/ISSUE_ID_short_name. 4. Apply your changes in the recently created branch. It is **mandatory** to add tests covering the feature or fix contributed. 5. Style, lint, test and test security: ``` make style make lint make test make test-security ``` --- > ***Note:*** To use the make targets with a docker-compatible CLI other than docker you can pass the parameter "container_cli". Example: `make test container_cli=nerdctl` --- --- > ***Note:*** Most make target commands are running on docker. If you face any problem, you can also check the code of the respective make targets and directly execute the code in your python virtual environment. --- 6. (optional) You can build the wheel locally with `make build`. 7. (optional) Install the wheel you have just generated and test it. 8. If you have changed or added new requirements, you should run `make build-lock-files`, to rebuild the lock files. 9. If the transitive dependencies have not been updated for a while, and you want to upgrade them, you can use `make upgrade-lock-files` to update them. This will update the transitive dependencies even if you have not changed the requirements. 10. When you're ready with your changes, open a Pull Request (PR) to develop. 11. Ping the team through the preferred communication channel. 12. The team will come together to review it and approve it (2 approvals required). 13. Your changes will be tested internally, promoted to master and included in the next release. > 🚀🚀🚀 > > **Pull Requests are welcome from anyone**. However, before opening one, please make sure to open an issue on [GitHub](https://github.com/adidas/lakehouse-engine/issues) > and link it. 
> Moreover, if the Pull Request intends to cover big changes or features, it is recommended to first discuss it on a [GitHub issue](https://github.com/adidas/lakehouse-engine/issues) or [Discussion](https://github.com/adidas/lakehouse-engine/discussions). > > 🚀🚀🚀 ================================================ FILE: LICENSE.txt ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2023 adidas AG Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================
FILE: Makefile
================================================
# Recipes run in bash with errexit (-e), nounset (-u), xtrace (-x) and pipefail enabled.
SHELL := /bin/bash -euxo pipefail

# CLI used to build and run the development image, plus the image name.
container_cli := docker
image_name := lakehouse-engine
deploy_env := dev
# Current project version: third space-separated field of the "current_version =" line
# of cicd/.bumpversion.cfg.
project_version := $(shell cat cicd/.bumpversion.cfg | grep "current_version =" | cut -f 3 -d " ")
# Defaults to the project version. It is also the tag of the container image used by the targets
# below, and can be overridden on the command line (e.g. version=feature-x-1276).
version := $(project_version)
# Gets system information in upper case
system_information := $(shell uname -mvp | tr a-z A-Z)
meta_conf_file := cicd/meta.yaml
meta_os_conf_file := cicd/meta_os.yaml
group_id := $(shell id -g ${USER})
engine_conf_file := lakehouse_engine/configs/engine.yaml
engine_os_conf_file := lakehouse_engine/configs/engine_os.yaml
# Files deleted from the copy that is synchronised to GitHub (see the sync-to-github target).
remove_files_from_os := $(engine_conf_file) $(meta_conf_file) CODEOWNERS sonar-project.properties CONTRIBUTING.md CHANGELOG.md assets/img/os_strategy.png
last_commit_msg := "$(shell git log -1 --pretty=%B)"
git_tag := $(shell git describe --tags --abbrev=0)
commits_url := $(shell cat $(meta_conf_file) | grep commits_url | cut -f 2 -d " ")

# When `version` was overridden, the wheel gets a local version suffix (project_version+version,
# with "-" and "_" replaced by ".") and the experimental package name; otherwise the official
# name and version are used.
ifneq ($(project_version), $(version))
wheel_version := $(project_version)+$(subst _,.,$(subst -,.,$(version)))
project_name := lakehouse-engine-experimental
else
wheel_version := $(version)
project_name := lakehouse-engine
endif

# Add \ to make reg safe comparisons (e.g. in the perl commands)
wheel_version_reg_safe := $(subst +,\+,$(subst .,\.,$(wheel_version)))
project_version_reg_safe := $(subst .,\.,$(project_version))

# Condition to define the Python image to be built based on the machine CPU architecture.
# The base Python image only changes if the identified CPU architecture is ARM.
# `system_information` holds the upper-cased `uname -mvp` output; any occurrence of "ARM" selects
# the ARM base image, everything else falls back to the AMD one.
ifneq (,$(findstring ARM,$(system_information)))
python_image := $(shell cat $(meta_conf_file) | grep arm_python_image | cut -f 2 -d " ")
cpu_architecture := arm64
else
python_image := $(shell cat $(meta_conf_file) | grep amd_python_image | cut -f 2 -d " ")
cpu_architecture := amd64
endif

# Condition to define the spark driver memory limit to be used in the tests
# In order to change this limit you can use the spark_driver_memory parameter
# Example: make test spark_driver_memory=3g
#
# WARNING: When the tests are being run 2 spark nodes are created, so despite
# the default value being 2g, your configured docker environment should have
# extra memory for communication and overhead.
#
# NOTE: `ifndef` takes the variable NAME. Writing `ifndef $(spark_driver_memory)` would test the
# variable named by the value (e.g. `3g`), which is never defined, so the default below would
# always be applied and would override a value coming from the environment.
ifndef spark_driver_memory
    spark_driver_memory := "2g"
endif

# A requirements_full.lock file is created based on all the requirements of the project (core, dq, os, azure, sftp, cicd and sharepoint).
# The requirements_full.lock file is then used as a constraints file to build the other lock files so that we ensure dependencies are consistent and compatible
# with each other, otherwise, the installations would likely fail.
# Moreover, the requirements_full.lock file is also used in the dockerfile to install all project dependencies.
# pip-compile argument lists: "-o <lock file> <requirements file(s)>". Every lock file other than
# the full one is constrained (-c) by requirements_full.lock.
# All of them use simple expansion (:=) for consistency; the values contain no references, so the
# result is the same as with recursive (=) assignment.
full_requirements := -o requirements_full.lock requirements.txt requirements_os.txt requirements_dq.txt requirements_azure.txt requirements_sftp.txt requirements_cicd.txt requirements_sharepoint.txt
requirements := -o requirements.lock requirements.txt -c requirements_full.lock
os_requirements := -o requirements_os.lock requirements_os.txt -c requirements_full.lock
dq_requirements := -o requirements_dq.lock requirements_dq.txt -c requirements_full.lock
azure_requirements := -o requirements_azure.lock requirements_azure.txt -c requirements_full.lock
sftp_requirements := -o requirements_sftp.lock requirements_sftp.txt -c requirements_full.lock
sharepoint_requirements := -o requirements_sharepoint.lock requirements_sharepoint.txt -c requirements_full.lock
os_deployment := False
container_user_dir := /home/appuser
trust_git_host := ssh -oStrictHostKeyChecking=no -i $(container_user_dir)/.ssh/id_rsa git@github.com

# Directory handed to `python -m build` and used by the docs target: the synchronised copy under
# tmp_os/ when os_deployment=True, the repository root otherwise.
ifeq ($(os_deployment), True)
build_src_dir := tmp_os/lakehouse-engine
else
build_src_dir := .
endif

# Builds the development image, tagged $(image_name):$(version), passing the ids of the current
# user and group as build arguments.
build-image:
	$(container_cli) build \
		--build-arg USER_ID=$(shell id -u ${USER}) \
		--build-arg GROUP_ID=$(group_id) \
		--build-arg PYTHON_IMAGE=$(python_image) \
		--build-arg CPU_ARCHITECTURE=$(cpu_architecture) \
		-t $(image_name):$(version) . -f cicd/Dockerfile

# Same as build-image, but without the USER_ID/GROUP_ID build arguments.
build-image-windows:
	$(container_cli) build \
		--build-arg PYTHON_IMAGE=$(python_image) \
		--build-arg CPU_ARCHITECTURE=$(cpu_architecture) \
		-t $(image_name):$(version) . -f cicd/Dockerfile

# The build target is used to build the wheel package.
# It makes usage of some `perl` commands to change the project wheel version in the pyproject.toml file,
# whenever the goal is to release a package for testing, instead of an official release.
# Ex: if you run 'make build version=feature-x-1276' (with an image built for that same version), and the current project version is 1.20.0, the generated wheel will be: lakehouse_engine_experimental-1.20.0+feature.x.1276-py3-none-any,
# while for the official 1.20.0 release, the wheel will be: lakehouse_engine-1.20.0-py3-none-any.
# The name/version in pyproject.toml are restored after the build attempt even when the build
# itself fails; the exit status of the build is propagated afterwards.
build:
	perl -pi -e 's/version = "$(project_version_reg_safe)"/version = "$(wheel_version)"/g' pyproject.toml && \
	perl -pi -e 's/name = "lakehouse-engine"/name = "$(project_name)"/g' pyproject.toml && \
	build_status=0 && \
	{ $(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'python -m build --wheel $(build_src_dir)' || build_status=$$?; } && \
	perl -pi -e 's/version = "$(wheel_version_reg_safe)"/version = "$(project_version)"/g' pyproject.toml && \
	perl -pi -e 's/name = "$(project_name)"/name = "lakehouse-engine"/g' pyproject.toml && \
	exit $$build_status

# Uploads the wheel produced by `build` to the `artifactory` repository configured in the mounted
# .pypirc. `artifactory_credentials_file` is not defined in this Makefile and has to be supplied
# by the caller (e.g. make deploy artifactory_credentials_file=<path>).
deploy: build
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		-v $(artifactory_credentials_file):$(container_user_dir)/.pypirc \
		$(image_name):$(version) \
		/bin/bash -c 'twine upload -r artifactory dist/$(subst -,_,$(project_name))-$(wheel_version)-py3-none-any.whl --skip-existing'

# Installs the package from $(build_src_dir) inside the container and renders the documentation.
docs:
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'cd $(build_src_dir) && pip install . && python ./cicd/code_doc/render_docs.py'

# mypy is executed with the --no-incremental flag, so its cache is not reused between runs.
# In case you remove that flag to speed up local runs and hit any cache related issue,
# you can add it back or delete the mypy cache folder.
lint:
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'flake8 --docstring-convention google --config=cicd/flake8.conf lakehouse_engine tests cicd/code_doc/render_docs.py \
		&& mypy --no-incremental lakehouse_engine tests'

# useful to print and use make variables. Usage: make print-variable var=variable_to_print.
# Echoes the value of the make variable whose name is passed in `var`.
print-variable:
	@echo $($(var))

# Formats the code base in place: isort first, then black.
style:
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c '''isort lakehouse_engine tests cicd/code_doc/render_docs.py && \
		black lakehouse_engine tests cicd/code_doc/render_docs.py'''

# Opens an interactive bash session in the development container, with the repository mounted on /app.
terminal:
	$(container_cli) run \
		-it \
		--rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash

# Can use test only: ```make test test_only="tests/feature/test_delta_load_record_mode_cdc.py"```.
# You can also hack it by doing ```make test test_only="-rx tests/feature/test_delta_load_record_mode_cdc.py"```
# to show complete output even of passed tests.
# We also fix the coverage filepaths, using perl, so that report has the correct paths
# (every `filename="` attribute in artefacts/coverage.xml gets the `lakehouse_engine/` prefix).
# `-x` makes pytest stop at the first failing test.
test:
	$(container_cli) run \
		--rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c "pytest \
			--junitxml=artefacts/tests.xml \
			--cov-report xml --cov-report xml:artefacts/coverage.xml \
			--cov-report term-missing --cov=lakehouse_engine \
			--log-cli-level=INFO --color=yes -x -vv \
			--spark_driver_memory=$(spark_driver_memory) $(test_only)" && \
	perl -pi -e 's/filename=\"/filename=\"lakehouse_engine\//g' artefacts/coverage.xml

# Runs bandit recursively over the engine and the tests, using the cicd/bandit.yaml configuration.
test-security:
	$(container_cli) run \
		--rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'bandit -c cicd/bandit.yaml -r lakehouse_engine tests'

#####################################
##### Dependency Management Targets #####
#####################################

# Audits the full lock file with pip-audit and writes a JSON report to artefacts/safety_analysis.json.
# `--fix --dry-run` only reports the fixes that would be applied; nothing is modified.
audit-dep-safety:
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'pip-audit -r cicd/requirements_full.lock --desc on -f json --fix --dry-run -o artefacts/safety_analysis.json'

# This target will build the lock files to be used for building the wheel and delivering it.
# The full lock file is compiled first because all the other lock files use it as a constraints file.
build-lock-files:
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'cd cicd && pip-compile --resolver=backtracking $(full_requirements) && \
		pip-compile --resolver=backtracking $(requirements) && \
		pip-compile --resolver=backtracking $(os_requirements) && \
		pip-compile --resolver=backtracking $(dq_requirements) && \
		pip-compile --resolver=backtracking $(azure_requirements) && \
		pip-compile --resolver=backtracking $(sftp_requirements) && \
		pip-compile --resolver=backtracking $(sharepoint_requirements)'

# We test the dependencies to check if they need to be updated because requirements.txt files have changed.
# On top of that, we also test if we will be able to install the base and the extra packages together,
# as their lock files are built separately and therefore dependency constraints might be too restricted.
# If that happens, pip install will fail because it cannot solve the dependency resolution process, and therefore
# we need to pin those conflict dependencies in the requirements.txt files to a version that fits both the base and
# extra packages.
# The first step fails (exit 1) when `git status` reports any pending change under cicd/, printing
# the status and the diff. The second step is a pip dry-run, so nothing is actually installed.
# NOTE(review): the `sharepoint` extra is not part of the dry-run install below - confirm whether
# that is intentional.
test-deps:
	@GIT_STATUS="$$(git status --porcelain --ignore-submodules cicd/)"; \
	if [ ! "x$$GIT_STATUS" = "x" ]; then \
		echo "!!! Requirements lists has been updated but lock file was not rebuilt !!!"; \
		echo "!!! Run make build-lock-files !!!"; \
		echo -e "$${GIT_STATUS}"; \
		git diff cicd/; \
		exit 1; \
	fi
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'pip install -e .[azure,dq,sftp,os] --dry-run --ignore-installed'

# This will update the transitive dependencies even if there were no changes in the requirements files.
# This should be a recurrent activity to make sure transitive dependencies are kept up to date.
# Same compilation order as build-lock-files, but with --upgrade on every pip-compile call.
upgrade-lock-files:
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'cd cicd && pip-compile --resolver=backtracking --upgrade $(full_requirements) && \
		pip-compile --resolver=backtracking --upgrade $(requirements) && \
		pip-compile --resolver=backtracking --upgrade $(os_requirements) && \
		pip-compile --resolver=backtracking --upgrade $(dq_requirements) && \
		pip-compile --resolver=backtracking --upgrade $(azure_requirements) && \
		pip-compile --resolver=backtracking --upgrade $(sftp_requirements) && \
		pip-compile --resolver=backtracking --upgrade $(sharepoint_requirements)'

#####################################
##### GitHub Deployment Targets #####
#####################################

# Creates tmp_os/$(repository), initialises a git repository there and pulls master (with tags)
# from git@github.com:adidas/$(repository).git. `repository` and `git_credentials_file` are not
# defined in this Makefile and have to be supplied by the caller.
prepare-github-repo:
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		-v $(git_credentials_file):$(container_user_dir)/.ssh/id_rsa \
		$(image_name):$(version) \
		/bin/bash -c """mkdir -p tmp_os/$(repository); \
		cd tmp_os/$(repository); \
		git init -b master; \
		git config pull.rebase false; \
		git config user.email 'lak-engine@adidas.com'; \
		git config user.name 'Lakehouse Engine'; \
		$(trust_git_host); \
		git remote add origin git@github.com:adidas/$(repository).git; \
		git pull origin master --tags"""

# Copies the working tree into tmp_os/lakehouse-engine, removes the files listed in
# remove_files_from_os, replaces the engine/meta configuration and the contributing guide with
# their *_os / *_OS counterparts, then commits, tags and pushes to master.
sync-to-github: prepare-github-repo
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		-v $(git_credentials_file):$(container_user_dir)/.ssh/id_rsa \
		$(image_name):$(version) \
		/bin/bash -c """cd tmp_os/lakehouse-engine; \
		rsync -r --exclude=.git --exclude=.*cache* --exclude=venv --exclude=dist --exclude=tmp_os /app/ . ; \
		rm $(remove_files_from_os); \
		mv $(engine_os_conf_file) $(engine_conf_file); \
		mv $(meta_os_conf_file) $(meta_conf_file); \
		mv CONTRIBUTING_OS.md CONTRIBUTING.md; \
		$(trust_git_host); \
		git add . ; \
		git commit -m "'${last_commit_msg}'"; \
		git tag -a $(git_tag) -m 'Release $(git_tag)' ; \
		git push origin master --follow-tags;"""

# Copies the rendered documentation site into tmp_os/lakehouse-engine-docs, commits and pushes it,
# and finally removes that temporary clone.
# The clean-up goes back to /app (the container working directory) first: the path given to `rm`
# is relative to /app, so a plain `cd ..` from tmp_os/lakehouse-engine-docs would make it point to
# tmp_os/tmp_os/lakehouse-engine-docs and nothing would be removed.
deploy-docs-to-github: docs prepare-github-repo
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		-v $(git_credentials_file):$(container_user_dir)/.ssh/id_rsa \
		$(image_name):$(version) \
		/bin/bash -c """cp -r tmp_os/lakehouse-engine/artefacts/docs/site/* tmp_os/lakehouse-engine-docs/ ; \
		cd tmp_os/lakehouse-engine-docs; \
		$(trust_git_host); \
		git add . ; \
		git commit -m 'Lakehouse Engine $(version) documentation'; \
		git push origin master ; \
		cd /app && rm -rf tmp_os/lakehouse-engine-docs"""

# Uploads the official wheel built from tmp_os/lakehouse-engine with twine, using the mounted
# .pypirc. `pypi_credentials_file` has to be supplied by the caller.
deploy-to-pypi: build
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		-v $(pypi_credentials_file):$(container_user_dir)/.pypirc \
		$(image_name):$(version) \
		/bin/bash -c 'twine upload tmp_os/lakehouse-engine/dist/lakehouse_engine-$(project_version)-py3-none-any.whl --skip-existing'

# Same as deploy-to-pypi, removing the tmp_os/lakehouse-engine copy afterwards.
deploy-to-pypi-and-clean: deploy-to-pypi
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'rm -rf tmp_os/lakehouse-engine'

###########################
##### Release Targets #####
###########################

# Rewrites CHANGELOG.md from scratch: a header with the current date and version, followed by the
# last 1000 commits, each one linked through $(commits_url).
create-changelog:
	echo "# Changelog - $(shell date +"%Y-%m-%d") v$(shell cat cicd/.bumpversion.cfg | grep "current_version =" | cut -f 3 -d " ")" > CHANGELOG.md && \
	echo "All notable changes to this project will be documented in this file automatically. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)." >> CHANGELOG.md && \
	echo "" >> CHANGELOG.md && \
	git log --no-decorate --pretty=format:"#### [%cs] [%(describe)]%n [%h]($(commits_url)%H) %s" -n 1000 >> CHANGELOG.md

# Bumps the version with bump2version; `increment` is the version part passed through to it and
# has to be supplied by the caller.
bump-up-version:
	$(container_cli) run --rm \
		-w /app \
		-v "$$PWD":/app \
		$(image_name):$(version) \
		/bin/bash -c 'bump2version --config-file cicd/.bumpversion.cfg $(increment)'

# Bumps the version first and regenerates the changelog afterwards.
prepare-release: bump-up-version create-changelog
	echo "Prepared version and changelog to release!"
commit-release: git commit -a -m 'Create release $(version)' && \ git tag -a 'v$(version)' -m 'Release $(version)' push-release: git push --follow-tags delete-tag: git push --delete origin $(tag) .PHONY: $(MAKECMDGOALS) ================================================ FILE: README.md ================================================ Lakehouse Engine Logo # Lakehouse Engine A configuration driven Spark framework, written in Python, serving as a scalable and distributed engine for several lakehouse algorithms, data flows and utilities for Data Products. --- > ***Note:*** whenever you read Data Product or Data Product team, we want to refer to Teams and use cases, whose main focus is on leveraging the power of data, on a particular topic, end-to-end (ingestion, consumption...) to achieve insights, supporting faster and better decisions, which generate value for their businesses. These Teams should not be focusing on building reusable frameworks, but on re-using the existing frameworks to achieve their goals. --- ## Main Goals The goal of the Lakehouse Engine is to bring some advantages, such as: - offer cutting-edge, standard, governed and battle-tested foundations that several Data Product teams can benefit from; - avoid that Data Product teams develop siloed solutions, reducing technical debts and high operating costs (redundant developments across teams); - allow Data Product teams to focus mostly on data-related tasks, avoiding wasting time & resources on developing the same code for different use cases; - benefit from the fact that many teams are reusing the same code, which increases the likelihood that common issues are surfaced and solved faster; - decrease the dependency and learning curve to Spark and other technologies that the Lakehouse Engine abstracts; - speed up repetitive tasks; - reduced vendor lock-in. 
--- > ***Note:*** even though you will see a focus on AWS and Databricks, this is just due to the lack of use cases for other technologies like GCP and Azure, but we are open to contributions. --- ## Key Features ⭐ **Data Loads:** perform data loads from diverse source types and apply transformations and data quality validations, ensuring trustworthy data, before integrating it into distinct target types. Additionally, people can also define termination actions like optimisations or notifications. [On the usage section](#load-data-usage-example) you will find an example using all the supported keywords for data loads. --- > ***Note:*** The Lakehouse Engine supports different types of sources and targets, such as kafka, jdbc, dataframes, files (csv, parquet, json, delta...), sftp, sap bw, sap b4... --- ⭐ **Transformations:** configuration driven transformations without the need to write any spark code. Transformations can be applied by using the `transform_specs` in the Data Loads. --- > ***Note:*** you can search all the available transformations, as well as check implementation details and examples [here](reference/packages/transformers/index.md). --- ⭐ **Data Quality Validations:** the Lakehouse Engine uses Great Expectations as a backend and abstracts any implementation details by offering people the capability to specify what validations to apply on the data, solely using dict/json based configurations. The Data Quality validations can be applied on: - post-mortem (static) data, using the DQ Validator algorithm (`execute_dq_validation`) - data in-motion, using the `dq_specs` keyword in the Data Loads, to add it as one more step while loading data. [On the usage section](#load-data-usage-example) you will find an example using this type of Data Quality validations. ⭐ **Reconciliation:** useful algorithm to compare two sources of data, by defining one version of the `truth` to compare against the `current` version of the data. 
It can be particularly useful during migration phases, to compare a few KPIs and ensure the new version of a table (`current`), for example, delivers the same vision of the data as the old one (`truth`). Find usage examples [here](lakehouse_engine_usage/reconciliator/reconciliator.md). ⭐ **Sensors:** an abstraction to otherwise complex spark code that can be executed in very small single-node clusters to check if an upstream system or Data Product contains new data since the last execution. With this feature, people can trigger jobs to run in more frequent intervals and if the upstream does not contain new data, then the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction, Transformation, and Loading). Find usage examples [here](lakehouse_engine_usage/sensors/sensors.md). ⭐ **Terminators:** this feature allows people to specify what to do as a last action, before finishing a Data Load. Some examples of actions are: optimising target table, vacuum, compute stats, expose change data feed to external location or even send e-mail notifications. Thus, it is specifically used in Data Loads, using the `terminate_specs` keyword. [On the usage section](#load-data-usage-example) you will find an example using terminators. ⭐ **Table Manager:** function `manage_table`, offers a set of actions to manipulate tables/views in several ways, such as: - compute table statistics; - create/drop tables and views; - delete/truncate/repair tables; - vacuum delta tables or locations; - optimize table; - describe table; - show table properties; - execute sql. ⭐ **File Manager:** function `manage_files`, offers a set of actions to manipulate files in several ways, such as: - delete Objects in S3; - copy Objects in S3; - restore Objects from S3 Glacier; - check the status of a restore from S3 Glacier; - request a restore of objects from S3 Glacier and wait for them to be copied to a destination. 
⭐ **Notifications:** you can configure and send email notifications. --- > ***Note:*** it can be used as an independent function (`send_notification`) or as a `terminator_spec`, using the function `notify`. --- 📖 In case you want to check further details you can check the documentation of the [Lakehouse Engine facade](reference/packages/engine.md). ## Installation As the Lakehouse Engine is built as a wheel (look into our **build** and **deploy** make targets) you can install it as any other python package using **pip**. ``` pip install lakehouse-engine ``` Alternatively, you can also upload the wheel to any target of your liking (e.g. S3) and perform a pip installation pointing to that target location. --- > ***Note:*** The Lakehouse Engine is packaged with plugins or optional dependencies, which are not installed by default. The goal is > to make its installation lighter and to avoid unnecessary dependencies. You can check all the optional dependencies in > the [tool.setuptools.dynamic] section of the [pyproject.toml](pyproject.toml) file. They are currently: os, dq, azure, sharepoint and sftp. So, > in case you want to make use of the Data Quality features offered in the Lakehouse Engine, instead of running the previous command, you should run > the command below, which will bring the core functionalities, plus DQ. > ``` > pip install lakehouse-engine[dq] > ``` > In case you are in an environment without pre-installed Spark and Delta, you will also want to install the `os` optional dependencies, like so: > ``` > pip install lakehouse-engine[os] > ``` > And in case you want to install several optional dependencies, you can run a command like: > ``` > pip install lakehouse-engine[dq,sftp] > ``` > It is advisable for a Data Product to pin a specific version of the Lakehouse Engine (and have recurring upgrading activities) > to avoid breaking changes in a new release. 
> In case you don't want to be so conservative, you can pin to a major version, which usually shouldn't include changes that break backwards compatibility. --- ## How Data Products use the Lakehouse Engine Framework? The Lakehouse Engine is a configuration-first Data Engineering framework, using the concept of ACONs to configure algorithms. ACON stands for Algorithm Configuration and is a JSON representation, as the [Load Data Usage Example](#load-data-usage-example) demonstrates. Below you find described the main keywords you can use to configure an ACON for a Data Load. --- > ***Note:*** the usage logic for the other [algorithms/features presented](#key-features) will always be similar, but using different keywords, which you can search for in the examples and documentation provided in the [Key Features](#key-features) and [Community Support and Contributing](#community-support-and-contributing) sections. --- - **Input specifications (input_specs):** specify how to read data. This is a **mandatory** keyword. - **Transform specifications (transform_specs):** specify how to transform data. - **Data quality specifications (dq_specs):** specify how to execute the data quality process. - **Output specifications (output_specs):** specify how to write data to the target. This is a **mandatory** keyword. - **Terminate specifications (terminate_specs):** specify what to do after writing into the target (e.g., optimising target table, vacuum, compute stats, expose change data feed to external location, etc). - **Execution environment (exec_env):** custom Spark session configurations to be provided for your algorithm (configurations can also be provided from your job/cluster configuration, which we highly advise you to do instead of passing performance related configs here for example). ## Load Data Usage Example You can use the Lakehouse Engine in a **pyspark script** or **notebook**. 
Below you can find an example on how to execute a Data Load using the Lakehouse Engine, which is doing the following: 1. Read CSV files, from a specified location, in a streaming fashion, providing a specific schema and some additional options to properly read the files (e.g. header, delimiter...); 2. Apply two transformations on the input data: 1. Add a new column having the Row ID; 2. Add a new column `extraction_date`, which extracts the date from the `lhe_extraction_filepath`, based on a regex. 3. Apply Data Quality validations and store the result of their execution in the table `your_database.order_events_dq_checks`: 1. Check if the column `omnihub_locale_code` has no null values; 2. Check if the distinct value count for the column `product_division` is between 10 and 100; 3. Check if the max of the column `so_net_value` is between 10 and 1000; 4. Check if the length of the values in the column `omnihub_locale_code` is between 1 and 10; 5. Check if the mean of the values for the column `coupon_code` is between 15 and 20. 4. Write the output into the table `your_database.order_events_with_dq` in a delta format, partitioned by `order_date_header` and applying a merge predicate condition, ensuring the data is only inserted into the table if it does not match the predicate (meaning the data is not yet available in the table). Moreover, the `insert_only` flag is used to specify that there should not be any updates or deletes in the target table, only inserts; 5. Optimize the Delta Table that we just wrote in (e.g. z-ordering); 6. Specify 3 custom Spark Session configurations. --- > ⚠️ ***Note:*** `spec_id` is one of the main concepts to ensure you can chain the steps of the algorithm, so, for example, you can specify the transformations (in `transform_specs`) of a DataFrame that was read in the `input_specs`. 
--- ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "orders_bronze", "read_type": "streaming", "data_format": "csv", "schema_path": "s3://my-data-product-bucket/artefacts/metadata/bronze/schemas/orders.json", "with_filepath": True, "options": { "badRecordsPath": "s3://my-data-product-bucket/badrecords/order_events_with_dq/", "header": False, "delimiter": "\u005E", "dateFormat": "yyyyMMdd", }, "location": "s3://my-data-product-bucket/bronze/orders/", } ], "transform_specs": [ { "spec_id": "orders_bronze_with_extraction_date", "input_id": "orders_bronze", "transformers": [ {"function": "with_row_id"}, { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": True, "regex": ".*WE_SO_SCL_(\\d+).csv", }, }, ], } ], "dq_specs": [ { "spec_id": "check_orders_bronze_with_extraction_date", "input_id": "orders_bronze_with_extraction_date", "dq_type": "validator", "result_sink_db_table": "your_database.order_events_dq_checks", "fail_on_error": False, "dq_functions": [ { "dq_function": "expect_column_values_to_not_be_null", "args": { "column": "omnihub_locale_code" } }, { "dq_function": "expect_column_unique_value_count_to_be_between", "args": { "column": "product_division", "min_value": 10, "max_value": 100 }, }, { "dq_function": "expect_column_max_to_be_between", "args": { "column": "so_net_value", "min_value": 10, "max_value": 1000 } }, { "dq_function": "expect_column_value_lengths_to_be_between", "args": { "column": "omnihub_locale_code", "min_value": 1, "max_value": 10 }, }, { "dq_function": "expect_column_mean_to_be_between", "args": { "column": "coupon_code", "min_value": 15, "max_value": 20 } }, ], }, ], "output_specs": [ { "spec_id": "orders_silver", "input_id": "check_orders_bronze_with_extraction_date", "data_format": "delta", "write_type": "merge", "partitions": ["order_date_header"], "merge_opts": { "merge_predicate": """ 
new.sales_order_header = current.sales_order_header AND new.sales_order_schedule = current.sales_order_schedule AND new.sales_order_item=current.sales_order_item AND new.epoch_status=current.epoch_status AND new.changed_on=current.changed_on AND new.extraction_date=current.extraction_date AND new.lhe_batch_id=current.lhe_batch_id AND new.lhe_row_id=current.lhe_row_id """, "insert_only": True, }, "db_table": "your_database.order_events_with_dq", "options": { "checkpointLocation": "s3://my-data-product-bucket/checkpoints/template_order_events_with_dq/" }, } ], "terminate_specs": [ { "function": "optimize_dataset", "args": { "db_table": "your_database.order_events_with_dq" } } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` --- > ***Note:*** Although it is possible to interact with the Lakehouse Engine functions directly from your python code, instead of relying on creating an ACON dict and using the engine API, we do not ensure the stability across new Lakehouse Engine releases when calling internal functions (not exposed in the facade) directly. --- --- > ***Note:*** ACON structure might change across releases, please test your Data Product first before updating to a new version of the Lakehouse Engine in your Production environment. --- ## Overwriting default configurations We use a YAML file to specify various configurations needed for different functionalities. You can overwrite these configurations using a dictionary with new settings or by providing a path to a YAML file. This functionality can be particularly useful for the open-source community as it unlocks the usage of several functionalities like Prisma and engine usage logs. Check default configurations. 
``` from lakehouse_engine.core import exec_env print(exec_env.ExecEnv.ENGINE_CONFIG.dq_dev_bucket) > default-bucket ``` Change the dq_dev_bucket configuration. ``` exec_env.ExecEnv.set_default_engine_config(custom_configs_dict={"dq_dev_bucket": "your-dq-bucket"}) print(exec_env.ExecEnv.ENGINE_CONFIG.dq_dev_bucket) > your-dq-bucket ``` Reset to default configurations. ``` exec_env.ExecEnv.set_default_engine_config() print(exec_env.ExecEnv.ENGINE_CONFIG.dq_dev_bucket) > default-bucket ``` --- ## Who maintains the Lakehouse Engine? The Lakehouse Engine is under active development and production usage by the Adidas Lakehouse Foundations Engineering team. ## Community Support and Contributing 🤝 Do you want to contribute or need any support? Check out all the details in [CONTRIBUTING.md](https://github.com/adidas/lakehouse-engine/blob/master/CONTRIBUTING.md). ## License and Software Information © adidas AG adidas AG publishes this software and accompanied documentation (if any) subject to the terms of the [license](https://github.com/adidas/lakehouse-engine/blob/master/LICENSE.txt) with the aim of helping the community with our tools and libraries which we think can be also useful for other people. You will find a copy of the [license](https://github.com/adidas/lakehouse-engine/blob/master/LICENSE.txt) in the root folder of this package. All rights not explicitly granted to you under the [license](https://github.com/adidas/lakehouse-engine/blob/master/LICENSE.txt) remain the sole and exclusive property of adidas AG. --- > ***NOTICE:*** The software has been designed solely for the purposes described in this ReadMe file. The software is NOT designed, tested or verified for productive use whatsoever, nor for any use related to high risk environments, such as health care, highly or fully autonomous driving, power plants, or other critical infrastructures or services. 
--- If you want to contact adidas regarding the software, you can mail us at software.engineering@adidas.com. For further information open the [adidas terms and conditions](https://github.com/adidas/adidas-contribution-guidelines/wiki/Terms-and-conditions) page. ================================================ FILE: assets/gab/metadata/gab/f_agg_dummy_sales_kpi/1_article_category.sql ================================================ SELECT "category_a" AS category_name ,"article1" AS article_id UNION SELECT "category_a" AS category_name ,"article2" AS article_id UNION SELECT "category_a" AS category_name ,"article3" AS article_id UNION SELECT "category_a" AS category_name ,"article4" AS article_id UNION SELECT "category_b" AS category_name ,"article5" AS article_id UNION SELECT "category_b" AS category_name ,"article6" AS article_id UNION SELECT "category_b" AS category_name ,"article7" AS article_id ================================================ FILE: assets/gab/metadata/gab/f_agg_dummy_sales_kpi/2_f_agg_dummy_sales_kpi.sql ================================================ SELECT {% if replace_offset_value == 0 %} {{ project_date_column }} {% else %} ({{ project_date_column }} + interval '{{offset_value}}' hour) {% endif %} AS order_date, {{ to_date }} AS to_date, b.category_name, COUNT(a.article_id) qty_articles, SUM(amount) total_amount FROM `{{ database }}`.`dummy_sales_kpi` a {{ joins }} LEFT JOIN article_categories b ON a.article_id = b.article_id WHERE TO_DATE({{ filter_date_column }}, 'yyyyMMdd') >= ( '{{start_date}}' + interval '{{offset_value}}' hour ) AND TO_DATE({{ filter_date_column }}, 'yyyyMMdd') < ( '{{ end_date}}' + interval '{{offset_value}}' hour ) GROUP BY 1,2,3 ================================================ FILE: assets/gab/metadata/tables/dim_calendar.sql ================================================ DROP TABLE IF EXISTS `database`.dim_calendar; CREATE EXTERNAL TABLE `database`.dim_calendar ( calendar_date DATE COMMENT 'Full calendar date 
in the format yyyyMMdd.', day_en STRING COMMENT 'Name of the day of the week.', weeknum_mon INT COMMENT 'Week number where the week starts on Monday.', weekstart_mon DATE COMMENT 'First day of the week where the week starts on Monday.', weekend_mon DATE COMMENT 'Last day of the week where the week starts on Monday.', weekstart_sun DATE COMMENT 'First day of the week where the week starts on Sunday.', weekend_sun DATE COMMENT 'Last day of the week where the week starts on Sunday.', month_start DATE COMMENT 'First day of the Month.', month_end DATE COMMENT 'Last day of the Month.', quarter_start DATE COMMENT 'First day of the Quarter.', quarter_end DATE COMMENT 'Last day of the Quarter.', year_start DATE COMMENT 'First day of the Year.', year_end DATE COMMENT 'Last day of the Year.' ) USING DELTA LOCATION 's3://my-data-product-bucket/dim_calendar' COMMENT 'This table stores the calendar information.' TBLPROPERTIES( 'lakehouse.primary_key'='calendar_date', 'delta.enableChangeDataFeed'='false' ) ================================================ FILE: assets/gab/metadata/tables/dummy_sales_kpi.sql ================================================ DROP TABLE IF EXISTS `database`.`dummy_sales_kpi`; CREATE EXTERNAL TABLE `database`.`dummy_sales_kpi` ( `order_date` DATE COMMENT 'date of the orders', `article_id` STRING COMMENT 'article id', `amount` INT COMMENT 'quantity/amount sold on this date' ) USING DELTA PARTITIONED BY (order_date) LOCATION 's3://my-data-product-bucket/dummy_sales_kpi' COMMENT 'Dummy sales KPI (articles sold per date).' 
TBLPROPERTIES( 'lakehouse.primary_key'='article_id, order_date', 'delta.enableChangeDataFeed'='true' ) ================================================ FILE: assets/gab/metadata/tables/gab_log_events.sql ================================================ DROP TABLE IF EXISTS `database`.`gab_log_events`; CREATE EXTERNAL TABLE `database`.`gab_log_events` ( `run_start_time` TIMESTAMP COMMENT 'Run start time for the use case', `run_end_time` TIMESTAMP COMMENT 'Run end time for the use case', `input_start_date` TIMESTAMP COMMENT 'The start time set for the use case process', `input_end_date` TIMESTAMP COMMENT 'The end time set for the use case process', `query_id` STRING COMMENT 'Query ID for the use case', `query_label` STRING COMMENT 'Query label for the use case', `cadence` STRING COMMENT 'This field stores the cadence of data granularity (Day/Week/Month/Quarter/Year)', `stage_name` STRING COMMENT 'Intermediate stage', `stage_query` STRING COMMENT 'Query run as part of stage', `status` STRING COMMENT 'Status of the stage', `error_code` STRING COMMENT 'Error code' ) USING DELTA PARTITIONED BY (query_id) LOCATION 's3://my-data-product-bucket/gab_log_events' COMMENT 'This table stores the log for all use cases in gab' TBLPROPERTIES( 'lakehouse.primary_key'='run_start_time,query_id,stage_name', 'delta.enableChangeDataFeed'='false' ) ================================================ FILE: assets/gab/metadata/tables/gab_use_case_results.sql ================================================ DROP TABLE IF EXISTS `database`.`gab_use_case_results`; CREATE EXTERNAL TABLE `database`.`gab_use_case_results` ( `query_id` STRING COMMENT 'Query ID for the use case', `cadence` STRING COMMENT 'Cadence of data granularity (Day/Week/Month/Quarter/Year)', `from_date` DATE COMMENT 'Aggregate based on the date column', `to_date` DATE COMMENT 'Snapshot end date', `d1` STRING COMMENT 'Dimension 1', `d2` STRING COMMENT 'Dimension 2', `d3` STRING COMMENT 'Dimension 3', `d4` STRING COMMENT 
'Dimension 4', `d5` STRING COMMENT 'Dimension 5', `d6` STRING COMMENT 'Dimension 6', `d7` STRING COMMENT 'Dimension 7', `d8` STRING COMMENT 'Dimension 8', `d9` STRING COMMENT 'Dimension 9', `d10` STRING COMMENT 'Dimension 10', `d11` STRING COMMENT 'Dimension 11', `d12` STRING COMMENT 'Dimension 12', `d13` STRING COMMENT 'Dimension 13', `d14` STRING COMMENT 'Dimension 14', `d15` STRING COMMENT 'Dimension 15', `d16` STRING COMMENT 'Dimension 16', `d17` STRING COMMENT 'Dimension 17', `d18` STRING COMMENT 'Dimension 18', `d19` STRING COMMENT 'Dimension 19', `d20` STRING COMMENT 'Dimension 20', `d21` STRING COMMENT 'Dimension 21', `d22` STRING COMMENT 'Dimension 22', `d23` STRING COMMENT 'Dimension 23', `d24` STRING COMMENT 'Dimension 24', `d25` STRING COMMENT 'Dimension 25', `d26` STRING COMMENT 'Dimension 26', `d27` STRING COMMENT 'Dimension 27', `d28` STRING COMMENT 'Dimension 28', `d29` STRING COMMENT 'Dimension 29', `d30` STRING COMMENT 'Dimension 30', `d31` STRING COMMENT 'Dimension 31', `d32` STRING COMMENT 'Dimension 32', `d33` STRING COMMENT 'Dimension 33', `d34` STRING COMMENT 'Dimension 34', `d35` STRING COMMENT 'Dimension 35', `d36` STRING COMMENT 'Dimension 36', `d37` STRING COMMENT 'Dimension 37', `d38` STRING COMMENT 'Dimension 38', `d39` STRING COMMENT 'Dimension 39', `d40` STRING COMMENT 'Dimension 40', `m1` DOUBLE COMMENT 'Metric 1', `m2` DOUBLE COMMENT 'Metric 2', `m3` DOUBLE COMMENT 'Metric 3', `m4` DOUBLE COMMENT 'Metric 4', `m5` DOUBLE COMMENT 'Metric 5', `m6` DOUBLE COMMENT 'Metric 6', `m7` DOUBLE COMMENT 'Metric 7', `m8` DOUBLE COMMENT 'Metric 8', `m9` DOUBLE COMMENT 'Metric 9', `m10` DOUBLE COMMENT 'Metric 10', `m11` DOUBLE COMMENT 'Metric 11', `m12` DOUBLE COMMENT 'Metric 12', `m13` DOUBLE COMMENT 'Metric 13', `m14` DOUBLE COMMENT 'Metric 14', `m15` DOUBLE COMMENT 'Metric 15', `m16` DOUBLE COMMENT 'Metric 16', `m17` DOUBLE COMMENT 'Metric 17', `m18` DOUBLE COMMENT 'Metric 18', `m19` DOUBLE COMMENT 'Metric 19', `m20` DOUBLE COMMENT 'Metric 20', 
`m21` DOUBLE COMMENT 'Metric 21', `m22` DOUBLE COMMENT 'Metric 22', `m23` DOUBLE COMMENT 'Metric 23', `m24` DOUBLE COMMENT 'Metric 24', `m25` DOUBLE COMMENT 'Metric 25', `m26` DOUBLE COMMENT 'Metric 26', `m27` DOUBLE COMMENT 'Metric 27', `m28` DOUBLE COMMENT 'Metric 28', `m29` DOUBLE COMMENT 'Metric 29', `m30` DOUBLE COMMENT 'Metric 30', `m31` DOUBLE COMMENT 'Metric 31', `m32` DOUBLE COMMENT 'Metric 32', `m33` DOUBLE COMMENT 'Metric 33', `m34` DOUBLE COMMENT 'Metric 34', `m35` DOUBLE COMMENT 'Metric 35', `m36` DOUBLE COMMENT 'Metric 36', `m37` DOUBLE COMMENT 'Metric 37', `m38` DOUBLE COMMENT 'Metric 38', `m39` DOUBLE COMMENT 'Metric 39', `m40` DOUBLE COMMENT 'Metric 40', `lh_created_on` TIMESTAMP COMMENT 'This field stores the created_on in lakehouse' ) USING DELTA PARTITIONED BY (query_id) LOCATION 's3://my-data-product-bucket/gab_use_case_results' COMMENT 'This table is the common table for all use cases and stores all the dimensions and metrics' TBLPROPERTIES( 'lakehouse.primary_key'='query_id,cadence,to_date,from_date', 'delta.enableChangeDataFeed'='false' ) ================================================ FILE: assets/gab/metadata/tables/lkp_query_builder.sql ================================================ DROP TABLE IF EXISTS `database`.`lkp_query_builder`; CREATE EXTERNAL TABLE `database`.`lkp_query_builder` ( `query_id` INT COMMENT 'Query ID for the use case which is a sequence of numbers', `query_label` STRING COMMENT 'Summarized description of the use case', `query_type` STRING COMMENT 'Type of use case based on region', `mappings` STRING COMMENT 'Dictionary of mappings for dimensions and metrics', `intermediate_stages` STRING COMMENT 'All the stages and their configs such as storageLevel repartitioning date columns', `recon_window` STRING COMMENT 'Configurations for Cadence and Reconciliation Windows', `timezone_offset` INT COMMENT 'Timezone offsets can be configured by a positive or negative integer', `start_of_the_week` STRING COMMENT 'Sunday or 
def flatten_extend(list_to_flatten: list) -> list:
    """Flatten a list of iterables by exactly one level.

    Args:
        list_to_flatten: list whose elements are themselves iterable.

    Returns:
        A new list holding the items of every element, in their original order.
    """
    return [item for row in list_to_flatten for item in row]
dbutils.widgets.get("rerun_flag") query_label_filter = dbutils.widgets.get("query_label_filter") recon_filter = dbutils.widgets.get("cadence_filter") queue_filter = dbutils.widgets.get("queue_filter") gab_base_path = dbutils.widgets.get("gab_base_path") # COMMAND ---------- query_label_filter = [x.strip() for x in list(set(query_label_filter.split(",")))] queue_filter = list(set(queue_filter.split(","))) recon_filter = list(set(recon_filter.split(","))) if "All" in query_label_filter: query_label_filter = query_list if "All" in queue_filter: queue_filter = queue_list # COMMAND ---------- target_table = ( "gab_use_case_results" if dbutils.widgets.get("target_table") == "" else dbutils.widgets.get("target_table") ) # COMMAND ---------- print(f"Query Label: {query_label_filter}") print(f"Queue Filter: {queue_filter}") print(f"Cadence Filter: {recon_filter}") print(f"Target Database: {target_database}") print(f"Start Date: {start_date}") print(f"End Date: {end_date}") print(f"Look Back Days: {lookback_days}") print(f"Re-run Flag: {rerun_flag}") print(f"Target Table: {target_table}") print(f"Source Database: {source_database}") print(f"Path Use Cases: {gab_base_path}") # COMMAND ---------- gab_acon = { "query_label_filter": query_label_filter, "queue_filter": queue_filter, "cadence_filter": recon_filter, "target_database": target_database, "start_date": start_date, "end_date": end_date, "rerun_flag": rerun_flag, "target_table": target_table, "source_database": source_database, "gab_base_path": gab_base_path, "lookup_table": lookup_table, "calendar_table": "dim_calendar", } # COMMAND ---------- execute_gab(acon=gab_acon) ================================================ FILE: assets/gab/notebooks/gab_dim_calendar.py ================================================ # Databricks notebook source # MAGIC %md # MAGIC # This notebook holds the calendar used as part of the GAB framework. 
# COMMAND ---------- # Import the required libraries from datetime import datetime, timedelta from pyspark.sql.functions import to_date from pyspark.sql.types import StringType # COMMAND ---------- DIM_CALENDAR_LOCATION = "s3://my-data-product-bucket/dim_calendar" # COMMAND ---------- initial_date = datetime.strptime("1990-01-01", "%Y-%m-%d") dates_list = [datetime.strftime(initial_date, "%Y-%m-%d")] for _ in range(1, 200000): initial_date = initial_date + timedelta(days=1) next_date = datetime.strftime(initial_date, "%Y-%m-%d") dates_list.append(next_date) # COMMAND ---------- df_date_completed = spark.createDataFrame(dates_list, StringType()) df_date_completed = df_date_completed.withColumn("calendar_date", to_date(df_date_completed.value, "yyyy-MM-dd")).drop( df_date_completed.value ) df_date_completed.createOrReplaceTempView("dates_completed") # COMMAND ---------- df_cal = spark.sql( """ WITH monday_calendar AS ( SELECT calendar_date, WEEKOFYEAR(calendar_date) AS weeknum_mon, DATE_FORMAT(calendar_date, 'E') AS day_en, MIN(calendar_date) OVER (PARTITION BY CONCAT(DATE_PART('YEAROFWEEK', calendar_date), WEEKOFYEAR(calendar_date)) ORDER BY calendar_date) AS weekstart_mon FROM dates_completed ORDER BY calendar_date ), monday_calendar_plus_week_num_sunday AS ( SELECT monday_calendar.*, LEAD(weeknum_mon) OVER(ORDER BY calendar_date) AS weeknum_sun FROM monday_calendar ), calendar_complementary_values AS ( SELECT calendar_date, weeknum_mon, day_en, weekstart_mon, weekstart_mon+6 AS weekend_mon, LEAD(weekstart_mon-1) OVER(ORDER BY calendar_date) AS weekstart_sun, DATE(DATE_TRUNC('MONTH', calendar_date)) AS month_start, DATE(DATE_TRUNC('QUARTER', calendar_date)) AS quarter_start, DATE(DATE_TRUNC('YEAR', calendar_date)) AS year_start FROM monday_calendar_plus_week_num_sunday ) SELECT calendar_date, day_en, weeknum_mon, weekstart_mon, weekend_mon, weekstart_sun, weekstart_sun+6 AS weekend_sun, month_start, add_months(month_start, 1)-1 AS month_end, quarter_start, 
def divide_chunks(input_list: list, max_number_of_jobs: int) -> list:
    """Split a list into at most `max_number_of_jobs` contiguous chunks.

    This function reads the maximum job limit defined by the parameter for each
    queue type in order to determine the number of parallel runs for each queue
    and divides the use cases into chunks for each run.
    For example, if the maximum job limit is set to 30 for the high queue and
    there are 60 use cases for the high queue, then each run will handle 2 use
    cases. When the items cannot be divided evenly, the leading chunks receive
    one extra item each.

    Args:
        input_list: Input list to be split.
        max_number_of_jobs: Max job number (upper bound for the number of chunks).

    Returns:
        List of non-empty chunks, preserving the order of `input_list`.
        An empty input yields an empty list.

    Raises:
        ValueError: If `max_number_of_jobs` is lower than 1.
    """
    if max_number_of_jobs < 1:
        # Fail with an explicit message instead of a bare ZeroDivisionError.
        raise ValueError(f"max_number_of_jobs must be at least 1, got {max_number_of_jobs}")

    total_items = len(input_list)
    base_size, remainder = divmod(total_items, max_number_of_jobs)

    chunks = []
    start = 0
    # With fewer items than jobs every chunk holds a single item, so there is
    # no need to build (and later discard) empty slices for the unused jobs.
    for index in range(min(max_number_of_jobs, total_items)):
        end = start + base_size + (1 if index < remainder else 0)
        chunks.append(input_list[start:end])
        start = end

    return chunks


def get_run_regions(job_schedule: dict, job_info: dict) -> list:
    """Get the regions to run according to the job_manager trigger time.

    The hour of `job_info["start_time"]` is looked up in `job_schedule["hour"]`
    and the matching comma-separated value is split into a list of regions.

    Hour keys may be integers (7) or numeric strings ("7", "07"). Note that a
    zero-padded integer literal such as 07 is not valid Python 3 syntax, so a
    schedule written as text must use 7 or "07".

    Args:
        job_schedule: Markets schedule from the parameter `gab_job_schedule`,
            shaped as {"hour": {<hour>: "<REGION>[,<REGION>...]"}}.
        job_info: Job manager info to match; `start_time` is read as an epoch
            timestamp in milliseconds.

    Returns:
        Markets run list, with surrounding whitespace removed from each entry.

    Raises:
        ValueError: If no region is configured for the trigger hour.
    """
    # fromtimestamp() without a tz argument converts to the local timezone of
    # the machine running the notebook.
    trigger_hour = datetime.datetime.fromtimestamp(job_info["start_time"] / 1000).hour

    q_type_match = ""
    for hour_key, regions in job_schedule["hour"].items():
        if isinstance(hour_key, str):
            try:
                hour_key = int(hour_key)
            except ValueError:
                # Not an hour; it can never match the trigger time.
                continue
        if hour_key == trigger_hour:
            q_type_match = regions

    no_match_message = "None of the query types are configured to be run at this time"
    if not isinstance(q_type_match, str):
        raise ValueError(no_match_message)

    run_regions = [region.strip() for region in q_type_match.split(",") if region.strip()]
    if not run_regions:
        # "".split(",") returns [""], so an unmatched hour has to be rejected
        # explicitly; otherwise the caller would silently schedule nothing.
        raise ValueError(no_match_message)

    print("Matched regions are: ", q_type_match)
    return run_regions
"gab_base_path": gab_base_path, }, idempotency_token=idempotency_token, ) print(f"{result}\n") ================================================ FILE: assets/gab/notebooks/query_builder_helper.py ================================================ # Databricks notebook source # MAGIC %md # MAGIC # Import Utils # COMMAND ---------- # MAGIC %run ../utils/query_builder_utils # COMMAND ---------- QUERY_BUILDER_UTILS = QueryBuilderUtils() # COMMAND ---------- # MAGIC %md # MAGIC

Use Case Setup # COMMAND ---------- # MAGIC %md # MAGIC # MAGIC The Global Asset Builder (GAB) has been developed to help you automate the creation of aggregate tables for # MAGIC dashboards on top of base fact tables. It reduces the effort and time to production for new aggregate tables. # MAGIC Users don't need to create a separate pipeline for all such cases. # MAGIC # MAGIC This notebook has been developed to help users to create their use case configurations easily. # MAGIC # MAGIC There is some mandatory information that must be completed for the use case to work correctly: # MAGIC # MAGIC **Use case name:** This parameter must not contain spaces or special characters. # MAGIC The suggestion is to use lowercase alphanumeric characters and underscores. # MAGIC # MAGIC **Market:** Related to the job schedule, example: GLOBAL starting at 07AM UTC # MAGIC It gets the complete coverage of the last day for the market. # MAGIC - GLOBAL - 07AM UTC # MAGIC # MAGIC **Reference date:** Reference date of the use case. The parameter should be the column name. # MAGIC The selected column should have the date/datetime format. # MAGIC # MAGIC **To date:** This parameter is used in the template, by default its value must be "to_date". # MAGIC You can change it if you have managed this in your SQL files. # MAGIC The values stored in this column depend on the use case behavior: # MAGIC - If snapshots are enabled, it will contain the snapshot end day. # MAGIC - If snapshot is not enabled, it will contain the last day of the cadence. # MAGIC The snapshot behaviour is set in the reconciliation steps. # MAGIC # MAGIC **How many dimensions?** An integer input of the number of dimensions (columns) expected in the use case. # MAGIC Do not consider the reference date or metrics here, as they have their own parameters. # MAGIC # MAGIC **Time Offset:** The time zone offset that you want to apply to the reference date column. 
# MAGIC It should be a number to decrement or add to the date (e.g., -8 or 8). The default value is zero, # MAGIC which means that no time zone transformation will be applied to the date. # MAGIC # MAGIC **Week start:** The start of the business week of the use case. Two options are available: SUNDAY or MONDAY. # MAGIC # MAGIC **Is Active:** Flag to make the use case active or not. Default value is "Y". # MAGIC # MAGIC **How many views?** Defines how many consumption views you want to have for the use case. # MAGIC You can have as many as you want. However, they will have exactly the same structure # MAGIC (metrics, columns, timelines, etc.), the only change will be the filter applied to them. # MAGIC The default value is 1. # MAGIC # MAGIC **Complexity:** Defines the complexity of your use case. You should mainly consider the volume of data. # MAGIC This parameter directly affects the number of workers that will be spun up to execute the use case. # MAGIC - High # MAGIC - Medium # MAGIC - Low # MAGIC # MAGIC **SQL File Names:** Name of the SQL files used in the use case. # MAGIC You can combine different layers of dependencies between them as shown in the example, # MAGIC where the "2_combined.sql" file depends on the "1_product_category.sql" file. # MAGIC The file name should follow the pattern x_file_name (where x is an integer digit) and be separated by a comma # MAGIC (e.g.: 1_first_query.sql, 2_second_query.sql). # MAGIC # MAGIC **DEV - Database Schema Name** Refers to the name of the development environment database where the # MAGIC "lkp_query_builder" table resides. This parameter is used at the end of the notebook to insert data into # MAGIC the "lkp_query_builder" table. 
# COMMAND ---------- dbutils.widgets.removeAll() dbutils.widgets.text(name="usecase_name", defaultValue="", label="Use Case Name") dbutils.widgets.dropdown( name="market", defaultValue="GLOBAL", label="Market", choices=["APAC", "GLOBAL", "NAM", "NIGHTLY"] ) dbutils.widgets.text(name="from_date", defaultValue="", label="Reference Date") dbutils.widgets.text(name="to_date", defaultValue="to_date", label="Snapshot End Date") dbutils.widgets.text(name="num_dimensions", defaultValue="", label="How many dimensions?") dbutils.widgets.text(name="time_offset", defaultValue="0", label="Time Offset") dbutils.widgets.dropdown(name="week_start", defaultValue="MONDAY", label="Week start", choices=["SUNDAY", "MONDAY"]) dbutils.widgets.dropdown(name="is_active", defaultValue="Y", label="Is Active", choices=["Y", "N"]) dbutils.widgets.text(name="num_of_views", defaultValue="1", label="How many views?") dbutils.widgets.dropdown( name="complexity", defaultValue="Medium", label="Complexity", choices=["Low", "Medium", "High"] ) dbutils.widgets.text(name="sql_files", defaultValue="", label="SQL File Names") dbutils.widgets.text(name="db_schema", defaultValue="", label="DEV - Database Schema Name") # COMMAND ---------- # MAGIC %md # MAGIC Set configurations and validate. 
# COMMAND ---------- usecase_name = dbutils.widgets.get("usecase_name").lower().strip() market = dbutils.widgets.get("market") from_date = dbutils.widgets.get("from_date") to_date = dbutils.widgets.get("to_date") num_dimensions = dbutils.widgets.get("num_dimensions") time_offset = dbutils.widgets.get("time_offset") week_start = dbutils.widgets.get("week_start") is_active = dbutils.widgets.get("is_active") num_of_views = dbutils.widgets.get("num_of_views") complexity = dbutils.widgets.get("complexity") sql_files = dbutils.widgets.get("sql_files").replace(".sql", "") db_schema = dbutils.widgets.get("db_schema") num_of_metrics = "" QUERY_BUILDER_UTILS.check_config_inputs( usecase_name, from_date, num_dimensions, sql_files, num_of_views, to_date, time_offset, db_schema ) # COMMAND ---------- # MAGIC %md # MAGIC Set Dimensions. # MAGIC # MAGIC In this step you will have to map the dimension columns with their respective order. # MAGIC The options available in the widgets to fill are based on the number of dimensions previously defined. # MAGIC For example, if you have two dimensions to analyze, such as country and category, # MAGIC values must be set to D1 and D2. # MAGIC For example: # MAGIC D1. Dimension name = country # MAGIC D2. Dimension name = category # COMMAND ---------- QUERY_BUILDER_UTILS.set_dimensions(num_dimensions) # COMMAND ---------- dimensions = QUERY_BUILDER_UTILS.get_dimensions(num_dimensions) # COMMAND ---------- QUERY_BUILDER_UTILS.print_definitions( usecase_name=usecase_name, market=market, from_date=from_date, to_date=to_date, dimensions=dimensions, time_offset=time_offset, week_start=week_start, is_active=is_active, num_of_views=num_of_views, complexity=complexity, sql_files=sql_files, db_schema=db_schema, ) # COMMAND ---------- # MAGIC %md # MAGIC

1 - Configure view(s) name(s) and filter(s) # COMMAND ---------- # MAGIC %md # MAGIC The filters defined in this step will be based on the dimensions defined in the previous step. # MAGIC # MAGIC So, if you have set the country as D1, the filter here should be D1 = "Germany". # MAGIC The commands allowed for the filter step are the same as those used in the where clause in SQL language. # COMMAND ---------- QUERY_BUILDER_UTILS.set_views(num_of_views) # COMMAND ---------- dims_dict = QUERY_BUILDER_UTILS.get_view_information(num_of_views) # COMMAND ---------- QUERY_BUILDER_UTILS.print_definitions( usecase_name=usecase_name, market=market, from_date=from_date, to_date=to_date, dimensions=dimensions, time_offset=time_offset, week_start=week_start, is_active=is_active, num_of_views=num_of_views, complexity=complexity, sql_files=sql_files, db_schema=db_schema, dims_dict=dims_dict, ) # COMMAND ---------- # MAGIC %md # MAGIC # 2 - Configure Reconciliation # COMMAND ---------- # MAGIC %md # MAGIC The reconciliation configuration (recon) is mandatory. # MAGIC In this section you will set the cadence, recon and snapshot behaviour of your use case. # MAGIC # MAGIC CADENCE - The cadence sets how often the data will be calculated. E.g: DAY, WEEK, MONTH, QUARTER, YEAR. # MAGIC # MAGIC RECON - The reconciliation for the cadence set. # MAGIC # MAGIC IS SNAPSHOT? - Set yes or no for the combination of cadence and reconciliation. # MAGIC # MAGIC Combination examples: # MAGIC - DAILY CADENCE = DAY - This configuration means that only daily data will be refreshed. # MAGIC - MONTHLY CADENCE - WEEKLY RECONCILIATION - WITHOUT SNAPSHOT = MONTH-WEEK-N - # MAGIC This means after every week, the whole month data is refreshed without snapshot. # MAGIC - WEEKLY CADENCE - DAY RECONCILIATION - WITH SNAPSHOT = WEEK-DAY-Y - # MAGIC This means that every day, the entire week's data (week to date) is refreshed with snapshot. 
# MAGIC It will generate a record for each day with the specific position of the value for the week. # COMMAND ---------- dbutils.widgets.removeAll() dbutils.widgets.multiselect( name="recon_cadence", defaultValue="DAY", label="Recon Cadence", choices=QUERY_BUILDER_UTILS.get_recon_choices(), ) # COMMAND ---------- recon_list = list(filter(None, dbutils.widgets.get(name="recon_cadence").split(","))) print(f"List of chosen reconciliation values: {recon_list}") # COMMAND ---------- recon_dict = QUERY_BUILDER_UTILS.get_recon_config(recon_list) # COMMAND ---------- QUERY_BUILDER_UTILS.print_definitions( usecase_name=usecase_name, market=market, from_date=from_date, to_date=to_date, dimensions=dimensions, time_offset=time_offset, week_start=week_start, is_active=is_active, num_of_views=num_of_views, complexity=complexity, sql_files=sql_files, db_schema=db_schema, dims_dict=dims_dict, recon_dict=recon_dict, ) # COMMAND ---------- # MAGIC %md # MAGIC

3 - Configure METRICS # COMMAND ---------- # MAGIC %md # MAGIC Define how many metrics your SQL files contain. For example, you have a sum (amount) as total_amount # MAGIC and a count(*) as total_records, you will need to set 2 here. # MAGIC # MAGIC The metrics column must be configured in the same order they appear in the sql files. # MAGIC # MAGIC For example: # MAGIC 1. Metric name = total_amount # MAGIC 2. Metric name = total_records # COMMAND ---------- dbutils.widgets.removeAll() dbutils.widgets.text(name="num_of_metrics", defaultValue="1", label="How many metrics?") # COMMAND ---------- num_of_metrics = dbutils.widgets.get("num_of_metrics") QUERY_BUILDER_UTILS.set_metric(num_of_metrics) # COMMAND ---------- # MAGIC %md # MAGIC Based on the metric setup, it is possible to derive 4 new columns based on each metric. # MAGIC Those new columns will be based on cadences like last_cadence, last_year_cadence and window function. # MAGIC But also, you can create a derived column, which is a SQL statement that you can write on your own # MAGIC by selecting the option of "derived_metric". # COMMAND ---------- metrics_dict = QUERY_BUILDER_UTILS.get_metric_configuration(num_of_metrics) # COMMAND ---------- QUERY_BUILDER_UTILS.set_extra_metric_config(num_of_metrics, metrics_dict) # COMMAND ---------- QUERY_BUILDER_UTILS.print_definitions( usecase_name=usecase_name, market=market, from_date=from_date, to_date=to_date, dimensions=dimensions, time_offset=time_offset, week_start=week_start, is_active=is_active, num_of_views=num_of_views, complexity=complexity, sql_files=sql_files, db_schema=db_schema, dims_dict=dims_dict, recon_dict=recon_dict, metrics_dict=metrics_dict, ) # COMMAND ---------- # MAGIC %md # MAGIC

4 - Configure STAGES # COMMAND ---------- # MAGIC %md # MAGIC The parameters available for this step are: # MAGIC # MAGIC - Filter Date Column - This column will be used to filter the data of your use case. # MAGIC This information will be replaced in the placeholder of the GAB template. # MAGIC - Project Date Column - This column will be used as reference date for the query given. # MAGIC This information will be replaced in the placeholder of the GAB template. # MAGIC - Repartition Value - This parameter only has effect when used with Repartition Type parameter. # MAGIC It sets the way of repartitioning the data while processing. # MAGIC - Repartition Type - Type of repartitioning the data of the query. # MAGIC Available values are Key and Number. When use Key, it expects column names separated by a comma. # MAGIC When set number it expects and integer of how many partitions the user want. # MAGIC - Storage Level - Defines the type of spark persistence storage levels you want to define # MAGIC (e.g. Memory Only, Memory and Disk etc). # MAGIC - Table Alias - The alias name of the sql file that will run. # MAGIC # COMMAND ---------- sql_files_list = QUERY_BUILDER_UTILS.set_stages(sql_files=sql_files) # COMMAND ---------- # MAGIC %md # MAGIC According to the number of sql files provided in the use case, a set of widgets will appear to be configured. # MAGIC Remember that the configuration index matches the given sql file order. # MAGIC # MAGIC For example: 1_categories.sql, 2_fact_kpi.sql. Settings starting with index “1”. # MAGIC will be set to sql file 1_categories.sql. The same will happen with index “2.”. # COMMAND ---------- stages_dict = QUERY_BUILDER_UTILS.get_stages(sql_files_list, usecase_name) # COMMAND ---------- # MAGIC %md # MAGIC

BUILD AND INSERT SQL INSTRUCTION # COMMAND ---------- delete_sttmt, insert_sttmt = QUERY_BUILDER_UTILS.create_sql_statement( usecase_name, market, stages_dict, recon_dict, time_offset, week_start, is_active, complexity, db_schema, dims_dict, dimensions, from_date, to_date, metrics_dict, ) print(delete_sttmt + "\n" + insert_sttmt) # COMMAND ---------- # MAGIC %md # MAGIC

class BearerAuth:
    """Callable auth hook that stamps a bearer token onto an outgoing request.

    An instance is handed to `requests` through its `auth=` argument; `requests`
    then invokes the instance with the prepared request object.
    """

    def __init__(self, token):
        """Store the personal access token used to build the header."""
        self.token = token

    def __call__(self, r):
        """Set the `authorization` header on `r` and hand the same object back.

        Args:
            r: request object exposing a mutable `headers` mapping.

        Returns:
            The request object that was passed in, with the header set.
        """
        header_value = "Bearer " + self.token
        r.headers["authorization"] = header_value
        return r
Use the form .cloud.databricks.com auth: personal access token """ self.databricks_instance = databricks_instance self.auth = BearerAuth(auth) @staticmethod def _check_response(response): if response.status_code != 200: raise Exception(f"Response Code: {response.status_code} \n {response.content}") def list_jobs(self, name: str = None, limit: int = 20, offset: int = 0, expand_tasks: bool = False) -> dict: """ List the databricks jobs corresponding to given `name`. for details refer API documentation: https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsList Parameters: name: optional, to filter jobs as per name (case-insensitive) limit: optional, The number of jobs to return, valid range 0 to 25. offset: The offset of the first job to return, relative to the most recently created job expand_tasks: Whether to include task and cluster details in the response. Returns: A dictionary of job ids matching the name (if provided) else returns in chunks """ params = {"limit": limit, "offset": offset, "expand_tasks": expand_tasks} if name: params.update({"name": name}) response = requests.get( f"https://{self.databricks_instance}/api{self.GET_LIST_JOBS}", params=params, headers=self.headers, auth=self.auth, ) self._check_response(response) # Raises exception if not successful return response.json() def run_now(self, job_id: int, notebook_params: dict, idempotency_token: UUID = None) -> dict: """ Trigger the job specified by the job id. 
def get_output(self, run_id: int) -> dict:
    """
    Fetch the output and metadata of a single task run.

    Reference: https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsGetOutput

    Parameters:
        run_id: identifier for the job run

    Returns:
        The decoded JSON body of the response as a dictionary
    """
    endpoint = f"https://{self.databricks_instance}/api{self.GET_OUTPUT}"
    # A falsy run_id (e.g. None or 0) is left out of the query string entirely.
    query = {"run_id": run_id} if run_id else {}
    response = requests.get(endpoint, params=query, headers=self.headers, auth=self.auth)
    self._check_response(response)  # Raises exception if not successful
    return response.json()
def trigger_job_by_name(self, job_name: str, notebook_params: dict, idempotency_token: UUID = None) -> dict:
    """
    Trigger the first job returned by `list_jobs` for the given name.

    Parameters:
        job_name: name of the job
        notebook_params: key value pairs of the parameter name and its value to be passed to the job
        idempotency_token: Optional token to guarantee the idempotency of job run requests, 64 characters max

    Returns:
        Whatever `run_now` returns for the selected job

    Raises:
        Exception: when the lookup yields no job for `job_name`
    """
    jobs = self.list_jobs(name=job_name).get("jobs")
    # Treat a missing "jobs" key and an empty list alike: indexing an empty
    # list would otherwise surface as an unrelated IndexError.
    if not jobs:
        raise Exception(f"job with name {job_name} not found.")
    job_id = int(jobs[0].get("job_id"))
    return self.run_now(job_id, notebook_params, idempotency_token)
""" jobs_list = self.list_jobs(name=job_name) if jobs_list.get("jobs") is None: raise Exception("No jobs found.") return int(jobs_list.get("jobs")[0].get("job_id")) ================================================ FILE: assets/gab/utils/query_builder_utils.py ================================================ # Databricks notebook source import json import re from databricks.sdk.runtime import * class QueryBuilderUtils: """Class with methods to create GAB use case configuration.""" def __init__(self): """Instantiate objects of the class QueryBuilderUtils.""" self.regex_no_special_characters = "^[a-zA-Z0-9]+(_[a-zA-Z0-9]+)*$" self.cadences = ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"] def check_config_inputs( self, usecase_name: str, from_date: str, num_dimensions: str, sql_files: str, num_of_views: str, to_date: str, time_offset: str, db_schema: str ) -> str: """ Check the parameters input. Args: usecase_name: The use case name. from_date: The reference date of the use case. num_dimensions: The number of dimensions chosen for analysis. sql_files: Name of the SQL files that will be submitted for the framework to process (e.g. file1.sql, file2.sql). num_of_views: Number of views the use case has. to_date: The end date of the snapshot configuration. time_offset: Hours related to the timezone (e.g. 8, -8). db_schema: Database name that lkp_query_builder is located. Returns: A message with the status of the validation. """ message = "" if ( usecase_name.strip() == "" or from_date.strip() == "" or num_dimensions.strip() == "" or sql_files.strip() == "" or num_of_views.strip() == "" or to_date.strip() == "" or db_schema.strip() == "" ): message = "WRONG CONFIGURATION:" if usecase_name.strip() == "": message += "\n\t - Please, add the Use Case Name." if from_date.strip() == "": message += "\n\t - Please, add the From Date." if num_dimensions.strip() == "": message += "\n\t - Please, add the Number of Dimensions." 
if sql_files.strip() == "": message += "\n\t - Please, add the SQL File Names." if num_of_views.strip() == "": message += "\n\t - Please, add the number of views." if to_date.strip() == "": message += "\n\t - Please, add the to date value. This information is mandatory. " message += "Keep it as 'to_date' unless you change its name in your SQL files." if db_schema.strip() == "": message += "\n\t - Please, add the database schema where the lkp_query_builder table is located." if time_offset.strip(): try: int(re.findall('-?\d+\.?\d*',time_offset.strip())[0]) except Exception: if message: message += "\n\t The timezone offset must be a number (e.g. 0, 12 or -8)." else: message = "WRONG CONFIGURATION:" message += "\n\t - The timezone offset must be a number (e.g. 0, 12 or -8)." if num_dimensions.strip(): try: int(num_dimensions) if int(num_dimensions) == 0: message = "WRONG CONFIGURATION:" message += "\n\t - The number of dimensions must be greater than zero." except Exception: if message: message += "\n\t - The number of dimensions must be an integer." else: message = "WRONG CONFIGURATION:" message += "\n\t - The number of dimensions must be an integer." if sql_files.strip(): files_list = self._sort_files(sql_files) for file in files_list: sql_files_err = f"""\n\t - Check the SQL file name '{file}'. """ sql_files_err += "It must follow the pattern x_file_name (X is an integer digit)." 
"" try: int(re.match("(.*?)_", file).group()[:-1]) except Exception: if message: message += sql_files_err else: message = "WRONG CONFIGURATION:" message += sql_files_err if not message: message = "Validation status: OK" return print(message) def create_sql_statement( self, usecase_name: str, market: str, stages_dict: dict, recon_dict: dict, time_offset: str, week_start: str, is_active: str, complexity: str, db_schema: str, dims_dict: dict, dimensions: str, from_date: str, to_date: str, metrics_dict: dict, ) -> tuple[str, str]: """ Create the SQL statement to insert data into lkp_query_builder_table. Args: usecase_name: The name of use case. market: The market used for the use case (APAC, GLOBAL, NAM, NIGHTLY). stages_dict: A dictionary of stages and it's configurations. recon_dict: A dictionary of reconciliation setup. time_offset: Hours related to the timezone (e.g. 8, -8). week_start: Day of the start of the week (e.g. Sunday, Monday) is_active: If the use case is active or not. (e.g. Y, N) complexity: The categories are directly related to the number of workers in each cluster. That is, High = 10 workers, Medium = 6 workers and Low = 4 workers. db_schema: Database name that lkp_query_builder is located. dims_dict: The dictionary of views and it's setup. dimensions: Store supporting information to the fact table. from_date: Aggregating date column for the use case. to_date: Contains the current date (default value is to_date). Information used as template for the framework. metrics_dict: The dictionary of metrics and it's setup. Returns: A tuple with a text formatted with the delete and insert statement. 
""" dbutils.widgets.removeAll() mapping_dict = self._get_mapping(dims_dict, dimensions, from_date, to_date, metrics_dict) query_id = self._generate_query_id(usecase_name) query_label = f"'{usecase_name}'" query_type = f"'{market}'" mapping_str = json.dumps(mapping_dict, indent=4) mappings = '"""' + mapping_str.replace('"', "'").replace("#+#-#", '\\"') + '"""' steps_str = json.dumps(stages_dict, indent=4) intermediate_stages = '"""' + steps_str.replace('"', "'") + '"""' recon_str = json.dumps(recon_dict) recon_window = '"""' + recon_str.replace('"', "'") + '"""' col_time_offset = f"'{time_offset}'" start_of_week = f"'{week_start}'" col_is_active = f"'{is_active}'" queue = f"'{complexity}'" delete_sttmt = f"""DELETE FROM {db_schema}.lkp_query_builder WHERE QUERY_LABEL = {query_label};""" insert_sttmt = f"""INSERT INTO {db_schema}.lkp_query_builder VALUES ( {query_id}, {query_label}, {query_type}, {mappings}, {intermediate_stages}, {recon_window}, {col_time_offset}, {start_of_week}, {col_is_active}, {queue}, current_timestamp());""" return delete_sttmt, insert_sttmt def get_dimensions(self, num_dimensions: str) -> str: """ Get the dimensions set on the widgets and validate. Args: num_dimensions: The number of dimensions set. Returns: A string with comma-separated dimensions names. """ dimensions = "" list_status = [] for i in range(int(num_dimensions)): i = i + 1 if re.match(self.regex_no_special_characters, dbutils.widgets.get(f"D{i}").strip()): dimensions += "," + dbutils.widgets.get(f"D{i}").strip() list_status.append("success") else: print("WRONG CONFIGURATION:") print(f"\t- {dbutils.widgets.get(f'D{i}')} is empty of malformed!") print( "\t Names can contain only alphanumeric characters and must begin with " "an alphabetic character or an underscore (_)." 
) list_status.append("fail") if "fail" not in list_status: print("Dimensions validation status: OK") return dimensions[1:] @classmethod def get_recon_choices(cls) -> list: """ Return all possible combinations for cadences, reconciliations and the snapshot flag value (Y,N). Returns: List used to generate a multiselect widget for the users to interact with. """ return [ "DAY", "DAY-WEEK-N", "DAY-MONTH-N", "DAY-QUARTER-N", "DAY-YEAR-N", "WEEK", "WEEK-DAY-N", "WEEK-DAY-Y", "WEEK-MONTH-N", "WEEK-QUARTER-N", "WEEK-YEAR-N", "MONTH", "MONTH-DAY-N", "MONTH-DAY-Y", "MONTH-WEEK-Y", "MONTH-WEEK-N", "MONTH-QUARTER-N", "MONTH-YEAR-N", "QUARTER", "QUARTER-DAY-N", "QUARTER-DAY-Y", "QUARTER-WEEK-N", "QUARTER-WEEK-Y", "QUARTER-MONTH-N", "QUARTER-MONTH-Y", "QUARTER-YEAR-N", "YEAR", "YEAR-DAY-N", "YEAR-DAY-Y", "YEAR-WEEK-N", "YEAR-WEEK-Y", "YEAR-MONTH-N", "YEAR-MONTH-Y", "YEAR-QUARTER-N", "YEAR-QUARTER-Y", ] @classmethod def get_metric_configuration(cls, num_of_metrics: str) -> dict: """ Get metrics information based on the widget setup. Args: num_of_metrics: Number of metrics selected. Returns: metrics_dict: The dictionary of metrics and their setup. 
""" metrics_dict = {} for i in range(int(num_of_metrics)): i = i + 1 if dbutils.widgets.get(f"metric_name{i}"): metrics_dict[f"m{i}"] = { "metric_name": dbutils.widgets.get(f"metric_name{i}"), "calculated_metric": {}, "derived_metric": {}, } calculated_metric_list = list(filter(None, dbutils.widgets.get(f"calculated_metric{i}").split(","))) for calc_metric in calculated_metric_list: if calc_metric == "last_cadence": metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}}) # add label and window for last_cadence dbutils.widgets.text( name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label" ) dbutils.widgets.text( name=f"{i}_{calc_metric}_window", defaultValue="", label=f"{i}_{calc_metric}.Window" ) if calc_metric == "last_year_cadence": metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}}) # add label and window for last_cadence dbutils.widgets.text( name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label" ) if calc_metric == "window_function": metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}}) # add label and window for window_function dbutils.widgets.text( name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label" ) dbutils.widgets.text( name=f"{i}_{calc_metric}_window", defaultValue="", label=f"{i}_{calc_metric}.Window Interval", ) dbutils.widgets.dropdown( name=f"{i}_{calc_metric}_agg_func", defaultValue="sum", label=f"{i}_{calc_metric}.Agg Func", choices=["sum", "avg", "max", "min", "count"], ) # add label and window for derived_metric if calc_metric == "derived_metric": dbutils.widgets.text( name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label" ) dbutils.widgets.text( name=f"{i}_{calc_metric}_formula", defaultValue="", label=f"{i}_{calc_metric}.Formula" ) print("Metric configuration status: OK") else: print("WRONG CONFIGURATION:") print("\t- The metric name is mandatory!") return metrics_dict def 
get_recon_config(self, recon_list: list) -> dict: """ Get reconciliation information based on the widget setup. Args: recon_list: List of cadences setup for the reconciliation. Returns: A dictionary of reconciliation setup. """ cadence_list = [] # create a list with the distinct cadences values. for cadence in recon_list: cadence_name = cadence.split("-")[0] cadence_list.append(cadence_name) cadence_list = list(dict.fromkeys(cadence_list)) # create a dict with the structure of each cadence. recon_dict = {} for cad in cadence_list: recon_dict[f"{cad}"] = {} recon_dict[f"{cad}"]["recon_window"] = {} # updates the dict of each cadence with the recon configurations selected. for cadence in recon_list: if cadence in self.cadences: recon_dict[f"{cad}"]["recon_window"] = {} else: cadence_name = cadence.split("-")[0] recon = cadence.split("-")[1] snapshot = cadence.split("-")[2] for cad in cadence_list: if cadence_name == cad: recon_dict[cad]["recon_window"].update({recon: {"snapshot": snapshot}}) # remove empty recon_window when the selected just cadence. for cadence in recon_list: if cadence in ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"]: if recon_dict[f"{cadence}"]["recon_window"] == {}: del recon_dict[f"{cadence}"]["recon_window"] if recon_dict: print("Reconciliation configuration status: OK") else: print("WRONG CONFIGURATION:") print("\t- The recon information is mandatory!") return recon_dict def get_stages(self, sql_files_list: list, usecase_name: str) -> dict: """ Set stages based on the widget setup. Args: sql_files_list: A list of sql files and their setup. usecase_name: The use case name. Returns: stages_dict: A dictionary of stages and their setup. 
""" stages_dict = {} i = 0 list_status = [] for file in sql_files_list: i = i + 1 if dbutils.widgets.get(name=f"{i}_script_table_alias"): stages_dict[f"{i}"] = { "file_path": usecase_name + "/" + file.strip() + ".sql", "table_alias": dbutils.widgets.get(name=f"{i}_script_table_alias"), "storage_level": dbutils.widgets.get(name=f"{i}_script_storage_level"), "project_date_column": dbutils.widgets.get(name=f"{i}_script_project_dt_col"), "filter_date_column": dbutils.widgets.get(name=f"{i}_script_filter_dt_col"), } repartition_value = self._format_keys_list(dbutils.widgets.get(name=f"{i}_script_repartition_value")) stages_dict[f"{i}"]["repartition"] = {} if dbutils.widgets.get(name=f"{i}_script_repartition_type") == "NUMBER": try: int(dbutils.widgets.get(name=f"{i}_script_repartition_value").split(",")[0]) stages_dict[f"{i}"]["repartition"] = { "numPartitions": dbutils.widgets.get(name=f"{i}_script_repartition_value") .split(",")[0] .replace("'", "") } except Exception: print("The repartition value must be INTEGER when the type is defined as NUMBER.") list_status.append("fail") elif dbutils.widgets.get(name=f"{i}_script_repartition_type") == "KEY": stages_dict[f"{i}"]["repartition"] = {"keys": repartition_value} else: print(f"The field script alias is missing for {i}.Script Table Alias. This field is mandatory!") stages_dict = {} list_status.append("fail") if "fail" not in list_status: print("Stages configuration status: OK") return stages_dict def get_view_information(self, num_of_views: str) -> dict: """ Get the views information based on the widget setup. Args: num_of_views: Number of views selected. Returns: The dictionary of views and their setup. 
""" dims_dict = {} for i in range(int(num_of_views)): i = i + 1 if re.match(self.regex_no_special_characters, dbutils.widgets.get(f"view_name{i}")): dims_dict[f"view_name{i}"] = { "name": dbutils.widgets.get(f"view_name{i}"), "filter": dbutils.widgets.get(f"view_filter{i}").replace("'", "#+#-#").replace('"', "#+#-#"), } print("Views validation status: OK") else: print("WRONG CONFIGURATION:") print("\t- View name is empty of malformed!") print( "\t Names can contain only alphanumeric characters and must begin with " "an alphabetic character or an underscore (_)." ) return dims_dict @classmethod def insert_data_into_lkp_query_builder(cls, delete_sttmt: str, insert_sttmt: str): """ Insert data into the lkp query builder table. Args: delete_sttmt: The delete statement. insert_sttmt: The insert statement. """ try: spark.sql(f"{delete_sttmt}") spark.sql(f"{insert_sttmt}") print("CONFIGURATION INSERTED SUCCESSFULLY!") except Exception as e: print(e) def print_definitions( self, usecase_name, market, from_date, to_date, dimensions, time_offset, week_start, is_active, num_of_views, complexity, sql_files, db_schema, dims_dict: dict = None, recon_dict: dict = None, metrics_dict: dict = None, stages_dict: dict = None, ): """ Print the definitions set on widgets. Args: usecase_name: The name of use case. market: The market used for the use case (APAC, GLOBAL, NAM, NIGHTLY). from_date: Aggregating date column for the use case. to_date: Contains the current date (default value is to_date). Information used as template for the framework. dimensions: Store supporting information to the fact table time_offset: Hours related to the timezone (e.g. 8, -8). week_start: Day of the start of the week (e.g. Sunday, Monday) is_active: If the use case is active or not. (e.g. Y, N) num_of_views: Number of views desired for the use case (e.g. 1, 2, 3). complexity: The categories are directly related to the number of workers in each cluster. 
@classmethod
def set_dimensions(cls, num_dimensions: str):
    """
    Create one text widget per dimension (`D1`, `D2`, ...).

    All existing widgets are removed first.

    Args:
        num_dimensions: Number of dimensions selected.
    """
    dbutils.widgets.removeAll()

    total = int(num_dimensions)
    for position in range(1, total + 1):
        widget_name = f"D{position}"
        dbutils.widgets.text(name=widget_name, defaultValue="", label=f"D{position}.Dimension Name")

    print("Please, configure the dimensions using the widgets and proceed to the next cmd.")
""" for i in range(int(num_of_metrics)): i = i + 1 calculated_metric_list = list(filter(None, dbutils.widgets.get(f"calculated_metric{i}").split(","))) if calculated_metric_list: for calc_metric in calculated_metric_list: self._validate_metrics_config(calc_metric, metrics_dict, i) else: print("Extra metrics configuration status: OK") @classmethod def set_metric(cls, num_of_metrics: str): """ Set metrics information based on the widget setup. Args: num_of_metrics: Number of metrics selected. """ dbutils.widgets.removeAll() for i in range(1, int(num_of_metrics) + 1): dbutils.widgets.text(name=f"metric_name{i}", defaultValue="", label=f"{i}.Metric Name") dbutils.widgets.multiselect( name=f"calculated_metric{i}", defaultValue="", label=f"{i}.Calculated Metric", choices=["", "last_cadence", "last_year_cadence", "window_function", "derived_metric"], ) print("Please, configure the metrics using the widgets and proceed to the next cmd.") def set_stages(self, sql_files: list) -> list: """ Set stages based on the widget setup. Args: sql_files: The SQL file names that will be used in the use case. Returns: sql_files_list: A list of sql files and their setup. 
""" dbutils.widgets.removeAll() sql_files_list = self._sort_files(sql_files) for i in range(1, len(sql_files_list) + 1): dbutils.widgets.dropdown( name=f"{i}_script_storage_level", defaultValue="MEMORY_ONLY", label=f"{i}.Storage Level", choices=[ "DISK_ONLY", "DISK_ONLY_2", "DISK_ONLY_3", "MEMORY_AND_DISK", "MEMORY_AND_DISK_2", "MEMORY_AND_DISK_DESER", "MEMORY_ONLY", "MEMORY_ONLY_2", "OFF_HEAP", ], ) dbutils.widgets.text(name=f"{i}_script_table_alias", defaultValue="", label=f"{i}.Table Alias") dbutils.widgets.text(name=f"{i}_script_project_dt_col", defaultValue="", label=f"{i}.Project Date Column") dbutils.widgets.text(name=f"{i}_script_filter_dt_col", defaultValue="", label=f"{i}.Filter Date Column") dbutils.widgets.dropdown( name=f"{i}_script_repartition_type", defaultValue="", label=f"{i}.Repartition Type", choices=["", "KEY", "NUMBER"], ) dbutils.widgets.text(name=f"{i}_script_repartition_value", defaultValue="", label=f"{i}.Repartition Value") print("Please, configure the stages using the widgets and proceed to the next cmd.") return sql_files_list @classmethod def set_views(cls, num_of_views: str): """ Set views that will be used in the use case. Args: num_of_views: Number of views selected. """ dbutils.widgets.removeAll() for i in range(1, int(num_of_views) + 1): dbutils.widgets.text(name=f"view_name{i}", defaultValue="", label=f"{i}.View Name") dbutils.widgets.text(name=f"view_filter{i}", defaultValue="", label=f"{i}.View Filter") print("Please, configure the views using the widgets and proceed to the next cmd.") @classmethod def _format_keys_list(cls, key_str: str) -> list: """ Format the list of keys based on the widget keys data provided. Args: key_str: Input text with key column names. Returns: A formatted list with the keys selected for repartitioning. 
""" key_list = key_str.strip().split(",") output_list = [] for key in key_list: output_list.append(key.replace("'", "").replace('"', "").strip()) return output_list @classmethod def _generate_query_id(cls, usecase_name: str) -> int: """ Generate the query id for the lookup query builder table. The logic to create the ID is a hash of the use case name converted to an integer. Args: usecase_name: The name of use case. Returns: The use case name hashed. """ hash_val = int(str(hash(usecase_name))[0:9]) return hash_val if hash_val > 0 else hash_val * -1 @classmethod def _get_mapping(cls, dims_dict: dict, dimensions: str, from_date: str, to_date: str, metrics_dict: dict) -> dict: """ Get mappings based on the dimensions defined on the widget setup. Args: dims_dict: A dictionary of dimensions. dimensions: Store supporting information to the fact table. from_date: Aggregating date column for the use case. to_date: Contains the current date (default value is to_date). Information used as template for the framework. metrics_dict: The dictionary of metrics and their setup. Returns: mapping_dict: A dictionary of mappings configuration. """ mapping_dict = {} for key in dims_dict: mapping_dict.update({dims_dict[key]["name"]: {"dimensions": {}, "metric": {}, "filter": {}}}) i = 0 for d in dimensions.split(","): i = i + 1 mapping_dict[dims_dict[key]["name"]]["dimensions"].update( {"from_date": from_date, "to_date": to_date, f"d{i}": d.strip()} ) mapping_dict[dims_dict[key]["name"]]["metric"].update(metrics_dict) if dims_dict[key]["filter"]: mapping_dict[dims_dict[key]["name"]]["filter"] = dims_dict[key]["filter"] return mapping_dict @classmethod def _print_dims_dict(cls, dims_dict: dict): """ Print the dictionary of dimensions and views formatted. Args: dims_dict: The dictionary of views and their setup. 
@classmethod
def _print_derived_metrics(cls, key_metrics: str, derived_metric: str, metrics_dict: dict):
    """
    Print the derived metric configuration of a metric, formatted.

    Nothing is printed unless the configuration name is exactly
    "derived_metric" and that configuration is not empty.

    Args:
        key_metrics: The key name of each metric configured (e.g. m1, m2, m3).
        derived_metric: The name of the derived metric configuration
            (e.g. last_cadence, last_year_cadence, derived_metric, window_function).
        metrics_dict: The dictionary of metrics and their setup.
    """
    if derived_metric != "derived_metric":
        return

    configured = metrics_dict[key_metrics][derived_metric]
    if not configured:
        return

    print(f"\t- {derived_metric}:")
    # only the first entry of the configuration list is displayed
    for setting_name, setting_value in configured[0].items():
        print(f"\t - {setting_name} = {setting_value}")
@classmethod
def _print_recon_dict(cls, recon_dict: dict):
    """
    Print the reconciliation setup, formatted.

    A cadence without any reconciliation setup is printed on its own; otherwise
    each reconciliation entry is listed with the snapshot flag of its values.

    Args:
        recon_dict: A dictionary of reconciliation setup.
    """
    if not recon_dict:
        return

    print("RECON CONFIGURED:")
    for cadence in recon_dict:
        if recon_dict[str(cadence)] == {}:
            print(f"{cadence}")
            continue

        print(f"{cadence}:")
        for recon_name, recon_values in recon_dict[cadence].items():
            print(f" {recon_name}:")
            for recon_val, settings in recon_values.items():
                print(f"\t- {recon_val}:snapshot = {settings['snapshot']}")
@classmethod
def _sort_files(cls, sql_files: str) -> list:
    """
    Create a list sorted alphabetically based on the sql files provided.

    Each name is stripped of surrounding whitespace and lower-cased before
    sorting. Blank entries (e.g. produced by a trailing comma or an empty
    input) are dropped, so they never show up as a nameless file.

    Args:
        sql_files: Name of the SQL files that will be sent to the framework to process
            (e.g. file1.sql, file2.sql).

    Returns:
        A list of sql files sorted alphabetically.
    """
    normalized = (name.strip().lower() for name in sql_files.split(","))
    # built-in sort replaces the hand-written bubble sort (same ordering, O(n log n))
    return sorted(name for name in normalized if name)
""" if calc_metric == "last_cadence": if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "": try: int(dbutils.widgets.get(f"{widget_index}_{calc_metric}_window")) metrics_dict[f"m{widget_index}"]["calculated_metric"].update( { f"{calc_metric}": [ { "label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"), "window": dbutils.widgets.get(f"{widget_index}_{calc_metric}_window"), } ] } ) print(f"{calc_metric} configuration status: OK") except Exception: print(f"{calc_metric} - WRONG CONFIGURATION:") print(f"\t- The {calc_metric} window value must be INTEGER.") else: print(f"{calc_metric} - WRONG CONFIGURATION:") print(f"\t- The {calc_metric} label is mandatory.") elif calc_metric == "last_year_cadence": if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "": metrics_dict[f"m{widget_index}"]["calculated_metric"].update( { f"{calc_metric}": [ { "label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"), "window": 1, } ] } ) print(f"{calc_metric} configuration status: OK") else: print(f"{calc_metric} - WRONG CONFIGURATION:") print(f"\t- The {calc_metric} label is mandatory.") elif calc_metric == "window_function": if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "": window_list = dbutils.widgets.get(f"{widget_index}_{calc_metric}_window").split(",") if len(window_list) > 1: metrics_dict[f"m{widget_index}"]["calculated_metric"].update( { f"{calc_metric}": [ { "label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"), "window": [int(x.strip()) for x in window_list], "agg_func": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_agg_func"), } ] } ) print(f"{calc_metric} configuration status: OK") else: print(f"{calc_metric} - WRONG CONFIGURATION:") print( "\t- The window function must follow the pattern of " "two integer digits separated with comma (e.g. 3,1)." 
) else: print(f"{calc_metric} - WRONG CONFIGURATION:") print("\t- The window_function label is mandatory.") elif calc_metric == "derived_metric": if ( dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_label").strip() != "" and dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_formula").strip() != "" ): metrics_dict[f"m{widget_index}"].update( { f"{calc_metric}": [ { "label": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_label"), "formula": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_formula"), } ] } ) print(f"{calc_metric} configuration status: OK") else: print(f"{calc_metric} - WRONG CONFIGURATION:") print("\t- The derived_metric label and formula are mandatory.") ================================================ FILE: cicd/.bumpversion.cfg ================================================ [bumpversion] current_version = 2.0.0 commit = False tag = False [bumpversion:file:pyproject.toml] search = version = "{current_version}" replace = version = "{new_version}" ================================================ FILE: cicd/Dockerfile ================================================ ARG PYTHON_IMAGE=python:3.12-slim-bullseye FROM $PYTHON_IMAGE ARG USER_ID=1000 ARG GROUP_ID=1000 ARG CPU_ARCHITECTURE # Install Prerequisites RUN mkdir -p /usr/share/man/man1 && \ apt-get -y update && \ apt-get install -y wget=1.21* gnupg2=2.2* git=1:2* g++=4:10.2.1* rsync=3.2* && \ apt-get -y clean # Install jdk RUN mkdir -p /etc/apt/keyrings && \ wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | gpg --dearmor | tee /etc/apt/trusted.gpg.d/adoptium.gpg > /dev/null && \ echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list && \ apt-get -y update && \ apt-get -y install temurin-17-jdk && \ apt-get -y clean ENV JAVA_HOME=/usr/lib/jvm/temurin-17-jdk-${CPU_ARCHITECTURE} # useradd -l is necessary to avoid docker build hanging in 
export image phase when using large uids RUN groupadd -g ${GROUP_ID} appuser && \ useradd -rm -l -u ${USER_ID} -d /home/appuser -s /bin/bash -g appuser appuser COPY cicd/requirements_full.lock /tmp/requirements.txt USER appuser ENV PATH="/home/appuser/.local/bin:$PATH" RUN python -m pip install --upgrade pip==25.2 setuptools==74.* --user RUN python -m pip install --user -r /tmp/requirements.txt RUN mkdir /home/appuser/.ssh/ && touch /home/appuser/.ssh/known_hosts RUN echo Image built for $CPU_ARCHITECTURE with python image $PYTHON_IMAGE. ================================================ FILE: cicd/Jenkinsfile ================================================ @Library(['GlobalJenkinsLibrary']) _ pipeline { options { buildDiscarder(logRotator(numToKeepStr: '30', artifactNumToKeepStr: '30')) timeout(time: 2, unit: 'HOURS') disableConcurrentBuilds() skipDefaultCheckout(true) ansiColor('xterm') timestamps() } agent { node { label 'lakehouse_base' } } environment { VERSION = env.BRANCH_NAME.replaceAll("[/-]", "_").toLowerCase() GIT_CREDENTIALS_ID = "git-lakehouse-cicd" } stages { stage('cleanup workspace') { steps { cleanWs(disableDeferredWipeout: true, deleteDirs: true) } } stage('Clone') { steps { retry(3) { script { checkout([ $class : 'GitSCM', branches : scm.branches, userRemoteConfigs: [[url: 'https://bitbucket.tools.3stripes.net/scm/lak/lakehouse-engine.git', credentialsId: GIT_CREDENTIALS_ID]] ]) } } } } stage('Build Image') { steps { sh 'make build-image version=$VERSION' } } stage('Create Docs') { steps { sh 'make docs version=$VERSION' } } stage('Parallel') { parallel { stage('Lint') { steps { sh 'make lint version=$VERSION' } } stage('Test Security') { steps { sh 'make test-security version=$VERSION' } } stage('Audit Dependency Safety'){ steps{ catchError(message: "${STAGE_NAME} is unstable", buildResult: 'SUCCESS', stageResult: 'UNSTABLE') { sh 'make audit-dep-safety version=$VERSION' } } } stage('Test dependencies') { steps { sh 'make test-deps 
version=$VERSION' } } stage('Test') { steps { sh 'make test version=$VERSION' } } } } stage('Sonar') { steps { script { tools.sonar.run(env: 'COMMUNITY-PRD', version: '1.0', branch: env.BRANCH_NAME) } } } } post { always { archiveArtifacts artifacts: 'artefacts/docs/**/*' archiveArtifacts artifacts: 'artefacts/*.json' junit 'artefacts/tests.xml' step([$class: 'CoberturaPublisher', coberturaReportFile: 'artefacts/coverage.xml']) } } } ================================================ FILE: cicd/Jenkinsfile_deploy ================================================ pipeline { parameters { string(name: 'BRANCH', defaultValue: 'master', description: 'Branch to use for the deployment process.') string(name: 'VERSION', defaultValue: null, description: 'Version to deploy (git tag in master branch without the "v"). E.g., 0.2.0. If you are deploying to dev, from your branch, ignore this.') booleanParam(name: 'SKIP_VALIDATIONS', defaultValue: false, description: 'Whether to skip the validations. Only applicable for feature releases to make them faster.') booleanParam(name: 'SKIP_OS_DEPLOYMENT', defaultValue: false, description: 'Whether to skip the OS Deployment related stages or not.') booleanParam(name: 'NOTIFY', defaultValue: true, description: 'Whether to notify the release or not.') } options { buildDiscarder(logRotator(numToKeepStr: '100', artifactNumToKeepStr: '30')) timeout(time: 2, unit: 'HOURS') disableConcurrentBuilds() skipDefaultCheckout(true) ansiColor('xterm') timestamps() } agent { node { label 'lakehouse_base' } } environment { PYPI_CREDENTIALS = credentials('pypi-credentials') ARTIFACTORY_CREDENTIALS = credentials('artifactory-credentials') GIT_CREDENTIALS_ID = "git-lakehouse-cicd" GIT_CREDENTIALS_LAK = credentials('push-to-github-lak') GIT_CREDENTIALS_LAK_DOCS = credentials('push-to-github-lak-docs') DEPLOY_VERSION = getDeploymentVersion() DEPLOY_GIT_OBJECT = getDeploymentGitObject() } stages { stage('cleanup workspace') { steps { 
cleanWs(disableDeferredWipeout: true, deleteDirs: true) } } stage('Clone') { steps { retry(3) { script { checkout([ $class : 'GitSCM', branches : [['name': env.DEPLOY_GIT_OBJECT]], userRemoteConfigs: [[url: 'https://bitbucket.tools.3stripes.net/scm/lak/lakehouse-engine.git', credentialsId: GIT_CREDENTIALS_ID]] ]) } } } } stage('Build Image') { steps { sh 'make build-image version=' + "${env.DEPLOY_VERSION}" } } stage('Parallel') { when { expression { (!params.SKIP_VALIDATIONS && params.BRANCH != 'master') } } parallel { stage('Lint') { steps { sh 'make lint version=' + "${env.DEPLOY_VERSION}" } } stage('Test Security') { steps { sh 'make test-security version=' + "${env.DEPLOY_VERSION}" } } stage('Audit Dependency Safety'){ steps{ catchError(message: "${STAGE_NAME} is unstable", buildResult: 'SUCCESS', stageResult: 'UNSTABLE') { sh 'make audit-dep-safety version=$VERSION' } } } stage('Test dependencies') { steps { sh 'make test-deps version=' + "${env.DEPLOY_VERSION}" } } stage('Test') { steps { sh 'make test version=' + "${env.DEPLOY_VERSION}" } } } } stage('Deploy') { steps { script { sh 'make deploy version=' + "${env.DEPLOY_VERSION}" + ' artifactory_credentials_file=$ARTIFACTORY_CREDENTIALS' } } } stage('Open Source Deployment') { when { expression { (params.BRANCH == 'master' && !params.SKIP_OS_DEPLOYMENT) } } stages { stage('Sync Code with GitHub') { steps { script { sh 'make sync-to-github version=' + "${env.DEPLOY_VERSION}" + ' git_credentials_file=$GIT_CREDENTIALS_LAK repository=lakehouse-engine' } } } stage('Deploy Docs to Github') { steps { script { sh 'make deploy-docs-to-github version=' + "${env.DEPLOY_VERSION}" + ' git_credentials_file=$GIT_CREDENTIALS_LAK_DOCS repository=lakehouse-engine-docs os_deployment=True' } } } stage('Deploy to Pypi') { steps { script { // we are forcing make build as it was not happening sometimes, for no reason. 
sh 'make build os_deployment=True' sh 'make deploy-to-pypi-and-clean os_deployment=True version=' + "${env.DEPLOY_VERSION}" + ' pypi_credentials_file=$PYPI_CREDENTIALS' } } } } } stage('Notify') { when { expression { params.BRANCH == 'master' && params.NOTIFY } } steps { script { params = readYaml file: 'cicd/meta.yaml' release_notes = sh(script:'cat CHANGELOG.md | cut -d ")" -f 2 | head -n 10', returnStdout: true).trim() recipients = params["mail_recipients"].join(";") emailext( attachLog: false, compressLog: true, body: """
A new version $env.DEPLOY_VERSION of the Lakehouse Engine was deployed into Artifactory.

You can install it just like any other python library, either notebook scoped with pip install or cluster scoped by specifying the library in the cluster configuration.: You can check the lakehouse-engine documentation here: ${params["engine_docs"]}. Check the latest updates here:
                            ${release_notes}
                            

/**
 * Get deployment version given certain Jenkins parameters and the team's deployment guidelines.
 *
 * On master the VERSION parameter must be a plain <major>.<minor>.<patch> version and is returned as is.
 * On any other branch the branch name is used as the version, with "/" and "-" replaced by "_" and lower-cased.
 *
 * @return deployment version
 * @throws Exception when deploying from master with a VERSION that is not a valid version tag
 */
def String getDeploymentVersion() {
    // `def` keeps the variable local to this method; a bare assignment would leak into the script binding.
    def version = params.VERSION
    if (params.BRANCH == 'master') {
        if (version ==~ '[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}') {
            return version
        }
        throw new Exception("Version ${version} does not match valid git version tag. It should be in the form of <major>.<minor>.<patch>.")
    }
    // force branch as the version to be deployed when we are dealing with feature branches.
    return params.BRANCH.replaceAll("[/-]", "_").toLowerCase()
}
*/ .pdoc { color: var(--text); /* enforce some styling even if bootstrap reboot is not included */ box-sizing: border-box; line-height: 1.5; /* override background from pygments */ /*unnecessary since pdoc 10, only left here to keep old custom templates working. */ background: none; } .pdoc .pdoc-button { cursor: pointer; display: inline-block; border: solid black 1px; border-radius: 2px; font-size: .75rem; padding: calc(0.5em - 1px) 1em; transition: 100ms all; } /* Admonitions */ .pdoc .pdoc-alert { padding: 1rem 1rem 1rem calc(1.5rem + 24px); border: 1px solid transparent; border-radius: .25rem; background-repeat: no-repeat; background-position: 1rem center; margin-bottom: 1rem; } .pdoc .pdoc-alert > *:last-child { margin-bottom: 0; } /* Admonitions are currently not stylable via theme.css */ .pdoc .pdoc-alert-note { color: #000000; background-color: #f1efef; border-color: #f1f1f1; background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/info-circle-fill.svg' %}{% endfilter %}"); } .pdoc .pdoc-alert-warning { color: #664d03; background-color: #fff3cd; border-color: #ffecb5; background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/exclamation-triangle-fill.svg' %}{% endfilter %}"); } .pdoc .pdoc-alert-danger { color: #842029; background-color: #f8d7da; border-color: #f5c2c7; background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/lightning-fill.svg' %}{% endfilter %}"); } .pdoc .visually-hidden { position: absolute !important; width: 1px !important; height: 1px !important; padding: 0 !important; margin: -1px !important; overflow: hidden !important; clip: rect(0, 0, 0, 0) !important; white-space: nowrap !important; border: 0 !important; } .pdoc h1, .pdoc h2, .pdoc h3 { font-weight: 300; margin: .3em 0; padding: .2em 0; } .pdoc > section:not(.module-info) h1 { font-size: 1.5rem; font-weight: 500; } .pdoc > section:not(.module-info) h2 { font-size: 1.4rem; font-weight: 500; } 
.pdoc > section:not(.module-info) h3 { font-size: 1.3rem; font-weight: 500; } .pdoc > section:not(.module-info) h4 { font-size: 1.2rem; } .pdoc > section:not(.module-info) h5 { font-size: 1.1rem; } .pdoc a { text-decoration: none; color: var(--link); } .pdoc a:hover { color: var(--link-hover); } .pdoc blockquote { margin-left: 2rem; } .pdoc pre { border-top: 1px solid var(--accent2); border-bottom: 1px solid var(--accent2); margin-top: 0; margin-bottom: 1em; padding: .5rem 0 .5rem .5rem; overflow-x: auto; /*unnecessary since pdoc 10, only left here to keep old custom templates working. */ background-color: var(--code); } .pdoc code { color: var(--text); padding: .2em .4em; margin: 0; font-size: 85%; background-color: var(--accent); border-radius: 6px; } .pdoc a > code { color: inherit; } .pdoc pre > code { display: inline-block; font-size: inherit; background: none; border: none; padding: 0; } .pdoc > section:not(.module-info) { /* this margin should collapse with docstring margin, but not for the module docstr which is followed by view_source. 
*/ margin-bottom: 1.5rem; } /* Page Heading */ .pdoc .modulename { margin-top: 0; font-weight: bold; } .pdoc .modulename a { color: var(--link); transition: 100ms all; } /* GitHub Button */ .pdoc .git-button { float: right; border: solid var(--link) 1px; } .pdoc .git-button:hover { background-color: var(--link); color: var(--pdoc-background); } .view-source-toggle-state, .view-source-toggle-state ~ .pdoc-code { display: none; } .view-source-toggle-state:checked ~ .pdoc-code { display: block; } .view-source-button { display: inline-block; float: right; font-size: .75rem; line-height: 1.5rem; color: var(--muted); padding: 0 .4rem 0 1.3rem; cursor: pointer; /* odd hack to reduce space between "bullet" and text */ text-indent: -2px; } .view-source-button > span { visibility: hidden; } .module-info .view-source-button { float: none; display: flex; justify-content: flex-end; margin: -1.2rem .4rem -.2rem 0; } .view-source-button::before { /* somewhat awkward recreation of a element. ideally we'd just use `display: inline list-item`, but that does not work in Chrome (yet), see https://crbug.com/995106. 
*/ position: absolute; content: "View Source"; display: list-item; list-style-type: disclosure-closed; } .view-source-toggle-state:checked ~ .attr .view-source-button::before, .view-source-toggle-state:checked ~ .view-source-button::before { list-style-type: disclosure-open; } /* Docstrings */ .pdoc .docstring { margin-bottom: 1.5rem; } .pdoc section:not(.module-info) .docstring { margin-left: clamp(0rem, 5vw - 2rem, 1rem); } .pdoc .docstring .pdoc-code { margin-left: 1em; margin-right: 1em; } /* Highlight focused element */ .pdoc h1:target, .pdoc h2:target, .pdoc h3:target, .pdoc h4:target, .pdoc h5:target, .pdoc h6:target, .pdoc .pdoc-code > pre > span:target { background-color: var(--active); box-shadow: -1rem 0 0 0 var(--active); } .pdoc .pdoc-code > pre > span:target { /* make the highlighted line full width so that the background extends */ display: block; } .pdoc div:target > .attr, .pdoc section:target > .attr, .pdoc dd:target > a { background-color: var(--active); } .pdoc * { scroll-margin: 2rem; } .pdoc .pdoc-code .linenos { user-select: none; } .pdoc .attr:hover { filter: contrast(0.95); } /* Header link */ .pdoc section, .pdoc .classattr { position: relative; } .pdoc .headerlink { --width: clamp(1rem, 3vw, 2rem); position: absolute; top: 0; left: calc(0rem - var(--width)); transition: all 100ms ease-in-out; opacity: 0; } .pdoc .headerlink::before { content: "#"; display: block; text-align: center; width: var(--width); height: 2.3rem; line-height: 2.3rem; font-size: 1.5rem; } .pdoc .attr:hover ~ .headerlink, .pdoc *:target > .headerlink, .pdoc .headerlink:hover { opacity: 1; } /* Attributes */ .pdoc .attr { display: block; margin: .5rem 0 .5rem; padding: .4rem .4rem .4rem 1rem; background-color: var(--accent); overflow-x: auto; } .pdoc .classattr { margin-left: 2rem; } .pdoc .name { color: var(--name); font-weight: bold; } .pdoc .def { color: var(--def); font-weight: bold; } .pdoc .signature { /* override pygments background color */ background-color: 
transparent; } .pdoc .param, .pdoc .return-annotation { white-space: pre; } .pdoc .signature.multiline .param { display: block; } .pdoc .signature.condensed .param { display:inline-block; } .pdoc .annotation { color: var(--annotation); } /* Show/Hide buttons for long default values */ .pdoc .view-value-toggle-state, .pdoc .view-value-toggle-state ~ .default_value { display: none; } .pdoc .view-value-toggle-state:checked ~ .default_value { display: inherit; } .pdoc .view-value-button { font-size: .5rem; vertical-align: middle; border-style: dashed; margin-top: -0.1rem; } .pdoc .view-value-button:hover { background: white; } .pdoc .view-value-button::before { content: "show"; text-align: center; width: 2.2em; display: inline-block; } .pdoc .view-value-toggle-state:checked ~ .view-value-button::before { content: "hide"; } /* Inherited Members */ .pdoc .inherited { margin-left: 2rem; } .pdoc .inherited dt { font-weight: 700; } .pdoc .inherited dt, .pdoc .inherited dd { display: inline; margin-left: 0; margin-bottom: .5rem; } .pdoc .inherited dd:not(:last-child):after { content: ", "; } .pdoc .inherited .class:before { content: "class "; } .pdoc .inherited .function a:after { content: "()"; } /* Search results */ .pdoc .search-result .docstring { overflow: auto; max-height: 25vh; } .pdoc .search-result.focused > .attr { background-color: var(--active); } /* "built with pdoc" attribution */ .pdoc .attribution { margin-top: 2rem; display: block; opacity: 0.5; transition: all 200ms; filter: grayscale(100%); } .pdoc .attribution:hover { opacity: 1; filter: grayscale(0%); } .pdoc .attribution img { margin-left: 5px; height: 35px; vertical-align: middle; width: 70px; transition: all 200ms; } .pdoc table { display: block; width: max-content; max-width: 150%; overflow: auto; margin-bottom: 1rem; } .pdoc table th, .pdoc table td { padding: 12px 13px; border: 1px solid var(--accent2); } .pdoc table th { font-weight: 600; } ================================================ FILE: 
def _search_files(file: str, search_string: str) -> int:
    """Searches for a string and outputs the line.

    Search for a given string in a file and output the zero-based index of the
    line where it is first found. Reading stops at the first match.

    Args:
        file: path of the file to be searched.
        search_string: string that will be searched for.

    Returns:
        The zero-based index of the first line where a given search_string appears.

    Raises:
        IndexError: if search_string does not appear in the file. The exception
            type is kept from the previous implementation, which indexed an
            empty list in that case.
    """
    with open(file, encoding="utf-8") as f:
        for index, line in enumerate(f):
            if search_string in line:
                return index
    raise IndexError(f"'{search_string}' was not found in file '{file}'.")
def _highlight_examples(method_name: str) -> str or None:
    """Creates a code snippet.

    Constructs and exposes the code snippet of a given method_name, taken from
    the example file registered for it in the examples configuration.

    Args:
        method_name: name of the module to be searched for.

    Returns:
        None when method_name has no entry or an empty entry in the examples
        configuration, otherwise the code snippet wrapped in html tags.
    """
    # Imported explicitly: the module-level `import pygments.formatters.html`
    # does not load the `pygments.lexers` submodule by itself.
    from pygments.lexers import JsonLexer

    if method_name not in lakehouse_engine_examples:
        return None

    file_path = f"../../{lakehouse_engine_examples[method_name]}"
    if file_path == "../../":
        warnings.warn(
            "No unit testing for the following transformer: " + method_name,
            RuntimeWarning,
            STACK_LEVEL,
        )
        return None

    first_line = _search_files(file_path, f'"function": "{method_name}"')
    with open(file_path) as json_file:
        acon_file = json.load(json_file)
    code_snippet = _get_dict_transformer(acon_file, method_name)

    # Defining the lexer which will parse through the snippet of code we want
    # to highlight
    lexer = JsonLexer()

    # Defining the format that will be outputted by the pygments library
    # (on our case it will output the code within html tags)
    formatter = pygments.formatters.html.HtmlFormatter(
        linenos="inline",
        anchorlinenos=True,
    )
    # number the snippet lines starting from where the transformer was found
    formatter.linenostart = first_line

    return Markup(pygments.highlight(code_snippet, lexer, formatter))
\n""" f"""View Example of {method_name} (See full example here)""" f"""
{json_example}
\n""" """
def define_env(env):
    """Declare environment for jinja2 templates for markdown.

    Registers `get_example` as a macro and patches the mkdocstrings Python
    handler so that markdown going through its `convert_markdown` filter is
    first rendered with `env.render`.

    Args:
        env: environment object handed to this hook; its `macro`, `conf` and
            `render` members are used here.
    """
    # expose the example helper as a macro
    for fn in [get_example]:
        env.macro(fn)

    # get mkdocstrings' Python handler
    python_handler = env.conf["plugins"]["mkdocstrings"].get_handler("python")

    # keep a reference to the original `update_env` so the patched version can delegate to it
    update_env = python_handler.update_env

    # override the `update_env` method of the Python handler
    def patched_update_env(md, config):
        # run the original setup first, so the `convert_markdown` filter exists
        update_env(md, config)

        # get the `convert_markdown` filter of the env
        convert_markdown = python_handler.env.filters["convert_markdown"]

        # build a chimera made of macros+mkdocstrings:
        # render the macros first, then hand the result to the original filter
        def render_convert(markdown: str, *args, **kwargs):
            return convert_markdown(env.render(markdown), *args, **kwargs)

        # patch the filter
        python_handler.env.filters["convert_markdown"] = render_convert

    # patch the method
    python_handler.update_env = patched_update_env
"tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json", "rename": "tests/resources/feature/schema_evolution/append_load/batch_append_disabled.json", "from_avro": "", "from_avro_with_registry": "", "from_json": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json", "to_json": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json", "condense_record_mode_cdc": "tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_init.json", "group_and_rank": "tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json", "hash_masker": "tests/resources/feature/transformations/data_maskers/hash_masking.json", "column_dropper": "tests/resources/feature/transformations/data_maskers/drop_columns.json", "add_current_date": "tests/resources/feature/transformations/date_transformers/streaming.json", "convert_to_date": "tests/resources/feature/transformations/date_transformers/streaming.json", "convert_to_timestamp": "tests/resources/feature/transformations/date_transformers/streaming.json", "format_date": "tests/resources/feature/transformations/date_transformers/streaming.json", "get_date_hierarchy": "tests/resources/feature/transformations/date_transformers/streaming.json", "incremental_filter": "tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_delta.json", "expression_filter": "tests/resources/feature/full_load/with_filter/batch.json", "column_filter_exp": "tests/resources/feature/transformations/multiple_transform/batch.json", "join": "tests/resources/feature/transformations/joiners/batch.json", "replace_nulls": "tests/resources/feature/transformations/null_handlers/replace_nulls_col_subset.json", "with_regex_value": "tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json", "coalesce": "tests/resources/feature/writers/acons/write_batch_console.json", "repartition": 
"tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming_delta.json", "get_transformer": "", "with_watermark": "tests/resources/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/streaming_drop_duplicates_overall_watermark.json" } ================================================ FILE: cicd/code_doc/gen_ref_nav.py ================================================ """Module to generate code reference docs.""" # Import necessary libraries from pathlib import Path import mkdocs_gen_files # Create a new navigation structure nav = mkdocs_gen_files.Nav() # Define the root directory and the source directory root = Path(__file__).parent src = root / "mkdocs/lakehouse_engine" print(f"Looking for files in {src}") # Loop over all Python files in the source directory for path in sorted(src.rglob("*.py")): # Get the module path and the documentation path for each file module_path = path.relative_to(src).with_suffix("") doc_path = path.relative_to(src / "").with_suffix(".md") full_doc_path = Path("reference", doc_path) # Split the module path into parts parts = tuple(module_path.parts) # Skip files that start with an underscore or have no parts if not parts: continue # If the file is an __init__.py file, remove the last part and rename the doc file to index.md if parts[-1] == "__init__" and str(parts[:-1]) != "()": parts = parts[:-1] doc_path = doc_path.with_name("index.md") full_doc_path = full_doc_path.with_name("index.md") elif parts[-1].startswith("_"): continue # Skip the loop iteration if there is no doc path if not doc_path: continue # If the doc path has at least one part, add it to the navigation if len(doc_path.parts) >= 1: nav_parts = [f"{part}" for part in parts] nav[tuple(nav_parts)] = doc_path.as_posix() # Open the full doc path and write the module identifier to it with mkdocs_gen_files.open(full_doc_path, "w") as fd: ident = ".".join(parts) fd.write(f"::: {ident}") # Set the edit path for the file 
mkdocs_gen_files.set_edit_path( full_doc_path, ".." / path.relative_to(root)) # Open the index.md file and write the built navigation to it with mkdocs_gen_files.open("reference/index.md", "w") as nav_file: nav_file.writelines(nav.build_literate_nav()) ================================================ FILE: cicd/code_doc/index.html.jinja2 ================================================ {% set root_module_name = "" %} {% extends "default/index.html.jinja2" %} {% block title %}Lakehouse Engine Documentation{% endblock %} {% block nav %}

Available Modules

{% endblock %} {% block content %}

Lakehouse Engine Documentation

{% filter to_html %} {% include "README.md" %} {% endfilter %}
{% if search %} {% include "search.html.jinja2" %} {% endif %} {% endblock %} ================================================ FILE: cicd/code_doc/mkdocs.yml ================================================ site_name: Lakehouse Engine Documentation site_url: https://adidas.github.io/lakehouse-engine-docs repo_url: https://github.com/adidas/lakehouse-engine repo_name: lakehouse-engine docs_dir: "mkdocs/docs" nav: - Lakehouse Engine: index.md - How to use the Lakehouse Engine?: - Overview: lakehouse_engine_usage/lakehouse_engine_usage.md - Algorithms: - Data Loader: - Overview: lakehouse_engine_usage/data_loader/data_loader.md - Scenarios: - Append Load from JDBC with PERMISSIVE mode (default): lakehouse_engine_usage/data_loader/append_load_from_jdbc_with_permissive_mode/append_load_from_jdbc_with_permissive_mode.md - Append Load with FAILFAST: lakehouse_engine_usage/data_loader/append_load_with_failfast/append_load_with_failfast.md - Batch Delta Load Init, Delta and Backfill with Merge: lakehouse_engine_usage/data_loader/batch_delta_load_init_delta_backfill_with_merge/batch_delta_load_init_delta_backfill_with_merge.md - Custom Transformer: lakehouse_engine_usage/data_loader/custom_transformer/custom_transformer.md - Custom Transformer (SQL): lakehouse_engine_usage/data_loader/custom_transformer_sql/custom_transformer_sql.md - Extract from SAP B4 ADSOs: lakehouse_engine_usage/data_loader/extract_from_sap_b4_adso/extract_from_sap_b4_adso.md - Extract from SAP BW DSOs: lakehouse_engine_usage/data_loader/extract_from_sap_bw_dso/extract_from_sap_bw_dso.md - Extract from SFTP: lakehouse_engine_usage/data_loader/extract_from_sftp/extract_from_sftp.md - Extract using JDBC connection: lakehouse_engine_usage/data_loader/extract_using_jdbc_connection/extract_using_jdbc_connection.md - Filtered Full Load: lakehouse_engine_usage/data_loader/filtered_full_load/filtered_full_load.md - Filtered Full Load with Selective Replace: 
lakehouse_engine_usage/data_loader/filtered_full_load_with_selective_replace/filtered_full_load_with_selective_replace.md - Flatten Schema and Explode Columns: lakehouse_engine_usage/data_loader/flatten_schema_and_explode_columns/flatten_schema_and_explode_columns.md - Full Load: lakehouse_engine_usage/data_loader/full_load/full_load.md - Read from Dataframe: lakehouse_engine_usage/data_loader/read_from_dataframe/read_from_dataframe.md - Read from Sharepoint: lakehouse_engine_usage/data_loader/read_from_sharepoint/read_from_sharepoint.md - Streaming Append Load with DROPMALFORMED: lakehouse_engine_usage/data_loader/streaming_append_load_with_malformed/streaming_append_load_with_malformed.md - Streaming Append Load with Optimize Dataset Terminator: lakehouse_engine_usage/data_loader/streaming_append_load_with_terminator/streaming_append_load_with_terminator.md - Streaming Delta Load with Group and Rank Condensation: lakehouse_engine_usage/data_loader/streaming_delta_load_with_group_and_rank_condensation/streaming_delta_load_with_group_and_rank_condensation.md - Streaming Delta Load with Late Arriving and Out of Order Events (with and without watermarking): lakehouse_engine_usage/data_loader/streaming_delta_with_late_arriving_and_out_of_order_events/streaming_delta_with_late_arriving_and_out_of_order_events.md - Write and Read Dataframe: lakehouse_engine_usage/data_loader/write_and_read_dataframe/write_and_read_dataframe.md - Write to Console: lakehouse_engine_usage/data_loader/write_to_console/write_to_console.md - Write to REST API: lakehouse_engine_usage/data_loader/write_to_rest_api/write_to_rest_api.md - Write to Sharepoint: lakehouse_engine_usage/data_loader/write_to_sharepoint/write_to_sharepoint.md - Data Quality: - Overview: lakehouse_engine_usage/data_quality/data_quality.md - Scenarios: - Custom Expectations: lakehouse_engine_usage/data_quality/custom_expectations/custom_expectations.md - Data Quality Validator: 
lakehouse_engine_usage/data_quality/data_quality_validator/data_quality_validator.md - Minimal Example: lakehouse_engine_usage/data_quality/minimal_example/minimal_example.md - Prisma: lakehouse_engine_usage/data_quality/prisma/prisma.md - Result Sink: lakehouse_engine_usage/data_quality/result_sink/result_sink.md - Row Tagging: lakehouse_engine_usage/data_quality/row_tagging/row_tagging.md - Validations Failing: lakehouse_engine_usage/data_quality/validations_failing/validations_failing.md - Reconciliator: - Overview: lakehouse_engine_usage/reconciliator/reconciliator.md - Sensors: - Overview: lakehouse_engine_usage/sensors/sensors.md - Sensor: - Overview: lakehouse_engine_usage/sensors/sensor/sensor.md - Supported Sources: - Delta Table: lakehouse_engine_usage/sensors/sensor/delta_table/delta_table.md - Sensor from other Sensor Delta Table: lakehouse_engine_usage/sensors/sensor/delta_upstream_sensor_table/delta_upstream_sensor_table.md - Sensor from Files: lakehouse_engine_usage/sensors/sensor/file/file.md - Sensor from JDBC: lakehouse_engine_usage/sensors/sensor/jdbc_table/jdbc_table.md - Sensor from Kafka: lakehouse_engine_usage/sensors/sensor/kafka/kafka.md - Sensor from SAP: lakehouse_engine_usage/sensors/sensor/sap_bw_b4/sap_bw_b4.md - Update Sensor control Delta Table after processing the data: lakehouse_engine_usage/sensors/sensor/update_sensor_status/update_sensor_status.md - Heartbeat Sensor: - Overview: lakehouse_engine_usage/sensors/heartbeat/heartbeat.md - Supported Sources: - Delta Table: lakehouse_engine_usage/sensors/heartbeat/delta_table/delta_table.md - Kafka: lakehouse_engine_usage/sensors/heartbeat/kafka/kafka.md - Manual Table: lakehouse_engine_usage/sensors/heartbeat/manual_table/manual_table.md - SAP BW/4HANA: lakehouse_engine_usage/sensors/heartbeat/sap_bw_b4/sap_bw_b4.md - Trigger File: lakehouse_engine_usage/sensors/heartbeat/trigger_file/trigger_file.md - Feed Heartbeat Sensor Control Delta Table: 
lakehouse_engine_usage/sensors/heartbeat/heartbeat_sensor_data_feed/heartbeat_sensor_data_feed.md - Update Heartbeat Sensor control Delta Table after processing the data: lakehouse_engine_usage/sensors/heartbeat/update_heartbeat_sensor_status/update_heartbeat_sensor_status.md - GAB: - Overview: lakehouse_engine_usage/gab/gab.md - Step-by-Step: lakehouse_engine_usage/gab/step_by_step/step_by_step.md - Tools: - Table & File Manager Helper: lakehouse_engine_usage/managerhelper/managerhelper.md - API Documentation: reference/ # (1)! theme: name: material language: en logo: assets/img/lakehouse_engine_logo.png favicon: assets/img/lakehouse_engine_logo_symbol_large.png icon: repo: fontawesome/brands/github-alt palette: - media: "(prefers-color-scheme: light)" scheme: default primary: blue accent: yellow toggle: icon: material/toggle-switch name: Switch to dark mode - media: "(prefers-color-scheme: dark), (prefers-color-scheme: no-preference)" scheme: slate primary: blue accent: yellow toggle: icon: material/toggle-switch-off name: Switch to light mode features: - content.code.annotate - content.code.annotation - content.code.copy - content.code.select - content.tabs.link - content.tooltips - navigation.indexes - navigation.path - navigation.tabs - navigation.tabs.instant - navigation.tabs.sticky - navigation.top - navigation.sections - toc.follow - toc.integrate - search.highlight - search.suggest extra: social: - icon: fontawesome/brands/github-alt link: https://adidas.github.io/lakehouse-engine version: provider: mike name: Version plugins: - search - markdown-exec - offline - section-index - mkdocstrings: enabled: !ENV [ENABLE_MKDOCSTRINGS, true] default_handler: python handlers: python: paths: [mkdocs/lakehouse_engine] options: show_source: true - macros: module_name: mkdocs_macros - gen-files: scripts: - gen_ref_nav.py - literate-nav: nav_file: SUMMARY.md - mike: alias_type: symlink canonical_version: latest extra: social: - icon: fontawesome/brands/github-alt link: 
https://adidas.github.io/lakehouse-engine markdown_extensions: - admonition - attr_list - extra - footnotes - markdown_include.include: base_path: mkdocs/docs - md_in_html - pymdownx.arithmatex: generic: true - pymdownx.details - pymdownx.emoji: emoji_index: !!python/name:materialx.emoji.twemoji emoji_generator: !!python/name:materialx.emoji.to_svg - pymdownx.highlight: anchor_linenums: true line_spans: __span pygments_lang_class: true - pymdownx.inlinehilite - pymdownx.mark - pymdownx.tabbed: alternate_style: true - pymdownx.snippets - pymdownx.superfences: custom_fences: - name: mermaid class: mermaid format: !!python/name:pymdownx.superfences.fence_code_format '' - toc: permalink: true copyright: | © 2025 adidas ================================================ FILE: cicd/code_doc/mkdocs_macros.py ================================================ """Macro methods to be used on Lakehouse Engine Docs.""" import warnings import json import pygments.formatters.html from markupsafe import Markup STACK_LEVEL = 2 def _search_files(file: dict, search_string: str) -> list: """Searches for a string and outputs the line. Search for a given string in a file and output the line where it is first found. Args: file: path of the file to be searched. search_string: string that will be searched for. Returns: The number of the first line where a given search_string appears. """ range_lines = [] with open(file) as f: for num, line in enumerate(f, 1): if search_string in line: range_lines.append(num - 1) return range_lines[0] def _link_example(method_name: str) -> str or None: """Searches for a link in a dict. Searches for the link of a given method_name, in a specific config file and outputs it. Args: method_name: name of the method to be searched for. Returns: None or the example link for the given method_name. 
""" if method_name in list(lakehouse_engine_examples.keys()): file_link = lakehouse_engine_examples[str(method_name)] return lakehouse_engine_examples["base_link"] + file_link if file_link != "" else None else: warnings.warn( "No entry provided for the following transformer: " + method_name, RuntimeWarning, STACK_LEVEL, ) return None def _get_dict_transformer(dict_to_search: dict, transformer: str) -> dict: """Searches for a transformer and returns the first dictionary occurrence. Search for a given transformer in a dictionary and return the first occurrence. Args: dict_to_search: path of the file to be searched. transformer: string that will be searched for. Returns: First dictionary where a given transformer is found. """ dict_transformer = [] for spec in dict_to_search["transform_specs"]: for transformer_dict in spec["transformers"]: if transformer_dict["function"] == transformer: dict_transformer.append(transformer_dict) return json.dumps(dict_transformer[0], indent=4) def _highlight_examples(method_name: str) -> str or None: """Creates a code snippet. Constructs and exposes the code snippet of a given method_name. Args: method_name: name of the module to be searched for. Returns: None or the code snippet wrapped in html tags. 
""" for key, item in lakehouse_engine_examples.items(): if method_name == key: file_path = f"../../{item}" if file_path == "../../": warnings.warn( "No unit testing for the following transformer: " + method_name, RuntimeWarning, STACK_LEVEL, ) return None first_line = _search_files(file_path, f'"function": "{method_name}"') with open(file_path) as json_file: acon_file = json.load(json_file) code_snippet = _get_dict_transformer(acon_file, method_name) # Defining the lexer which will parse through the snippet of code we want # to highlight lexer = pygments.lexers.JsonLexer() # Defining the format that will be outputted by the pygments library # (on our case it will output the code within html tags) formatter = pygments.formatters.html.HtmlFormatter( linenos="inline", anchorlinenos=True, ) formatter.linenostart = first_line return Markup(pygments.highlight(code_snippet, lexer, formatter)) def get_example(method_name: str) -> str: """Get example based on given argument. Args: method_name: name of the module to be searched for. Returns: A example. """ example_link = _link_example(method_name=method_name) json_example = _highlight_examples(method_name=method_name) if example_link: return ( """
\n""" f"""View Example of {method_name} (See full example here)""" f"""
{json_example}
\n""" """
""" ) else: return "" with open("./examples.json") as json_file: lakehouse_engine_examples = json.load(json_file) def format_operations_table(operations_dict: dict) -> str: """Format operations dictionary into a markdown table. Args: operations_dict: Dictionary containing operations and their parameters. Returns: A markdown formatted table with operation details. """ if not operations_dict: return "" markdown_output = "\n\n**Available Operations:**\n\n" markdown_output += "| Operation | Parameters | Type | Mandatory |\n" markdown_output += "|-----------|------------|------|----------|\n" for operation, params in sorted(operations_dict.items()): if not params: markdown_output += f"| `{operation}` | - | - | - |\n" else: first_param = True for param_name, param_info in params.items(): if first_param: markdown_output += f"| `{operation}` | `{param_name}` | {param_info.get('type', 'N/A')} | {param_info.get('mandatory', False)} |\n" first_param = False else: markdown_output += f"| | `{param_name}` | {param_info.get('type', 'N/A')} | {param_info.get('mandatory', False)} |\n" return markdown_output def get_table_manager_operations() -> str: """Get formatted table of TableManager operations. Returns: A markdown formatted table with TableManager operations. """ from lakehouse_engine.core.definitions import TABLE_MANAGER_OPERATIONS return format_operations_table(TABLE_MANAGER_OPERATIONS) def get_file_manager_operations() -> str: """Get formatted table of FileManager operations. Returns: A markdown formatted table with FileManager operations. 
""" from lakehouse_engine.core.definitions import FILE_MANAGER_OPERATIONS return format_operations_table(FILE_MANAGER_OPERATIONS) def define_env(env): "Declare environment for jinja2 templates for markdown" for fn in [get_example, get_table_manager_operations, get_file_manager_operations]: env.macro(fn) # get mkdocstrings' Python handler python_handler = env.conf["plugins"]["mkdocstrings"].get_handler("python") # get the `update_env` method of the Python handler update_env = python_handler.update_env # override the `update_env` method of the Python handler def patched_update_env(config): update_env(config) # get the `convert_markdown` filter of the env convert_markdown = python_handler.env.filters["convert_markdown"] # build a chimera made of macros+mkdocstrings def render_convert(markdown: str, *args, **kwargs): return convert_markdown(env.render(markdown), *args, **kwargs) # patch the filter python_handler.env.filters["convert_markdown"] = render_convert # patch the method python_handler.update_env = patched_update_env ================================================ FILE: cicd/code_doc/module.html.jinja2 ================================================ {# On this Jinja template we're extending a pre-existing template, copying the block on which we would like to make changes and adding both the "View Example" summary tag and the "View Full Acon" button. #} {% extends "default/module.html.jinja2" %} {% block title %}{{ module.modulename }}{% endblock %} {% block nav_submodules %} {% if module.submodules %}

Submodules

{% endif %} {% endblock %} {% block module_contents %} {% for m in module.flattened_own_members if is_public(m) | trim %}
{{ member(m) }} {% if m.type == "class" %} {% for m in m.own_members if m.type != "class" and is_public(m) | trim %}
{{ member(m) }} {% if m.fullname | highlight_examples %} {{ view_example(m.fullname) }} {% endif %} {% if m.fullname | link_example %} {{ view_full_acon(m.fullname) }} {% endif %}
{% endfor %} {% set inherited_members = inherited(m) | trim %} {% if inherited_members %}
Inherited Members
{{ inherited_members }}
{% endif %} {% endif %}
{% endfor %} {% endblock %} {% block attribution %} {% endblock %} {% block module_info %}
{% block edit_button %} {% if edit_url %} {% if "github.com" in edit_url %} {% set edit_text = "Edit on GitHub" %} {% elif "gitlab" in edit_url %} {% set edit_text = "Edit on GitLab" %} {% else %} {% set edit_text = "Edit Source" %} {% endif %} {{ edit_text }} {% endif %} {% endblock %} {% if "lakehouse_engine" == module.modulename.split(".")[0] %} {{ module_name() }} {% endif %} {{ docstring(module) }} {% if "lakehouse_engine" == module.modulename.split(".")[0] %} {{ view_source_state(module) }} {{ view_source_button(module) }} {{ view_source_code(module) }} {% endif %}
{% endblock %} {# On this macro we're creating the "View Example" structure. #} {% defaultmacro view_example(doc) %}
View Example {{ doc | highlight_examples }}
{% enddefaultmacro %} {# On this macro we're creating the "View Full Acon" structure. #} {% defaultmacro view_full_acon(doc) %}
{% set edit_text = "View Full Acon" %} {{ edit_text }}


{% enddefaultmacro %} ================================================ FILE: cicd/code_doc/render_doc.py ================================================ """Module for customizing pdoc documentation.""" import json import os import shutil import warnings from pathlib import Path import pygments.formatters.html from markupsafe import Markup from pdoc import pdoc, render STACK_LEVEL = 2 logo_path = ( "https://github.com/adidas/lakehouse-engine/blob/master/assets/img/" "lakehouse_engine_logo_no_bg_160.png?raw=true" ) def _get_project_version() -> str: version = ( os.popen( "cat cicd/.bumpversion.cfg | grep 'current_version =' | cut -f 3 -d ' '" ) .read() .replace("\n", "") ) return version def _search_files(file: dict, search_string: str) -> list: """Searches for a string and outputs the line. Search for a given string in a file and output the line where it is first found. :param file: path of the file to be searched. :param search_string: string that will be searched for. :returns: the number of the first line where a given search_string appears. """ range_lines = [] with open(file) as f: for num, line in enumerate(f, 1): if search_string in line: range_lines.append(num - 1) return range_lines[0] def _get_dict_transformer(dict_to_search: dict, transformer: str) -> dict: """Searches for a transformer and returns the first dictionary occurrence. Search for a given transformer in a dictionary and return the first occurrence. :param dict_to_search: path of the file to be searched. :param transformer: string that will be searched for. :returns: first dictionary where a given transformer is found. """ dict_transformer = [] for spec in dict_to_search["transform_specs"]: for transformer_dict in spec["transformers"]: if transformer_dict["function"] == transformer: dict_transformer.append(transformer_dict) return json.dumps(dict_transformer[0], indent=4) def _link_example(module_name: str) -> str or None: """Searches for a link in a dict. 
Searches for the link of a given module_name, in a specific config file and outputs it. :param module_name: name of the module to be searched for. :returns: None or the example link for the given module_name. """ if module_name in list(link_dict.keys()): file_link = link_dict[str(module_name)] return link_dict["base_link"] + file_link if file_link != "" else None else: return None def _highlight_examples(module_name: str) -> str or None: """Creates a code snippet. Constructs and exposes the code snippet of a given module_name. :param module_name: name of the module to be searched for. :returns: None or the code snippet wrapped in html tags. """ transformers_to_ignore = [ "UNSUPPORTED_STREAMING_TRANSFORMERS", "AVAILABLE_TRANSFORMERS", "__init__", ] if module_name.split(".")[1] == "transformers": if module_name not in list(link_dict.keys()): if module_name.split(".")[-1] not in list(transformers_to_ignore): warnings.warn( "No entry provided for the following transformer: " + module_name.split(".")[-1], RuntimeWarning, STACK_LEVEL, ) return None for key, item in link_dict.items(): if module_name == key: file_path = f"./{item}" transformer = key.split(".")[-1].lower() if file_path == "./": warnings.warn( "No unit testing for the following transformer: " + transformer, RuntimeWarning, STACK_LEVEL, ) return None first_line = _search_files(file_path, f'"function": "{transformer}"') with open(file_path) as json_file: acon_file = json.load(json_file) code_snippet = _get_dict_transformer(acon_file, transformer) # Defining the lexer which will parse through the snippet of code we want # to highlight lexer = pygments.lexers.JsonLexer() # Defining the format that will be outputted by the pygments library # (on our case it will output the code within html tags) formatter = pygments.formatters.html.HtmlFormatter( cssclass="pdoc-code codehilite", linenos="inline", anchorlinenos=True, ) formatter.linenostart = first_line return Markup(pygments.highlight(code_snippet, lexer, 
formatter)) with open("./cicd/code_doc/examples.json") as json_file: link_dict = json.load(json_file) # Adding our custom filters to jinja environment env_jinja = render.env env_jinja.filters["link_example"] = _link_example env_jinja.filters["highlight_examples"] = _highlight_examples root_path = Path(__file__).parents[2] documentation_path = root_path / "artefacts" / "docs" # Tell pdoc's render to use our jinja template render.configure( template_directory=root_path / "cicd" / "code_doc" / ".", docformat="google", logo=logo_path, favicon=logo_path, footer_text=f"Lakehouse Engine v{_get_project_version()}", mermaid=True, ) # Temporarily copy README file to be used in index.html page shutil.copyfile("README.md", root_path / "cicd" / "code_doc" / "README.md") # Render pdoc's documentation into artefacts/docs pdoc( "./lakehouse_engine/", "./lakehouse_engine_usage/", output_directory=documentation_path, ) # Copy the images used on the documentation, to the path where we have the rendered # html pages. 
shutil.copytree("./assets", documentation_path / "assets", dirs_exist_ok=True) # Remove the temporary copy README file os.remove(root_path / "cicd" / "code_doc" / "README.md") ================================================ FILE: cicd/code_doc/render_docs.py ================================================ """Module for customizing mkdocs documentation.""" # Import necessary libraries import os import shutil from pathlib import Path # Define the root directory and the necessary directories root_path = Path(__file__).parents[2] code_doc_path = root_path / "cicd" / "code_doc" mkdocs_base_path = code_doc_path / "mkdocs" mkdocs_build_path = mkdocs_base_path / "docs" documentation_path = root_path / "artefacts" / "docs" # Files and directories to be copied to build the mkdocs documentation documentation_to_copy = { "directories_to_copy": [ { "source": root_path / "lakehouse_engine_usage", "target": mkdocs_build_path / "lakehouse_engine_usage", }, { "source": root_path / "lakehouse_engine", "target": mkdocs_base_path / "lakehouse_engine" / "packages", }, { "source": "./assets", "target": mkdocs_build_path / "assets", }, ], "files_to_copy": [ { "source": "README.md", "target": mkdocs_build_path / "index.md", }, { "source": "pyproject.toml", "target": mkdocs_build_path / "pyproject.toml", }, ], } def _copy_documentation(directories: list = "", files: list = ""): """Copy files to other directory based on given parameters. Args: directories (list): list of directories to copy. files (list): list of files to copy. 
""" if directories: for directory in directories: shutil.copytree( directory.get("source"), directory.get("target"), dirs_exist_ok=True ) if files: for file in files: shutil.copyfile(file.get("source"), file.get("target")) _copy_documentation( directories=documentation_to_copy.get("directories_to_copy"), files=documentation_to_copy.get("files_to_copy"), ) # Use mkdocs build command to build the documentation into the "site" folder os.system(f"cd {code_doc_path} && mkdocs build --site-dir {documentation_path}/site") # Remove the temporary docs directory mkdocs_base_path shutil.rmtree(mkdocs_base_path) ================================================ FILE: cicd/flake8.conf ================================================ [flake8] max-line-length = 88 extend-ignore = E203 inline-quotes=double docstring-quotes=""" max-expression-complexity=11 max-cognitive-complexity=15 # there is a python module with same name as io engine module, so # we need to ignore this error per-file-ignores = lakehouse_engine/io/__init__.py:A005 ================================================ FILE: cicd/meta.yaml ================================================ dev_deploy_bucket: s3://sample-dev-bucket prod_deploy_bucket: s3://sample-prod-bucket arm_python_image: arm64v8/python:3.12-slim-bullseye amd_python_image: python:3.12-slim-bullseye engine_docs: https://adidas.github.io/lakehouse-engine-docs/lakehouse_engine.html code_url: https://github.com/adidas/lakehouse-engine ================================================ FILE: cicd/requirements.txt ================================================ # The main dependencies without which the core functionalities of the project will not work. # These dependencies are not optional and are always installed when people install the lakehouse-engine library. # # ! Do not forget running `make build-lock-files` after updating dependency list ! 
# boto3==1.40.23 Jinja2==3.1.6 pyyaml==6.0.2 pendulum==3.1.0 importlib-resources==6.5.2 ================================================ FILE: cicd/requirements_azure.txt ================================================ # Dependencies necessary for azure related features to work (ex: mail notifications using o365). # # ! Do not forget running `make build-lock-files` after updating dependency list ! # msgraph-sdk==1.40.0 aiohttp==3.13.3 # msgraph-sdk uses a version with known vulnerabilities h2==4.3.0 # msgraph-sdk uses a version with known vulnerabilities azure-core==1.38.0 nest-asyncio==1.6.0 msal==1.32.3 urllib3==2.6.3 # msal uses a version with known vulnerabilities # Fixing the version to solve known vulnerabilities requests==2.32.4 # when updating also update in all files ================================================ FILE: cicd/requirements_cicd.txt ================================================ # Dependencies necessary for the Lakehouse Engine CICD (tests, linting, deployment,...). # # ! Do not forget running `make build-lock-files` after updating dependency list ! 
# # cicd pytest==8.4.1 pytest-cov==6.2.1 isort==6.0.1 flake8==7.3.0 flake8-black==0.3.6 black==24.4.0 # fixed because flake8-black points always to the latest black flake8-builtins==3.0.0 flake8-bugbear==24.12.12 flake8-isort==6.1.2 flake8-comprehensions==3.16.0 flake8-docstrings==1.7.0 flake8-eradicate==1.5.0 flake8-quotes==3.4.0 flake8-mutable==1.2.0 flake8-cognitive-complexity==0.1.0 flake8-expression-complexity==0.0.11 mypy==1.17.1 bandit==1.8.6 bump2version==1.0.1 lxml==6.0.0 pytest-sftpserver==1.3.0 pip-tools==7.5.0 pip-audit==2.10.0 cachecontrol==0.14.4 filelock==3.20.3 build==1.3.0 aiosmtpd==1.4.6 # docs distlib==0.3.6 ghp-import==2.1.0 griffe==1.15.0 Markdown==3.10 markdown-callouts==0.4.0 markdown-exec==1.12.1 markdown-include==0.8.1 mergedeep==1.3.4 mike==2.0.0 mkdocs==1.6.1 mkdocs-autorefs==1.4.3 mkdocs-material==9.7.1 mkdocs-material-extensions==1.3.1 mkdocstrings-crystal==0.3.9 mkdocs-macros-plugin==1.5.0 mkdocstrings-python==2.0.1 mkdocstrings[python]==1.0.0 mkdocs-gen-files==0.6.0 mkdocs-section-index==0.3.10 mkdocs-literate-nav==0.6.2 pymdown-extensions==10.20 pyyaml_env_tag==0.1 regex==2023.6.3 watchdog==3.0.0 # Fixing the version to solve known vulnerabilities requests==2.32.4 # when updating also update in all files # types types-boto3==1.40.23 types-paramiko==2.12.0 types-requests<2.31.0.7 # test moto==4.2.14 Werkzeug==3.1.6 # deploy to pypi twine==5.1.1 ================================================ FILE: cicd/requirements_dq.txt ================================================ # Dependencies necessary for the Data Quality features to work. # # ! Do not forget running `make build-lock-files` after updating dependency list ! # great-expectations==1.11.0 marshmallow==3.26.2 # Note: Numpy is not a direct dependency. # It is included temporarily to prevent version conflicts. 
#numpy==1.26.4 #dbr17 uses 2.1.3 # Fixing the version to solve known vulnerabilities requests==2.32.4 # when updating also update in all files dbr17 uses 2.32.3 ================================================ FILE: cicd/requirements_os.txt ================================================ # Special requirements from which the project depends on, but for which some use cases might use environments with # these dependencies pre-installed from the vendors. Thus, they are delivered as optional OS dependencies. # # ! Do not forget running `make build-lock-files` after updating dependency list ! # pyspark==4.0.0 delta-spark==4.0.0 ================================================ FILE: cicd/requirements_sftp.txt ================================================ # # ! Do not forget running `make build-lock-files` after updating dependency list ! # paramiko==4.0.0 pynacl==1.6.2 ================================================ FILE: cicd/requirements_sharepoint.txt ================================================ # # ! Do not forget running `make build-lock-files` after updating dependency list ! 
class Algorithm(Executable):
    """Class to define the behavior of every algorithm based on ACONs."""

    def __init__(self, acon: dict):
        """Construct Algorithm instances.

        Args:
            acon: algorithm configuration.
        """
        self.acon = acon

    @classmethod
    def get_dq_spec(
        cls, spec: dict
    ) -> Tuple[DQSpec, List[DQFunctionSpec], List[DQFunctionSpec]]:
        """Get data quality specification object from acon.

        The returned DQSpec is created with an empty `dq_functions` list; the
        parsed function specs are returned separately so that the caller
        decides where to attach them.

        Args:
            spec: data quality specifications. The keys `spec_id`, `input_id`
                and `dq_type` are mandatory; every other key falls back to a
                default.

        Returns:
            A tuple with the DQSpec, the list of DQ function specs (built from
            the `dq_functions` key) and the list of critical DQ function specs
            (built from the `critical_functions` key).

        Raises:
            KeyError: if one of the mandatory keys is missing from `spec`.
        """
        dq_spec = DQSpec(
            spec_id=spec["spec_id"],
            input_id=spec["input_id"],
            dq_type=spec["dq_type"],
            dq_functions=[],
            dq_db_table=spec.get("dq_db_table"),
            dq_table_table_filter=spec.get("dq_table_table_filter"),
            dq_table_extra_filters=spec.get(
                "dq_table_extra_filters", DQSpec.dq_table_extra_filters
            ),
            execution_point=spec.get("execution_point"),
            unexpected_rows_pk=spec.get(
                "unexpected_rows_pk", DQSpec.unexpected_rows_pk
            ),
            gx_result_format=spec.get("gx_result_format", DQSpec.gx_result_format),
            tbl_to_derive_pk=spec.get("tbl_to_derive_pk", DQSpec.tbl_to_derive_pk),
            tag_source_data=spec.get("tag_source_data", DQSpec.tag_source_data),
            data_asset_name=spec.get("data_asset_name", DQSpec.data_asset_name),
            expectation_suite_name=spec.get(
                "expectation_suite_name", DQSpec.expectation_suite_name
            ),
            store_backend=spec.get("store_backend", DQDefaults.STORE_BACKEND.value),
            local_fs_root_dir=spec.get("local_fs_root_dir", DQSpec.local_fs_root_dir),
            bucket=spec.get("bucket", DQSpec.bucket),
            checkpoint_store_prefix=spec.get(
                "checkpoint_store_prefix", DQDefaults.CHECKPOINT_STORE_PREFIX.value
            ),
            expectations_store_prefix=spec.get(
                "expectations_store_prefix",
                DQDefaults.EXPECTATIONS_STORE_PREFIX.value,
            ),
            validations_store_prefix=spec.get(
                "validations_store_prefix",
                DQDefaults.VALIDATIONS_STORE_PREFIX.value,
            ),
            result_sink_db_table=spec.get(
                "result_sink_db_table", DQSpec.result_sink_db_table
            ),
            result_sink_location=spec.get(
                "result_sink_location", DQSpec.result_sink_location
            ),
            processed_keys_location=spec.get(
                "processed_keys_location", DQSpec.processed_keys_location
            ),
            result_sink_partitions=spec.get(
                "result_sink_partitions", DQSpec.result_sink_partitions
            ),
            result_sink_chunk_size=spec.get(
                "result_sink_chunk_size", DQSpec.result_sink_chunk_size
            ),
            result_sink_format=spec.get(
                "result_sink_format", OutputFormat.DELTAFILES.value
            ),
            result_sink_options=spec.get(
                "result_sink_options", DQSpec.result_sink_options
            ),
            result_sink_explode=spec.get(
                "result_sink_explode", DQSpec.result_sink_explode
            ),
            result_sink_extra_columns=spec.get("result_sink_extra_columns", []),
            # when no explicit source is given, the input id doubles as source.
            source=spec.get("source", spec["input_id"]),
            fail_on_error=spec.get("fail_on_error", DQSpec.fail_on_error),
            cache_df=spec.get("cache_df", DQSpec.cache_df),
            critical_functions=spec.get(
                "critical_functions", DQSpec.critical_functions
            ),
            max_percentage_failure=spec.get(
                "max_percentage_failure", DQSpec.max_percentage_failure
            ),
            enable_row_condition=spec.get(
                "enable_row_condition", DQSpec.enable_row_condition
            ),
        )

        dq_functions = cls._get_dq_functions(spec, "dq_functions")
        critical_functions = cls._get_dq_functions(spec, "critical_functions")

        cls._validate_dq_tag_strategy(dq_spec)

        return dq_spec, dq_functions, critical_functions

    @staticmethod
    def _get_dq_functions(spec: dict, function_key: str) -> List[DQFunctionSpec]:
        """Get DQ Functions from a DQ Spec, based on a function_key.

        Args:
            spec: data quality specifications.
            function_key: dq function key ("dq_functions" or
                "critical_functions").

        Returns:
            a list of DQ Function Specs; empty when the key is missing or its
            value is empty/None.
        """
        # `or []` covers both a missing key and an explicit None/empty value
        # with a single lookup.
        return [
            DQFunctionSpec(function=f["function"], args=f.get("args", {}))
            for f in spec.get(function_key) or []
        ]

    @staticmethod
    def _validate_dq_tag_strategy(spec: DQSpec) -> None:
        """Validate DQ Spec arguments related with the data tagging strategy.

        The spec is adjusted in place:
            - when `tag_source_data` is set, `gx_result_format` and
              `result_sink_explode` are reset to the DQSpec class defaults and
              `fail_on_error` is forced to False.
            - otherwise, when `gx_result_format` differs from the DQSpec class
              default, `tag_source_data` is set to False.

        Args:
            spec: data quality specifications.
        """
        if spec.tag_source_data:
            spec.gx_result_format = DQSpec.gx_result_format
            spec.fail_on_error = False
            spec.result_sink_explode = DQSpec.result_sink_explode
        elif spec.gx_result_format != DQSpec.gx_result_format:
            spec.tag_source_data = False
As the algorithm base class of the lakehouse-engine framework is based on the concept of ACON, this DataLoader algorithm simply inherits from Algorithm, without overriding anything. We designed the codebase like this to avoid instantiating the Algorithm class directly, which was always meant to be an abstraction for any specific algorithm included in the lakehouse-engine framework. """ def __init__(self, acon: dict): """Construct DataLoader algorithm instances. A data loader needs several specifications to work properly, but some of them might be optional. The available specifications are: - input specifications (mandatory): specify how to read data. - transform specifications (optional): specify how to transform data. - data quality specifications (optional): specify how to execute the data quality process. - output specifications (mandatory): specify how to write data to the target. - terminate specifications (optional): specify what to do after writing into the target (e.g., optimizing target table, vacuum, compute stats, etc). Args: acon: algorithm configuration. """ self._logger: Logger = LoggingHandler(self.__class__.__name__).get_logger() super().__init__(acon) self.input_specs: List[InputSpec] = self._get_input_specs() # the streaming transformers plan is needed to future change the # execution specification to accommodate streaming mode limitations in invoking # certain functions (e.g., sort, window, generate row ids/auto increments, ...). self._streaming_micro_batch_transformers_plan: dict = {} self.transform_specs: List[TransformSpec] = self._get_transform_specs() # our data quality process is not compatible with streaming mode, hence we # have to run it in micro batches, similar to what happens to certain # transformation functions not supported in streaming mode. 
def transform(self, data: OrderedDict) -> OrderedDict:
    """Apply the configured transform specifications to the read dataframes.

    Without any transform specification the step is a no-op and the very same
    dict that was received is returned.

    A transform specification may use the result of a previous one as its
    input. Keep in mind that, for streaming sources that need micro batch
    processing, such a result cannot feed another transformation: micro batch
    processing in pyspark streaming is only available in .write(), so a
    transformation relying on it has to be the last one of the process.

    Args:
        data: input dataframes in an ordered dict.

    Returns:
        The input dict itself when there is nothing to transform; otherwise a
        new ordered dict with the input entries plus one entry per transform
        specification, keyed by its spec_id.
    """
    if not self.transform_specs:
        return data

    results = OrderedDict(data)
    for spec in self.transform_specs:
        self._logger.info(f"Found transform specification: {spec}")
        current_df = results[spec.input_id]
        # chain every transformer of the spec on top of the previous result.
        for transformer_spec in spec.transformers:
            current_df = current_df.transform(
                TransformerFactory.get_transformer(transformer_spec, results)
            )
        results[spec.spec_id] = current_df
    return results
""" if not self.dq_specs: return data, None dq_processed_dfs, error = self._verify_dq_rule_id_uniqueness( data, self.dq_specs ) if error: return dq_processed_dfs, error else: from lakehouse_engine.dq_processors.dq_factory import DQFactory dq_processed_dfs = OrderedDict(data) for spec in self.dq_specs: df_processed_df = dq_processed_dfs[spec.input_id] self._logger.info(f"Found data quality specification: {spec}") if ( spec.dq_type == DQType.PRISMA.value or spec.dq_functions ) and spec.spec_id not in self._streaming_micro_batch_dq_plan: if spec.cache_df: df_processed_df.cache() dq_processed_dfs[spec.spec_id] = DQFactory.run_dq_process( spec, df_processed_df ) else: dq_processed_dfs[spec.spec_id] = df_processed_df return dq_processed_dfs, None def write(self, data: OrderedDict) -> OrderedDict: """Write the data that was read and transformed (if applicable). It supports writing multiple datasets. However, we only recommend to write one dataframe. This recommendation is based on easy debugging and reproducibility, since if we start mixing several datasets being fueled by the same algorithm, it would unleash an infinite sea of reproducibility issues plus tight coupling and dependencies between datasets. Having said that, there may be cases where writing multiple datasets is desirable according to the use case requirements. Use it accordingly. Args: data: dataframes that were read and transformed (if applicable). Returns: Dataframes that were written. """ written_dfs: OrderedDict = OrderedDict({}) for spec in self.output_specs: self._logger.info(f"Found output specification: {spec}") written_output = WriterFactory.get_writer( spec, data[spec.input_id], data ).write() if written_output: written_dfs.update(written_output) else: written_dfs[spec.spec_id] = data[spec.input_id] return written_dfs def terminate(self, data: OrderedDict) -> None: """Terminate the algorithm. Args: data: dataframes that were written. 
""" if self.terminate_specs: for spec in self.terminate_specs: self._logger.info(f"Found terminate specification: {spec}") TerminatorFactory.execute_terminator( spec, data[spec.input_id] if spec.input_id else None ) def execute(self) -> Optional[OrderedDict]: """Define the algorithm execution behaviour.""" try: self._logger.info("Starting read stage...") read_dfs = self.read() self._logger.info("Starting transform stage...") transformed_dfs = self.transform(read_dfs) self._logger.info("Starting data quality stage...") validated_dfs, errors = self.process_dq(transformed_dfs) self._logger.info("Starting write stage...") written_dfs = self.write(validated_dfs) self._logger.info("Starting terminate stage...") self.terminate(written_dfs) self._logger.info("Execution of the algorithm has finished!") except Exception as e: NotifierFactory.generate_failure_notification(self.terminate_specs, e) raise e if errors: raise DQDuplicateRuleIdException( "Data Written Successfully, but DQ Process Encountered an Issue.\n" "We detected a duplicate dq_rule_id in the dq_spec definition. " "As a result, none of the Data Quality (DQ) processes (dq_spec) " "were executed.\n" "Please review and verify the following dq_rules:\n" f"{errors}" ) return written_dfs def _get_input_specs(self) -> List[InputSpec]: """Get the input specifications from an acon. Returns: List of input specifications. """ return [InputSpec(**spec) for spec in self.acon["input_specs"]] def _get_transform_specs(self) -> List[TransformSpec]: """Get the transformation specifications from an acon. If we are executing the algorithm in streaming mode and if the transformer function is not supported in streaming mode, it is important to note that ONLY those unsupported operations will go into the streaming_micro_batch_transformers (see if in the function code), in the same order that they appear in the list of transformations. 
This means that other supported transformations that appear after an unsupported one continue to stay one the normal execution plan, i.e., outside the foreachBatch function. Therefore, this may make your algorithm to execute a different logic than the one you originally intended. For this reason: 1) ALWAYS PLACE UNSUPPORTED STREAMING TRANSFORMATIONS AT LAST; 2) USE force_streaming_foreach_batch_processing option in transform_spec section. 3) USE THE CUSTOM_TRANSFORMATION AND WRITE ALL YOUR TRANSFORMATION LOGIC THERE. Check list of unsupported spark streaming operations here: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#unsupported-operations Returns: List of transformation specifications. """ input_read_types = self._get_input_read_types(self.acon["input_specs"]) transform_input_ids = self._get_transform_input_ids( self.acon.get("transform_specs", []) ) prev_spec_read_types = self._get_previous_spec_read_types( input_read_types, transform_input_ids ) transform_specs = [] for spec in self.acon.get("transform_specs", []): transform_spec = TransformSpec( spec_id=spec["spec_id"], input_id=spec["input_id"], transformers=[], force_streaming_foreach_batch_processing=spec.get( "force_streaming_foreach_batch_processing", False ), ) for s in spec["transformers"]: transformer_spec = TransformerSpec( function=s["function"], args=s.get("args", {}) ) if ( prev_spec_read_types[transform_spec.input_id] == ReadType.STREAMING.value and s["function"] in TransformerFactory.UNSUPPORTED_STREAMING_TRANSFORMERS ) or ( prev_spec_read_types[transform_spec.input_id] == ReadType.STREAMING.value and transform_spec.force_streaming_foreach_batch_processing ): self._move_to_streaming_micro_batch_transformers( transform_spec, transformer_spec ) else: transform_spec.transformers.append(transformer_spec) transform_specs.append(transform_spec) return transform_specs def _get_dq_specs(self) -> List[DQSpec]: """Get list of data quality specification objects from 
acon. In streaming mode, we automatically convert the data quality specification in the streaming_micro_batch_dq_processors list for the respective output spec. This is needed because our dq process cannot be executed using native streaming functions. Returns: List of data quality spec objects. """ input_read_types = self._get_input_read_types(self.acon["input_specs"]) transform_input_ids = self._get_transform_input_ids( self.acon.get("transform_specs", []) ) prev_spec_read_types = self._get_previous_spec_read_types( input_read_types, transform_input_ids ) dq_specs = [] for spec in self.acon.get("dq_specs", []): dq_spec, dq_functions, critical_functions = Algorithm.get_dq_spec(spec) if prev_spec_read_types[dq_spec.input_id] == ReadType.STREAMING.value: # we need to use deepcopy to explicitly create a copy of the dict # otherwise python only create binding for dicts, and we would be # modifying the original dict, which we don't want to. self._move_to_streaming_micro_batch_dq_processors( deepcopy(dq_spec), dq_functions, critical_functions ) else: dq_spec.dq_functions = dq_functions dq_spec.critical_functions = critical_functions self._logger.info( f"Streaming Micro Batch DQ Plan: " f"{str(self._streaming_micro_batch_dq_plan)}" ) dq_specs.append(dq_spec) return dq_specs def _get_output_specs(self) -> List[OutputSpec]: """Get the output specifications from an acon. Returns: List of output specifications. 
""" return [ OutputSpec( spec_id=spec["spec_id"], input_id=spec["input_id"], write_type=spec.get("write_type", None), data_format=spec.get("data_format", OutputFormat.DELTAFILES.value), db_table=spec.get("db_table", None), location=spec.get("location", None), merge_opts=( MergeOptions(**spec["merge_opts"]) if spec.get("merge_opts") else None ), sharepoint_opts=( SharepointOptions(**spec["sharepoint_opts"]) if spec.get("sharepoint_opts") else None ), partitions=spec.get("partitions", []), streaming_micro_batch_transformers=self._get_streaming_transformer_plan( spec["input_id"], self.dq_specs ), streaming_once=spec.get("streaming_once", None), streaming_processing_time=spec.get("streaming_processing_time", None), streaming_available_now=spec.get( "streaming_available_now", ( False if ( spec.get("streaming_once", None) or spec.get("streaming_processing_time", None) or spec.get("streaming_continuous", None) ) else True ), ), streaming_continuous=spec.get("streaming_continuous", None), streaming_await_termination=spec.get( "streaming_await_termination", True ), streaming_await_termination_timeout=spec.get( "streaming_await_termination_timeout", None ), with_batch_id=spec.get("with_batch_id", False), options=spec.get("options", None), streaming_micro_batch_dq_processors=( self._streaming_micro_batch_dq_plan.get(spec["input_id"], []) ), ) for spec in self.acon["output_specs"] ] def _get_streaming_transformer_plan( self, input_id: str, dq_specs: Optional[List[DQSpec]] ) -> List[TransformerSpec]: """Gets the plan for transformations to be applied on streaming micro batches. When running both DQ processes and transformations in streaming micro batches, the _streaming_micro_batch_transformers_plan to consider is the one associated with the transformer spec_id and not with the dq spec_id. Thus, on those cases, this method maps the input id of the output_spec (which is the spec_id of a dq_spec) with the dependent transformer spec_id. 
Args: input_id: id of the corresponding input specification. dq_specs: data quality specifications. Returns: a list of TransformerSpec, representing the transformations plan. """ transformer_id = ( [dq_spec.input_id for dq_spec in dq_specs if dq_spec.spec_id == input_id][0] if self._streaming_micro_batch_dq_plan.get(input_id) and self._streaming_micro_batch_transformers_plan else input_id ) streaming_micro_batch_transformers_plan: list[TransformerSpec] = ( self._streaming_micro_batch_transformers_plan.get(transformer_id, []) ) return streaming_micro_batch_transformers_plan def _get_terminate_specs(self) -> List[TerminatorSpec]: """Get the terminate specifications from an acon. Returns: List of terminate specifications. """ return [TerminatorSpec(**spec) for spec in self.acon.get("terminate_specs", [])] def _move_to_streaming_micro_batch_transformers( self, transform_spec: TransformSpec, transformer_spec: TransformerSpec ) -> None: """Move the transformer to the list of streaming micro batch transformations. If the transform specs contain functions that cannot be executed in streaming mode, this function sends those functions to the output specs streaming_micro_batch_transformers, where they will be executed inside the stream foreachBatch function. To accomplish that we use an instance variable that associates the streaming_micro_batch_transformers to each output spec, in order to do reverse lookup when creating the OutputSpec. Args: transform_spec: transform specification (overall transformation specification - a transformation may contain multiple transformers). transformer_spec: the specific transformer function and arguments. 
""" if transform_spec.spec_id not in self._streaming_micro_batch_transformers_plan: self._streaming_micro_batch_transformers_plan[transform_spec.spec_id] = [] self._streaming_micro_batch_transformers_plan[transform_spec.spec_id].append( transformer_spec ) def _move_to_streaming_micro_batch_dq_processors( self, dq_spec: DQSpec, dq_functions: List[DQFunctionSpec], critical_functions: List[DQFunctionSpec], ) -> None: """Move the dq function to the list of streaming micro batch transformations. If the dq specs contain functions that cannot be executed in streaming mode, this function sends those functions to the output specs streaming_micro_batch_dq_processors, where they will be executed inside the stream foreachBatch function. To accomplish that we use an instance variable that associates the streaming_micro_batch_dq_processors to each output spec, in order to do reverse lookup when creating the OutputSpec. Args: dq_spec: dq specification (overall dq process specification). dq_functions: the list of dq functions to be considered. critical_functions: list of critical functions to be considered. """ if dq_spec.spec_id not in self._streaming_micro_batch_dq_plan: self._streaming_micro_batch_dq_plan[dq_spec.spec_id] = [] dq_spec.dq_functions = dq_functions dq_spec.critical_functions = critical_functions self._streaming_micro_batch_dq_plan[dq_spec.spec_id].append(dq_spec) @staticmethod def _get_input_read_types(list_of_specs: List) -> dict: """Get a dict of spec ids and read types from a list of input specs. Args: list_of_specs: list of input specs ([{k:v}]). Returns: Dict of {input_spec_id: read_type}. """ return {item["spec_id"]: item["read_type"] for item in list_of_specs} @staticmethod def _get_transform_input_ids(list_of_specs: List) -> dict: """Get a dict of transform spec ids and input ids from list of transform specs. Args: list_of_specs: list of transform specs ([{k:v}]). Returns: Dict of {transform_spec_id: input_id}. 
""" return {item["spec_id"]: item["input_id"] for item in list_of_specs} @staticmethod def _get_previous_spec_read_types( input_read_types: dict, transform_input_ids: dict ) -> dict: """Get the read types of the previous specification: input and/or transform. For the chaining transformations and for DQ process to work seamlessly in batch and streaming mode, we have to figure out if the previous spec to the transform or dq spec(e.g., input spec or transform spec) refers to a batch read type or a streaming read type. Args: input_read_types: dict of {input_spec_id: read_type}. transform_input_ids: dict of {transform_spec_id: input_id}. Returns: Dict of {input_spec_id or transform_spec_id: read_type} """ combined_read_types = input_read_types for spec_id, input_id in transform_input_ids.items(): combined_read_types[spec_id] = combined_read_types[input_id] return combined_read_types @staticmethod def _verify_dq_rule_id_uniqueness( data: OrderedDict, dq_specs: list[DQSpec] ) -> tuple[OrderedDict, dict[str, str]]: """Verify the uniqueness of dq_rule_id. Verify the existence of duplicate dq_rule_id values and prepare the DataFrame for the next stage. Args: data: dataframes. dq_specs: a list of DQSpec to be validated. Returns: processed df and error if existed. 
""" error_dict = PrismaUtils.validate_rule_id_duplication(dq_specs) dq_processed_dfs = OrderedDict(data) for spec in dq_specs: df_processed_df = dq_processed_dfs[spec.input_id] dq_processed_dfs[spec.spec_id] = df_processed_df return dq_processed_dfs, error_dict ================================================ FILE: lakehouse_engine/algorithms/dq_validator.py ================================================ """Module to define Data Validator class.""" from delta.tables import DeltaTable from pyspark.sql import DataFrame from pyspark.sql.utils import StreamingQueryException from lakehouse_engine.algorithms.algorithm import Algorithm from lakehouse_engine.core.definitions import DQSpec, DQValidatorSpec, InputSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.dq_processors.dq_factory import DQFactory from lakehouse_engine.dq_processors.exceptions import ( DQDuplicateRuleIdException, DQValidationsFailedException, ) from lakehouse_engine.io.reader_factory import ReaderFactory from lakehouse_engine.utils.dq_utils import PrismaUtils from lakehouse_engine.utils.logging_handler import LoggingHandler class DQValidator(Algorithm): """Validate data using an algorithm configuration (ACON represented as dict). This algorithm focuses on isolate Data Quality Validations from loading, applying a set of data quality functions to a specific input dataset, without the need to define any output specification. You can use any input specification compatible with the lakehouse engine (dataframe, table, files, etc). """ _LOGGER = LoggingHandler(__name__).get_logger() def __init__(self, acon: dict): """Construct DQValidator algorithm instances. A data quality validator needs the following specifications to work properly: - input specification (mandatory): specify how and what data to read. - data quality specification (mandatory): specify how to execute the data quality process. 
- restore_prev_version (optional): specify if, having delta table/files as input, they should be restored to the previous version if the data quality process fails. Note: this is only considered if fail_on_error is kept as True. Args: acon: algorithm configuration. """ self.spec: DQValidatorSpec = DQValidatorSpec( input_spec=InputSpec(**acon["input_spec"]), dq_spec=self._get_dq_spec(acon["dq_spec"]), restore_prev_version=acon.get("restore_prev_version", None), ) def read(self) -> DataFrame: """Read data from an input location into a distributed dataframe. Returns: Dataframe with data that was read. """ current_df = ReaderFactory.get_data(self.spec.input_spec) return current_df def process_dq(self, data: DataFrame) -> DataFrame: """Process the data quality tasks for the data that was read. It supports a single input dataframe. It is possible to use data quality validators/expectations that will validate your data and fail the process in case the expectations are not met. The DQ process also generates and keeps updating a site containing the results of the expectations that were done on your data. The location of the site is configurable and can either be on file system or S3. If you define it to be stored on S3, you can even configure your S3 bucket to serve the site so that people can easily check the quality of your data. Moreover, it is also possible to store the result of the DQ process into a defined result sink. Args: data: input dataframe on which to run the DQ process. Returns: Validated dataframe. 
""" return DQFactory.run_dq_process(self.spec.dq_spec, data) def execute(self) -> None: """Define the algorithm execution behaviour.""" self._LOGGER.info("Starting read stage...") read_df = self.read() self._LOGGER.info("Starting data quality validator...") self._LOGGER.info("Validating DQ definitions") error_dict = PrismaUtils.validate_rule_id_duplication(specs=[self.spec.dq_spec]) if error_dict: raise DQDuplicateRuleIdException( "Duplicate dq_rule_id detected in dq_spec definition.\n" "We have identified one or more duplicate dq_rule_id " "entries in the dq_spec definition. " "Please review and verify the following dq_rules:\n" f"{error_dict}" ) try: if read_df.isStreaming: # To handle streaming, and although we are not interested in # writing any data, we still need to start the streaming and # execute the data quality process in micro batches of data. def write_dq_validator_micro_batch( batch_df: DataFrame, batch_id: int ) -> None: ExecEnv.get_for_each_batch_session(batch_df) self.process_dq(batch_df) read_df.writeStream.trigger(once=True).foreachBatch( write_dq_validator_micro_batch ).start().awaitTermination() else: self.process_dq(read_df) except (DQValidationsFailedException, StreamingQueryException): if not self.spec.input_spec.df_name and self.spec.restore_prev_version: self._LOGGER.info("Restoring delta table/files to previous version...") self._restore_prev_version() raise DQValidationsFailedException( "Data Quality Validations Failed! The delta " "table/files were restored to the previous version!" ) elif self.spec.dq_spec.fail_on_error: raise DQValidationsFailedException("Data Quality Validations Failed!") else: self._LOGGER.info("Execution of the algorithm has finished!") @staticmethod def _get_dq_spec(input_dq_spec: dict) -> DQSpec: """Get data quality specification from acon. Args: input_dq_spec: data quality specification. Returns: Data quality spec. 
""" dq_spec, dq_functions, critical_functions = Algorithm.get_dq_spec(input_dq_spec) dq_spec.dq_functions = dq_functions dq_spec.critical_functions = critical_functions return dq_spec def _restore_prev_version(self) -> None: """Restore delta table or delta files to previous version.""" if self.spec.input_spec.db_table: delta_table = DeltaTable.forName( ExecEnv.SESSION, self.spec.input_spec.db_table ) else: delta_table = DeltaTable.forPath( ExecEnv.SESSION, self.spec.input_spec.location ) previous_version = ( delta_table.history().agg({"version": "max"}).collect()[0][0] - 1 ) delta_table.restoreToVersion(previous_version) ================================================ FILE: lakehouse_engine/algorithms/exceptions.py ================================================ """Package defining all the algorithm custom exceptions.""" class ReconciliationFailedException(Exception): """Exception for when the reconciliation process fails.""" pass class NoNewDataException(Exception): """Exception for when no new data is available.""" pass class SensorAlreadyExistsException(Exception): """Exception for when a sensor with same sensor id already exists.""" pass class RestoreTypeNotFoundException(Exception): """Exception for when the restore type is not found.""" pass ================================================ FILE: lakehouse_engine/algorithms/gab.py ================================================ """Module to define Gold Asset Builder algorithm behavior.""" import copy from datetime import datetime, timedelta import pendulum from jinja2 import Template from pyspark import Row from pyspark.sql import DataFrame from pyspark.sql.functions import lit from lakehouse_engine.algorithms.algorithm import Algorithm from lakehouse_engine.core.definitions import ( GABCadence, GABCombinedConfiguration, GABDefaults, GABKeys, GABReplaceableKeys, GABSpec, GABStartOfWeek, ) from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.core.gab_manager import GABCadenceManager, 
def execute(self) -> None:
    """Execute the Gold Asset Builder.

    Reads the lookup (query builder) table, registers the calendar table as
    the `df_cal` temp view, keeps only the lookup rows matching the configured
    query labels and queues whose `is_active` differs from "N", and processes
    each remaining use case for the configured cadences.
    """
    self._LOGGER.info(f"Reading {self.spec.lookup_table} as lkp_query_builder")
    lookup_query_builder_df = ExecEnv.SESSION.read.table(self.spec.lookup_table)
    ExecEnv.SESSION.read.table(self.spec.calendar_table).createOrReplaceTempView(
        "df_cal"
    )
    self._LOGGER.info(f"Generating calendar from {self.spec.calendar_table}")

    query_label = self.spec.query_label_filter
    queue = self.spec.queue_filter
    cadence = self.spec.cadence_filter

    self._LOGGER.info(f"Query Label Filter {query_label}")
    self._LOGGER.info(f"Queue Filter {queue}")
    self._LOGGER.info(f"Cadence Filter {cadence}")

    gab_path = self.spec.gab_base_path
    self._LOGGER.info(f"Gab Base Path {gab_path}")

    lookup_query_builder_df = lookup_query_builder_df.filter(
        (
            (lookup_query_builder_df.query_label.isin(query_label))
            & (lookup_query_builder_df.queue.isin(queue))
            & (lookup_query_builder_df.is_active != lit("N"))
        )
    )

    # Caching is best effort: a failure here must not stop the run.
    cached = True
    try:
        lookup_query_builder_df.cache()
    except Exception as e:
        cached = False
        self._LOGGER.warning(
            "Could not cache lookup_query_builder_df dataframe. "
            f"Continuing without caching. Exception: {e}"
        )

    try:
        for use_case in lookup_query_builder_df.collect():
            self._process_use_case(
                use_case=use_case,
                lookup_query_builder=lookup_query_builder_df,
                selected_cadences=cadence,
                gab_path=gab_path,
            )
    finally:
        # Release the cached lookup data even when a use case fails, so the
        # dataframe does not stay cached after an aborted run.
        if cached:
            lookup_query_builder_df.unpersist()
@classmethod
def _set_use_case_stage_template_file(
    cls, stages: dict, gab_path: str, use_case: Row
) -> None:
    """Set templated file for each stage.

    Stages are looked up by the string keys "1".."n". For each one the SQL
    file at `gab_path + stage["file_path"]` is read, the literal
    `replace_offset_value` is replaced by the use case timezone offset and the
    result is stored in the stage under `templated_file`; the resolved path is
    stored under `full_file_path`. The `stages` dict is updated in place.

    Args:
        stages: use case stages with their configuration.
        gab_path: gab base path used to get the use case stages SQL files.
        use_case: gab use case to process.
    """
    cls._LOGGER.info("Reading templated file for each stage...")

    for i in range(1, len(stages) + 1):
        stage = stages[str(i)]
        full_path = gab_path + stage["file_path"]
        cls._LOGGER.info(f"Stage file path is: {full_path}")

        # Use a context manager so the file handle is closed deterministically.
        with open(full_path, "r") as stage_file:
            file_read = stage_file.read()

        stage["templated_file"] = file_read.replace(
            "replace_offset_value", str(use_case["timezone_offset"])
        )
        stage["full_file_path"] = full_path
""" selected_reconciliation_window = {} selected_cadence = reconciliation.get(cadence) self._LOGGER.info(f"Processing cadence: {cadence}") self._LOGGER.info(f"Reconciliation Window - {selected_cadence}") if selected_cadence: selected_reconciliation_window = selected_cadence.get("recon_window") self._LOGGER.info(f"{cadence}: {self.spec.start_date} - {self.spec.end_date}") start_of_week = use_case["start_of_the_week"] self._set_week_configuration_by_uc_start_of_week(start_of_week) cadence_configuration_at_end_date = ( GABUtils.get_cadence_configuration_at_end_date(self.spec.end_date) ) reconciliation_cadences = GABUtils().get_reconciliation_cadences( cadence=cadence, selected_reconciliation_window=selected_reconciliation_window, cadence_configuration_at_end_date=cadence_configuration_at_end_date, rerun_flag=self.spec.rerun_flag, ) start_date_str = GABUtils.format_datetime_to_default(self.spec.start_date) end_date_str = GABUtils.format_datetime_to_default(self.spec.end_date) for reconciliation_cadence, snapshot_flag in reconciliation_cadences.items(): self._process_reconciliation_cadence( reconciliation_cadence=reconciliation_cadence, snapshot_flag=snapshot_flag, cadence=cadence, start_date_str=start_date_str, end_date_str=end_date_str, use_case=use_case, lookup_query_builder=lookup_query_builder, stages=stages, ) return (cadence in reconciliation.keys()) or ( reconciliation_cadences is not None ) def _process_reconciliation_cadence( self, reconciliation_cadence: str, snapshot_flag: str, cadence: str, start_date_str: str, end_date_str: str, use_case: Row, lookup_query_builder: DataFrame, stages: dict, ) -> None: """Process use case reconciliation window. Reconcile the pre-aggregated data to cover the late events. Args: reconciliation_cadence: reconciliation to process. snapshot_flag: flag indicating if for this cadence the snapshot is enabled. cadence: cadence to process. start_date_str: start date of the period to process. 
end_date_str: end date of the period to process. use_case: gab use case to process. lookup_query_builder: gab configuration data. stages: use case stages with their configuration. Example: Cadence: week; Reconciliation: monthly; This means every weekend previous week aggregations will be calculated and on month end we will reconcile the numbers calculated for last 4 weeks to readjust the number for late events. """ ( window_start_date, window_end_date, filter_start_date, filter_end_date, ) = GABCadenceManager().extended_window_calculator( cadence, reconciliation_cadence, self.spec.current_date, start_date_str, end_date_str, use_case["query_type"], self.spec.rerun_flag, snapshot_flag, ) if use_case["timezone_offset"]: filter_start_date = filter_start_date + timedelta( hours=use_case["timezone_offset"] ) filter_end_date = filter_end_date + timedelta( hours=use_case["timezone_offset"] ) filter_start_date_str = GABUtils.format_datetime_to_default(filter_start_date) filter_end_date_str = GABUtils.format_datetime_to_default(filter_end_date) partition_end = GABUtils.format_datetime_to_default( (window_end_date - timedelta(days=1)) ) window_start_date_str = GABUtils.format_datetime_to_default(window_start_date) window_end_date_str = GABUtils.format_datetime_to_default(window_end_date) partition_filter = GABPartitionUtils.get_partition_condition( filter_start_date_str, partition_end ) self._LOGGER.info( "extended window for start and end dates are: " f"{filter_start_date_str} - {filter_end_date_str}" ) unpersist_list = [] for i in range(1, len(stages) + 1): stage = stages[str(i)] templated_file = stage["templated_file"] stage_file_path = stage["full_file_path"] templated = self._process_use_case_query_step( stage=stages[str(i)], templated_file=templated_file, use_case=use_case, reconciliation_cadence=reconciliation_cadence, cadence=cadence, snapshot_flag=snapshot_flag, window_start_date=window_start_date_str, partition_end=partition_end, 
def _process_use_case_query_step(
    self,
    stage: dict,
    templated_file: str,
    use_case: Row,
    reconciliation_cadence: str,
    cadence: str,
    snapshot_flag: str,
    window_start_date: str,
    partition_end: str,
    filter_start_date: str,
    filter_end_date: str,
    partition_filter: str,
) -> str:
    """Process each use case step.

    Process any intermediate view defined in the gab configuration table as step for
    the use case.

    Args:
        stage: stage to process.
        templated_file: sql file to process at this stage.
        use_case: gab use case to process.
        reconciliation_cadence: configured use case reconciliation window.
        cadence: cadence to process.
        snapshot_flag: flag indicating if for this cadence the snapshot is enabled.
        window_start_date: start date for the configured stage.
        partition_end: end date for the configured stage.
        filter_start_date: filter start date to replace in the stage query.
        filter_end_date: filter end date to replace in the stage query.
        partition_filter: partition condition.

    Returns:
        The stage SQL rendered by `_render_template_query`.
    """
    configured_project_col = stage.get("project_date_column")
    # The filter column defaults to the projection column unless the stage
    # configures a dedicated one.
    filter_col = stage.get("filter_date_column") or configured_project_col
    # dummy value to avoid empty error if empty on the configuration. Reading
    # the keys with .get keeps this fallback reachable (a plain subscript
    # would raise KeyError before it could apply).
    project_col = configured_project_col or "X"

    # Work on a copy so the shared combined configuration is never mutated.
    gab_base_configuration_copy = copy.deepcopy(
        GABCombinedConfiguration.COMBINED_CONFIGURATION.value
    )

    for item in gab_base_configuration_copy.values():
        self._update_rendered_item_cadence(
            reconciliation_cadence, cadence, project_col, item  # type: ignore
        )

    (
        rendered_date,
        rendered_to_date,
        join_condition,
    ) = self._get_cadence_configuration(
        gab_base_configuration_copy,
        cadence,
        reconciliation_cadence,
        snapshot_flag,
        use_case["start_of_the_week"],
        project_col,
        window_start_date,
        partition_end,
    )

    rendered_file = self._render_template_query(
        templated=templated_file,
        cadence=cadence,
        start_of_the_week=use_case["start_of_the_week"],
        query_id=use_case["query_id"],
        rendered_date=rendered_date,
        filter_start_date=filter_start_date,
        filter_end_date=filter_end_date,
        filter_col=filter_col,
        timezone_offset=use_case["timezone_offset"],
        join_condition=join_condition,
        partition_filter=partition_filter,
        rendered_to_date=rendered_to_date,
    )

    return rendered_file
""" return ( self._get_latest_run_date(query_id), self._get_latest_use_case_date(query_id), ) def _get_latest_run_date(self, query_id: str) -> datetime: """Get latest use case run date. Args: query_id: use case query id. """ last_success_run_sql = """ SELECT run_start_time FROM {database}.gab_log_events WHERE query_id = {query_id} AND stage_name = 'Final Insert' AND status = 'Success' ORDER BY 1 DESC LIMIT 1 """.format( # nosec: B608 database=self.spec.target_database, query_id=query_id ) try: latest_run_date: datetime = ExecEnv.SESSION.sql( last_success_run_sql ).collect()[0][0] except Exception: latest_run_date = datetime.strptime( "2020-01-01", GABDefaults.DATE_FORMAT.value ) return latest_run_date def _get_latest_use_case_date(self, query_id: str) -> datetime: """Get latest use case configured date. Args: query_id: use case query id. """ query_config_sql = """ SELECT lh_created_on FROM {lkp_query_builder} WHERE query_id = {query_id} """.format( # nosec: B608 lkp_query_builder=self.spec.lookup_table, query_id=query_id, ) latest_config_date: datetime = ExecEnv.SESSION.sql(query_config_sql).collect()[ 0 ][0] return latest_config_date @classmethod def _set_week_configuration_by_uc_start_of_week(cls, start_of_week: str) -> None: """Set week configuration by use case start of week. Args: start_of_week: use case start of week (MONDAY or SUNDAY). """ if start_of_week.upper() == "MONDAY": pendulum.week_starts_at(pendulum.MONDAY) pendulum.week_ends_at(pendulum.SUNDAY) elif start_of_week.upper() == "SUNDAY": pendulum.week_starts_at(pendulum.SUNDAY) pendulum.week_ends_at(pendulum.SATURDAY) else: raise NotImplementedError( f"The requested {start_of_week} is not implemented." "Supported `start_of_week` values: [MONDAY, SUNDAY]" ) @classmethod def _update_rendered_item_cadence( cls, reconciliation_cadence: str, cadence: str, project_col: str, item: dict ) -> None: """Override item properties based in the rendered item cadence. 
@classmethod
def _get_rendered_item_cadence(
    cls, reconciliation_cadence: str, cadence: str, project_col: str, item: dict
) -> dict:
    """Render the replaceable keys of a pre-configured gab combination.

    Args:
        reconciliation_cadence: configured use case reconciliation window.
        cadence: cadence to process.
        project_col: use case projection date column name.
        item: predefined use case combination.

    Returns:
        Dict with the rendered join select, project start and project end.
    """
    # Join select: week start, reconciliation cadence and cadence, in that order.
    join_select = item[GABKeys.JOIN_SELECT]
    join_select = join_select.replace(GABReplaceableKeys.CONFIG_WEEK_START, "Monday")
    join_select = join_select.replace(
        GABReplaceableKeys.RECONCILIATION_CADENCE, reconciliation_cadence
    )
    join_select = join_select.replace(GABReplaceableKeys.CADENCE, cadence)

    rendered = {GABKeys.JOIN_SELECT: join_select}
    # Both projection boundaries get the same cadence/date column replacement.
    for projection_key in (GABKeys.PROJECT_START, GABKeys.PROJECT_END):
        projection = item[projection_key].replace(GABReplaceableKeys.CADENCE, cadence)
        rendered[projection_key] = projection.replace(
            GABReplaceableKeys.DATE_COLUMN, project_col
        )
    return rendered
def _render_template_query(
    self,
    templated: str,
    cadence: str,
    start_of_the_week: str,
    query_id: str,
    rendered_date: str,
    filter_start_date: str,
    filter_end_date: str,
    filter_col: str,
    timezone_offset: str,
    join_condition: str,
    partition_filter: str,
    rendered_to_date: str,
) -> str:
    """Render the jinja templated stage SQL with the use case values.

    Args:
        templated: templated sql file to process at this stage.
        cadence: cadence to process.
        start_of_the_week: use case start of week (MONDAY or SUNDAY).
        query_id: gab configuration table use case identifier.
        rendered_date: projection start date.
        filter_start_date: filter start date to replace in the stage query.
        filter_end_date: filter end date to replace in the stage query.
        filter_col: use case projection date column name.
        timezone_offset: timezone offset configured in the use case.
        join_condition: string containing the join condition.
        partition_filter: partition condition.
        rendered_to_date: projection end date.

    Returns:
        The rendered SQL.
    """
    template = Template(templated)
    template_params = {
        "cadence": f"'{cadence}' as cadence",
        "cadence_run": cadence,
        "week_start": start_of_the_week,
        "query_id": f"'{query_id}' as query_id",
        "project_date_column": rendered_date,
        "target_table": self.spec.target_table,
        "database": self.spec.source_database,
        "start_date": filter_start_date,
        "end_date": filter_end_date,
        "filter_date_column": filter_col,
        "offset_value": timezone_offset,
        # A missing join condition renders as an empty string.
        "joins": join_condition or "",
        "partition_filter": partition_filter,
        "to_date": rendered_to_date,
    }
    return template.render(**template_params)
""" run_start_time = datetime.now() creation_status: str error_message: Exception | str try: tmp = ExecEnv.SESSION.sql(rendered_template) num_partitions = ExecEnv.SESSION.conf.get( self._SPARK_DEFAULT_PARALLELISM_CONFIG, self._SPARK_DEFAULT_PARALLELISM_VALUE, ) if stage["repartition"]: if stage["repartition"].get("numPartitions"): num_partitions = stage["repartition"]["numPartitions"] if stage["repartition"].get("keys"): tmp = tmp.repartition( int(num_partitions), *stage["repartition"]["keys"] ) self._LOGGER.info("Repartitioned on given Key(s)") else: tmp = tmp.repartition(int(num_partitions)) self._LOGGER.info("Repartitioned on given partition count") temp_step_view_name: str = stage["table_alias"] tmp.createOrReplaceTempView(temp_step_view_name) if stage["storage_level"]: ExecEnv.SESSION.sql( "CACHE TABLE {tbl} " "OPTIONS ('storageLevel' '{type}')".format( tbl=temp_step_view_name, type=stage["storage_level"], ) ) ExecEnv.SESSION.sql( "SELECT COUNT(*) FROM {tbl}".format( # nosec: B608 tbl=temp_step_view_name ) ) self._LOGGER.info(f"Cached stage view - {temp_step_view_name} ") creation_status = "Success" error_message = "NA" except Exception as err: creation_status = "Failed" error_message = err raise err finally: run_end_time = datetime.now() GABUtils().logger( run_start_time, run_end_time, window_start_date, window_end_date, query_id, query_label, cadence, stage_file_path, rendered_template, creation_status, error_message, self.spec.target_database, ) return temp_step_view_name def _generate_view_statement( self, query_id: str, cadence: str, temp_stage_view_name: str, lookup_query_builder: DataFrame, window_start_date: str, window_end_date: str, query_label: str, ) -> bool: """Feed use case data to the insights table (default: unified use case table). Args: query_id: gab configuration table use case identifier. cadence: cadence to process. temp_stage_view_name: name of the temp view generated by the stage. lookup_query_builder: gab configuration data. 
@classmethod
def _unpersist_cached_views(cls, unpersist_list: list[str]) -> None:
    """Unpersist cached views.

    Issues one `UNCACHE TABLE` statement per view name, in list order.

    Args:
        unpersist_list: list containing the view names to unpersist.
    """
    # Plain loop: the statements run only for their side effect, so no
    # throwaway list of results is built.
    for view_name in unpersist_list:
        ExecEnv.SESSION.sql("UNCACHE TABLE {tbl}".format(tbl=view_name))
""" if str(latest_config_date) > str(latest_run_date): GABViewManager( query_id=query_id, lookup_query_builder=lookup_query_builder, target_database=self.spec.target_database, target_table=self.spec.target_table, ).generate_use_case_views() else: self._LOGGER.info( "View is not being re-created as there are no changes in the " "configuration after the latest run" ) ================================================ FILE: lakehouse_engine/algorithms/reconciliator.py ================================================ """Module containing the Reconciliator class.""" from enum import Enum from typing import List import pyspark.sql.functions as spark_fns from pyspark.sql import DataFrame from pyspark.sql.functions import ( # noqa: A004 abs, coalesce, col, lit, try_divide, when, ) from pyspark.sql.types import FloatType from lakehouse_engine.algorithms.exceptions import ReconciliationFailedException from lakehouse_engine.core.definitions import InputSpec, ReconciliatorSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.core.executable import Executable from lakehouse_engine.io.reader_factory import ReaderFactory from lakehouse_engine.transformers.optimizers import Optimizers from lakehouse_engine.utils.logging_handler import LoggingHandler class ReconciliationType(Enum): """Type of Reconciliation.""" PCT = "percentage" ABS = "absolute" class ReconciliationTransformers(Enum): """Transformers Available for the Reconciliation Algorithm.""" AVAILABLE_TRANSFORMERS = { "cache": Optimizers.cache, "persist": Optimizers.persist, } class Reconciliator(Executable): """Class to define the behavior of an algorithm that checks if data reconciles. Checking if data reconciles, using this algorithm, is a matter of reading the 'truth' data and the 'current' data. You can use any input specification compatible with the lakehouse engine to read 'truth' or 'current' data. 
On top of that, you can pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can preprocess the data before it goes into the actual reconciliation process. Moreover, you can use the 'truth_preprocess_query_args' and 'current_preprocess_query_args' to pass additional arguments to be used to apply additional operations on top of the dataframe, resulting from the previous steps. With these arguments you can apply additional operations like caching or persisting the Dataframe. The way to pass the additional arguments for the operations is similar to the TransformSpec, but only a few operations are allowed. Those are defined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS. The reconciliation process is focused on joining 'truth' with 'current' by all provided columns except the ones passed as 'metrics'. After that it calculates the differences in the metrics attributes (either percentage or absolute difference). Finally, it aggregates the differences, using the supplied aggregation function (e.g., sum, avg, min, max, etc). All of these configurations are passed via the ACON to instantiate a ReconciliatorSpec object. !!! note It is crucial that both the current and truth datasets have exactly the same structure. !!! note You should not use 0 as yellow or red threshold, as the algorithm will verify if the difference between the truth and current values is bigger or equal than those thresholds. !!! note The reconciliation does not produce any negative values or percentages, as we use the absolute value of the differences. This means that the recon result will not indicate if it was the current values that were bigger or smaller than the truth values, or vice versa. """ _logger = LoggingHandler(__name__).get_logger() def __init__(self, acon: dict): """Construct Algorithm instances. Args: acon: algorithm configuration. 
""" self.spec: ReconciliatorSpec = ReconciliatorSpec( metrics=acon["metrics"], truth_input_spec=InputSpec(**acon["truth_input_spec"]), current_input_spec=InputSpec(**acon["current_input_spec"]), truth_preprocess_query=acon.get("truth_preprocess_query", None), truth_preprocess_query_args=acon.get("truth_preprocess_query_args", None), current_preprocess_query=acon.get("current_preprocess_query", None), current_preprocess_query_args=acon.get( "current_preprocess_query_args", None ), ignore_empty_df=acon.get("ignore_empty_df", False), ) def get_source_of_truth(self) -> DataFrame: """Get the source of truth (expected result) for the reconciliation process. Returns: DataFrame containing the source of truth. """ truth_df = ReaderFactory.get_data(self.spec.truth_input_spec) if self.spec.truth_preprocess_query: truth_df.createOrReplaceTempView("truth") truth_df = ExecEnv.SESSION.sql(self.spec.truth_preprocess_query) return truth_df def get_current_results(self) -> DataFrame: """Get the current results from the table that we are checking if it reconciles. Returns: DataFrame containing the current results. 
""" current_df = ReaderFactory.get_data(self.spec.current_input_spec) if self.spec.current_preprocess_query: current_df.createOrReplaceTempView("current") current_df = ExecEnv.SESSION.sql(self.spec.current_preprocess_query) return current_df def execute(self) -> None: """Reconcile the current results against the truth dataset.""" truth_df = self.get_source_of_truth() self._apply_preprocess_query_args( truth_df, self.spec.truth_preprocess_query_args ) self._logger.info("Source of truth:") truth_df.show(1000, truncate=False) current_results_df = self.get_current_results() self._apply_preprocess_query_args( current_results_df, self.spec.current_preprocess_query_args ) self._logger.info("Current results:") current_results_df.show(1000, truncate=False) status = "green" # if ignore_empty_df is true, run empty check on truth_df and current_results_df # if both the dataframes are empty then exit with green if ( self.spec.ignore_empty_df and truth_df.isEmpty() and current_results_df.isEmpty() ): self._logger.info( f"ignore_empty_df is {self.spec.ignore_empty_df}, " f"truth_df and current_results_df are empty, " f"hence ignoring reconciliation" ) self._logger.info("The Reconciliation process has succeeded.") return recon_results = self._get_recon_results( truth_df, current_results_df, self.spec.metrics ) self._logger.info(f"Reconciliation result: {recon_results}") for m in self.spec.metrics: metric_name = f"{m['metric']}_{m['type']}_diff_{m['aggregation']}" if m["yellow"] <= recon_results[metric_name] < m["red"]: if status == "green": # only switch to yellow if it was green before, otherwise we want # to preserve 'red' as the final status. status = "yellow" elif m["red"] <= recon_results[metric_name]: status = "red" if status != "green": raise ReconciliationFailedException( f"The Reconciliation process has failed with status: {status}." 
@staticmethod
def _apply_preprocess_query_args(
    df: DataFrame, preprocess_query_args: List[dict]
) -> DataFrame:
    """Apply transformers on top of the preprocessed query.

    When `preprocess_query_args` is None the default caching transformer is
    applied on a best-effort basis. Otherwise each configured transformer is
    applied in order, every one on the result of the previous one. An empty
    list returns the dataframe unchanged.

    Args:
        df: dataframe being transformed.
        preprocess_query_args: dict having the functions/transformations to
            apply and respective arguments.

    Returns:
        the transformed Dataframe.
    """
    if preprocess_query_args is None:
        try:
            return df.transform(Optimizers.cache())
        except Exception as e:
            # Default caching is an optimisation only; keep going without it.
            Reconciliator._logger.warning(
                "Could not apply default caching to the dataframe. "
                f"Continuing without caching. Exception: {e}"
            )
            return df

    transformed_df = df
    for transformation in preprocess_query_args:
        rec_func = ReconciliationTransformers.AVAILABLE_TRANSFORMERS.value[
            transformation["function"]
        ](
            **transformation.get("args", {})
        )  # type: ignore
        # Chain on the running result so earlier transformations are kept
        # (applying each one to the original `df` would keep only the last).
        transformed_df = transformed_df.transform(rec_func)

    return transformed_df
) # truth and current are joined on all columns except the metrics joined_df = truth_df.alias("truth").join( current_results_df.alias("current"), [ truth_df[c] == current_results_df[c] for c in current_results_df.columns if c not in [m["metric"] for m in metrics] ], how="full", ) for m in metrics: if m["type"] == ReconciliationType.PCT.value: joined_df = joined_df.withColumn( f"{m['metric']}_{m['type']}_diff", coalesce( ( # we need to make sure we don't produce negative values # because our thresholds only accept > or >= comparisons. abs( try_divide( ( col(f"current.{m['metric']}") - col(f"truth.{m['metric']}") ), abs(col(f"truth.{m['metric']}")), ) ) ), # if the formula above produces null, we need to consider where # it came from: we check below if the values were the same, # and if so the diff is 0, if not the diff is 1 (e.g., the null # result might have come from a division by 0). when( col(f"current.{m['metric']}").eqNullSafe( col(f"truth.{m['metric']}") ), lit(0), ).otherwise(lit(1)), ), ) elif m["type"] == ReconciliationType.ABS.value: joined_df = joined_df.withColumn( f"{m['metric']}_{m['type']}_diff", abs( coalesce(col(f"current.{m['metric']}"), lit(0)) - coalesce(col(f"truth.{m['metric']}"), lit(0)) ), ) else: raise NotImplementedError( "The requested reconciliation type is not yet implemented." 
) joined_df = joined_df.withColumn( f"{m['metric']}_{m['type']}_diff", col(f"{m['metric']}_{m['type']}_diff").cast(FloatType()), ) results_df = joined_df.agg( *[ getattr(spark_fns, m["aggregation"])( f"{m['metric']}_{m['type']}_diff" ).alias(f"{m['metric']}_{m['type']}_diff_{m['aggregation']}") for m in metrics ] ) return results_df.collect()[0].asDict() ================================================ FILE: lakehouse_engine/algorithms/sensor.py ================================================ """Module to define Sensor algorithm behavior.""" from pyspark.sql import DataFrame from lakehouse_engine.algorithms.algorithm import Algorithm from lakehouse_engine.algorithms.exceptions import ( NoNewDataException, SensorAlreadyExistsException, ) from lakehouse_engine.core.definitions import ( SENSOR_ALLOWED_DATA_FORMATS, InputFormat, ReadType, SensorSpec, SensorStatus, ) from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.core.sensor_manager import ( SensorControlTableManager, SensorUpstreamManager, ) from lakehouse_engine.utils.logging_handler import LoggingHandler class Sensor(Algorithm): """Class representing a sensor to check if the upstream has new data.""" _LOGGER = LoggingHandler(__name__).get_logger() def __init__(self, acon: dict): """Construct Sensor instances. Args: acon: algorithm configuration. """ self.spec: SensorSpec = SensorSpec.create_from_acon(acon=acon) self._validate_sensor_spec() if self._check_if_sensor_already_exists(): raise SensorAlreadyExistsException( "There's already a sensor registered with same id or assets!" 
) def execute(self) -> bool: """Execute the sensor.""" self._LOGGER.info(f"Starting {self.spec.input_spec.data_format} sensor...") new_data_df = SensorUpstreamManager.read_new_data(sensor_spec=self.spec) if self.spec.input_spec.read_type == ReadType.STREAMING.value: Sensor._run_streaming_sensor(sensor_spec=self.spec, new_data_df=new_data_df) elif self.spec.input_spec.read_type == ReadType.BATCH.value: Sensor._run_batch_sensor( sensor_spec=self.spec, new_data_df=new_data_df, ) has_new_data = SensorControlTableManager.check_if_sensor_has_acquired_data( self.spec.sensor_id, self.spec.control_db_table_name, ) self._LOGGER.info( f"Sensor {self.spec.sensor_id} has previously " f"acquired data? {has_new_data}" ) if self.spec.fail_on_empty_result and not has_new_data: raise NoNewDataException( f"No data was acquired by {self.spec.sensor_id} sensor." ) return has_new_data def _check_if_sensor_already_exists(self) -> bool: """Check if sensor already exists in the table to avoid duplicates.""" row = SensorControlTableManager.read_sensor_table_data( sensor_id=self.spec.sensor_id, control_db_table_name=self.spec.control_db_table_name, ) if row and row.assets != self.spec.assets: return True else: row = SensorControlTableManager.read_sensor_table_data( assets=self.spec.assets, control_db_table_name=self.spec.control_db_table_name, ) return row is not None and row.sensor_id != self.spec.sensor_id @classmethod def _run_streaming_sensor( cls, sensor_spec: SensorSpec, new_data_df: DataFrame ) -> None: """Run sensor in streaming mode (internally runs in batch mode).""" def foreach_batch_check_new_data(df: DataFrame, batch_id: int) -> None: # forcing session to be available inside forEachBatch on # Spark Connect ExecEnv.get_or_create() Sensor._run_batch_sensor( sensor_spec=sensor_spec, new_data_df=df, ) new_data_df.writeStream.trigger(availableNow=True).option( "checkpointLocation", sensor_spec.checkpoint_location ).foreachBatch(foreach_batch_check_new_data).start().awaitTermination() 
@classmethod def _run_batch_sensor( cls, sensor_spec: SensorSpec, new_data_df: DataFrame, ) -> None: """Run sensor in batch mode. Args: sensor_spec: sensor spec containing all sensor information. new_data_df: DataFrame possibly containing new data. """ new_data_first_row = SensorUpstreamManager.get_new_data(new_data_df) cls._LOGGER.info( f"Sensor {sensor_spec.sensor_id} has new data from upstream? " f"{new_data_first_row is not None}" ) if new_data_first_row: SensorControlTableManager.update_sensor_status( sensor_spec=sensor_spec, status=SensorStatus.ACQUIRED_NEW_DATA.value, upstream_key=( new_data_first_row.UPSTREAM_KEY if "UPSTREAM_KEY" in new_data_df.columns else None ), upstream_value=( new_data_first_row.UPSTREAM_VALUE if "UPSTREAM_VALUE" in new_data_df.columns else None ), ) cls._LOGGER.info( f"Successfully updated sensor status for sensor " f"{sensor_spec.sensor_id}..." ) def _validate_sensor_spec(self) -> None: """Validate if sensor spec Read Type is allowed for the selected Data Format.""" if InputFormat.exists(self.spec.input_spec.data_format): if ( self.spec.input_spec.data_format not in SENSOR_ALLOWED_DATA_FORMATS[self.spec.input_spec.read_type] ): raise NotImplementedError( f"A sensor has not been implemented yet for this data format or, " f"this data format is not available for the read_type" f" {self.spec.input_spec.read_type}. " f"Check the allowed combinations of read_type and data_formats:" f" {SENSOR_ALLOWED_DATA_FORMATS}" ) else: raise NotImplementedError( f"Data format {self.spec.input_spec.data_format} isn't implemented yet." 
) ================================================ FILE: lakehouse_engine/algorithms/sensors/__init__.py ================================================ """Package containing all the lakehouse engine Sensor Heartbeat algorithms.""" ================================================ FILE: lakehouse_engine/algorithms/sensors/heartbeat.py ================================================ """Module to define Heartbeat Sensor algorithm behavior.""" import re from typing import Optional from delta import DeltaTable from pyspark import Row from pyspark.sql import DataFrame from pyspark.sql.column import Column from pyspark.sql.functions import ( col, concat_ws, count, current_timestamp, lit, regexp_replace, row_number, trim, upper, ) from pyspark.sql.window import Window from lakehouse_engine.algorithms.algorithm import Algorithm from lakehouse_engine.algorithms.sensors.sensor import Sensor from lakehouse_engine.core.definitions import ( HEARTBEAT_SENSOR_UPDATE_SET, HeartbeatConfigSpec, HeartbeatSensorSource, HeartbeatStatus, SensorStatus, ) from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.core.sensor_manager import ( SensorJobRunManager, SensorUpstreamManager, ) from lakehouse_engine.terminators.sensor_terminator import SensorTerminator from lakehouse_engine.utils.databricks_utils import DatabricksUtils from lakehouse_engine.utils.logging_handler import LoggingHandler class Heartbeat(Algorithm): """Class representing a Heartbeat to check if the upstream has new data.""" _LOGGER = LoggingHandler(__name__).get_logger() def __init__(self, acon: dict): """Construct Heartbeat instances. Args: acon: algorithm configuration. 
""" self.spec: HeartbeatConfigSpec = HeartbeatConfigSpec.create_from_acon(acon=acon) def execute(self) -> None: """Execute the heartbeat.""" latest_event_current_timestamp = current_timestamp() heartbeat_sensor_delta_table = DeltaTable.forName( ExecEnv.SESSION, self.spec.heartbeat_sensor_db_table, ) sensor_source = self.spec.sensor_source active_jobs_from_heartbeat_control_table_df = self._get_active_heartbeat_jobs( heartbeat_sensor_delta_table, sensor_source ) for ( control_table_df_row ) in active_jobs_from_heartbeat_control_table_df.collect(): sensor_acon = self._get_sensor_acon_from_heartbeat( self.spec, control_table_df_row ) sensors_with_new_data = self._execute_batch_of_sensor( sensor_acon, control_table_df_row ) if sensors_with_new_data: self._update_heartbeat_status_with_sensor_info( active_jobs_from_heartbeat_control_table_df, heartbeat_sensor_delta_table, self._get_heartbeat_sensor_condition(sensors_with_new_data), latest_event_current_timestamp, sensor_source, ) @classmethod def _get_active_heartbeat_jobs( cls, heartbeat_sensor_delta_table: DeltaTable, sensor_source: str ) -> DataFrame: """Get UNPAUSED and NULL or COMPLETED status record from control table. :param heartbeat_sensor_delta_table: DeltaTable for heartbeat sensor. :param sensor_source: source system from Spec(e.g. sap_b4, delta, kafka etc.). Returns: A control table DataFrame containing records for specified sensor_source that are UNPAUSED and have a status of either NULL or COMPLETED. """ full_control_table = heartbeat_sensor_delta_table.toDF() filtered_control_table = full_control_table.filter( f"lower(sensor_source) == '{sensor_source}'" ).filter( "job_state == 'UNPAUSED' and (status is null OR status == 'COMPLETED')" ) return filtered_control_table @classmethod def generate_unique_column_values(cls, main_col: str, col_to_append: str) -> str: """Generate a unique value by appending columns and replacing specific chars. 
Generate a unique value by appending another column and replacing spaces, dots, and colons with underscores for consistency. :param main_col: The primary column value. :param col_to_append: Column value to append for uniqueness. Returns: A unique, combined column value. """ return f"{re.sub(r'[ :.]', '_', main_col)}_{col_to_append}" @classmethod def _get_sensor_acon_from_heartbeat( cls, heartbeat_spec: HeartbeatConfigSpec, control_table_df_row: Row ) -> dict: """Create sensor acon from heartbeat config and specifications. :param heartbeat_spec: Heartbeat specifications. :param control_table_df_row: Control table active records Dataframe Row. Returns: The sensor acon dict. """ sensors_to_execute: dict = { "sensor_id": ( cls.generate_unique_column_values( control_table_df_row["sensor_id"], control_table_df_row["trigger_job_id"], ) ), # 1. sensor_id can be same for two or more different trigger_job_id # 2. Replacing colon,space,dot(.) with underscore(_) is required to get the # checkpoint_location fixed in case of delta_table and kafka source "assets": [ cls.generate_unique_column_values( control_table_df_row["asset_description"], control_table_df_row["trigger_job_id"], ) ], "control_db_table_name": heartbeat_spec.lakehouse_engine_sensor_db_table, "input_spec": { "spec_id": "sensor_upstream", "read_type": control_table_df_row["sensor_read_type"], "data_format": heartbeat_spec.data_format, "db_table": ( control_table_df_row["sensor_id"] if heartbeat_spec.data_format == "delta" else None ), "options": heartbeat_spec.options, "location": ( ( heartbeat_spec.base_trigger_file_location + "/" + control_table_df_row["sensor_id"] ) if heartbeat_spec.base_trigger_file_location is not None else None ), "schema": heartbeat_spec.schema_dict, }, "preprocess_query": control_table_df_row["preprocess_query"], "base_checkpoint_location": heartbeat_spec.base_checkpoint_location, "fail_on_empty_result": False, } final_sensors_to_execute = cls._enhance_sensor_acon_extra_options( 
heartbeat_spec, control_table_df_row, sensors_to_execute ) return final_sensors_to_execute @classmethod def _enhance_sensor_acon_extra_options( cls, heartbeat_spec: HeartbeatConfigSpec, control_table_df_row: Row, sensors_to_execute: dict, ) -> dict: """Enhance sensor acon with extra options for specific source system. :param heartbeat_spec: Heartbeat specifications. :param control_table_df_row: Control table active records Dataframe Row. :param sensors_to_execute: sensor acon dictionary from previous step. Returns: The sensor acon dict having enhanced options for specific sensor_source. """ LATEST_FETCH_EVENT_TIMESTAMP = ( control_table_df_row.latest_event_fetched_timestamp ) upstream_key = control_table_df_row["upstream_key"] upstream_value = ( LATEST_FETCH_EVENT_TIMESTAMP.strftime("%Y%m%d%H%M%S") if LATEST_FETCH_EVENT_TIMESTAMP is not None else "19000101000000" ) if control_table_df_row.sensor_source.lower() in [ HeartbeatSensorSource.SAP_B4.value, HeartbeatSensorSource.SAP_BW.value, ]: sensors_to_execute["input_spec"]["options"]["prepareQuery"] = ( SensorUpstreamManager.generate_sensor_sap_logchain_query( chain_id=control_table_df_row.sensor_id, dbtable=heartbeat_spec.jdbc_db_table, ) ) sensors_to_execute["input_spec"]["options"]["query"] = ( SensorUpstreamManager.generate_filter_exp_query( sensor_id=control_table_df_row.sensor_id, filter_exp="?upstream_key > '?upstream_value'", control_db_table_name=( heartbeat_spec.lakehouse_engine_sensor_db_table ), upstream_key=upstream_key, upstream_value=upstream_value, ) ) elif ( control_table_df_row.sensor_source.lower() == HeartbeatSensorSource.LMU_DELTA_TABLE.value ): sensors_to_execute["preprocess_query"] = ( SensorUpstreamManager.generate_filter_exp_query( sensor_id=control_table_df_row.sensor_id, filter_exp="?upstream_key > '?upstream_value'", control_db_table_name=( heartbeat_spec.lakehouse_engine_sensor_db_table ), upstream_key=upstream_key, upstream_value=upstream_value, ) ) elif ( 
control_table_df_row.sensor_source.lower() == HeartbeatSensorSource.KAFKA.value ): kafka_options = cls._get_all_kafka_options( heartbeat_spec.kafka_configs, control_table_df_row["sensor_id"], heartbeat_spec.kafka_secret_scope, ) sensors_to_execute["input_spec"]["options"] = kafka_options return sensors_to_execute @classmethod def _get_all_kafka_options( cls, kafka_configs: dict, kafka_sensor_id: str, kafka_secret_scope: str, ) -> dict: """Get all Kafka extra options for sensor ACON. Read all heartbeat sensor related kafka config dynamically based on data product name or any other prefix which should match with sensor_id prefix. :param kafka_configs: kafka config read from yaml file. :param kafka_sensor_id: kafka topic for which new event to be fetched. :param kafka_secret_scope: secret scope used for kafka processing. Returns: The sensor acon dict having enhanced options for kafka source. """ sensor_id_desc = kafka_sensor_id.split(":") dp_name_filter = sensor_id_desc[0].strip() KAFKA_TOPIC = sensor_id_desc[1].strip() KAFKA_BOOTSTRAP_SERVERS = kafka_configs[dp_name_filter][ "kafka_bootstrap_servers_list" ] KAFKA_TRUSTSTORE_LOCATION = kafka_configs[dp_name_filter][ "kafka_ssl_truststore_location" ] KAFKA_KEYSTORE_LOCATION = kafka_configs[dp_name_filter][ "kafka_ssl_keystore_location" ] KAFKA_TRUSTSTORE_PSWD_SECRET_KEY = kafka_configs[dp_name_filter][ "truststore_pwd_secret_key" ] KAFKA_TRUSTSTORE_PSWD = ( DatabricksUtils.get_db_utils(ExecEnv.SESSION).secrets.get( scope=kafka_secret_scope, key=KAFKA_TRUSTSTORE_PSWD_SECRET_KEY, ) if KAFKA_TRUSTSTORE_PSWD_SECRET_KEY else None ) KAFKA_KEYSTORE_PSWD_SECRET_KEY = kafka_configs[dp_name_filter][ "keystore_pwd_secret_key" ] KAFKA_KEYSTORE_PSWD = ( DatabricksUtils.get_db_utils(ExecEnv.SESSION).secrets.get( scope=kafka_secret_scope, key=KAFKA_KEYSTORE_PSWD_SECRET_KEY, ) if KAFKA_KEYSTORE_PSWD_SECRET_KEY else None ) kafka_options_dict = { "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS, "subscribe": KAFKA_TOPIC, 
"startingOffsets": "earliest", "kafka.security.protocol": "SSL", "kafka.ssl.truststore.location": KAFKA_TRUSTSTORE_LOCATION, "kafka.ssl.truststore.password": KAFKA_TRUSTSTORE_PSWD, "kafka.ssl.keystore.location": KAFKA_KEYSTORE_LOCATION, "kafka.ssl.keystore.password": KAFKA_KEYSTORE_PSWD, } return kafka_options_dict @classmethod def _execute_batch_of_sensor( cls, sensor_acon: dict, control_table_df_row: Row ) -> dict: """Execute sensor acon to fetch NEW EVENT AVAILABLE for sensor source system. :param sensor_acon: sensor acon created from heartbeat config and specs. :param control_table_df_row: Control table active records Dataframe Row. Returns: Dict containing sensor_id and trigger_job_id for sensor with new data. """ sensors_with_new_data: dict = {} cls._LOGGER.info(f"Executing sensor: {sensor_acon}") has_new_data = Sensor(sensor_acon).execute() if has_new_data: sensors_with_new_data["sensor_id"] = control_table_df_row["sensor_id"] sensors_with_new_data["trigger_job_id"] = control_table_df_row[ "trigger_job_id" ] return sensors_with_new_data @classmethod def _get_heartbeat_sensor_condition( cls, sensors_with_new_data: dict, ) -> Optional[str]: """Get heartbeat sensor new event available condition. :param sensors_with_new_data: dict having NEW_EVENT_AVAILABLE sensor_id record. Returns: String having condition for sensor having new data available. """ heartbeat_sensor_with_new_event_available = ( f"(sensor_id = '{sensors_with_new_data['sensor_id']}' AND " f"trigger_job_id = '{sensors_with_new_data['trigger_job_id']}')" ) return heartbeat_sensor_with_new_event_available @classmethod def _update_heartbeat_status_with_sensor_info( cls, heartbeat_sensor_jobs: DataFrame, heartbeat_sensor_delta_table: DeltaTable, heartbeat_with_new_event_available_condition: str, latest_event_current_timestamp: Column, sensor_source: str, ) -> None: """Update heartbeat status with sensor info. :param heartbeat_sensor_jobs: active UNPAUSED jobs from Control table dataframe. 
:param heartbeat_sensor_delta_table: heartbeat sensor Delta table. :param heartbeat_with_new_event_available_condition: new event available cond. :param latest_event_current_timestamp: timestamp when new event was captured. """ if heartbeat_with_new_event_available_condition: sensors_with_new_event_available = ( heartbeat_sensor_jobs.filter( heartbeat_with_new_event_available_condition ) .withColumn("status", lit(HeartbeatStatus.NEW_EVENT_AVAILABLE.value)) .withColumn("status_change_timestamp", current_timestamp()) .withColumn( "latest_event_fetched_timestamp", latest_event_current_timestamp ) ) new_event_merge_condition = f"""target.sensor_id = src.sensor_id AND target.trigger_job_id = src.trigger_job_id AND target.sensor_source = '{sensor_source}'""" if sensors_with_new_event_available.count() > 0: cls.update_heartbeat_control_table( heartbeat_sensor_delta_table, sensors_with_new_event_available, new_event_merge_condition, ) else: cls._LOGGER.info("No sensors to execute!") @classmethod def update_heartbeat_control_table( cls, heartbeat_sensor_delta_table: DeltaTable, updated_data: DataFrame, heartbeat_control_table_merge_condition: str, ) -> None: """Update heartbeat control table with the new data. :param heartbeat_sensor_delta_table: db_table heartbeat sensor control table. :param updated_data: data to update the control table. :param heartbeat_control_table_merge_condition: merge condition for table. """ cls._LOGGER.info(f"updated data: {updated_data}") heartbeat_sensor_delta_table.alias("target").merge( updated_data.alias("src"), (heartbeat_control_table_merge_condition), ).whenMatchedUpdate( set=HEARTBEAT_SENSOR_UPDATE_SET ).whenNotMatchedInsertAll().execute() @classmethod def get_heartbeat_jobs_to_trigger( cls, heartbeat_sensor_db_table: str, heartbeat_sensor_control_table_df: DataFrame, ) -> list[Row]: """Get heartbeat jobs to trigger. Check if all the dependencies are satisfied to trigger the job. 
dependency_flag column to be checked for all sensor_id and trigger_job_id
        combination keeping status as NEW_EVENT_AVAILABLE in mind.

        Check dependencies based on trigger_job_id. From all control table records
        having status as NEW_EVENT_AVAILABLE, it will fetch status and
        dependency_flag for all records having the same trigger_job_id.
        If trigger_job_id, status, dependency_flag combination is the same for all
        dependencies, get distinct records and do a count level aggregation for
        trigger_job_id, dependency_flag.

        Count level aggregation based on trigger_job_id, dependency_flag picks all
        those trigger_job_id which don't satisfy the dependency, as it denotes there
        is more than one record present having dependency_flag = "TRUE" and status
        is different for the same trigger_job_id.
        If count is not more than 1, it means the condition is satisfied and the job
        id will be considered for triggering.
        If trigger_job_id, status, dependency_flag combination is not the same for
        all dependencies, the aggregated count will result in more than one record,
        it will go under jobs_to_not_trigger and the job will not be triggered.

        :param heartbeat_sensor_db_table: heartbeat sensor table name.
        :param heartbeat_sensor_control_table_df: Dataframe for heartbeat control table.
        :return: list of jobs to be triggered.
        """
        # Get all distinct trigger_job_id where status is NEW_EVENT_AVAILABLE
        trigger_jobs_new_events_df = (
            heartbeat_sensor_control_table_df.filter(
                f"status == '{HeartbeatStatus.NEW_EVENT_AVAILABLE.value}'"
            )
            .select(col("trigger_job_id"))
            .distinct()
        )

        # Get distinct trigger_job_id, status, dependency_flag for control table records
        # (read from the full table, not from the dataframe received as argument)
        full_data_df = (
            ExecEnv.SESSION.table(heartbeat_sensor_db_table)
            .select(
                col("trigger_job_id"),
                col("status"),
                upper(col("dependency_flag")).alias("dependency_flag"),
            )
            .distinct()
        )

        # Join NEW_EVENT_AVAILABLE records with full table to get all dependencies
        # based on trigger_job_id. dependency_flag = "TRUE" needs to be checked as
        # we are only concerned with records where dependencies needs to be checked.
        full_data_trigger_job_id = col("full_data.trigger_job_id")
        dep_flag_comparison = trim(upper(col("dependency_flag"))) == "TRUE"

        jobs_with_new_events_df = (
            full_data_df.alias("full_data")
            .join(
                trigger_jobs_new_events_df.alias("jobs_with_new_events"),
                col("jobs_with_new_events.trigger_job_id") == full_data_trigger_job_id,
                "inner",
            )
            .select(
                full_data_trigger_job_id,
                col("full_data.status"),
                col("full_data.dependency_flag"),
            )
        ).filter(dep_flag_comparison)

        # Count level aggregation based on trigger_job_id, dependency_flag picks all
        # those trigger_job_id which don't satisfy the dependency, as it denotes there
        # is more than one record present having dependency_flag = "TRUE" and status
        # is different for the same trigger_job_id.
        # NOTE: the dependency flag filter below repeats the one already applied
        # to jobs_with_new_events_df above.
        jobs_to_not_trigger_with_new_event_df = (
            jobs_with_new_events_df.filter(dep_flag_comparison)
            .groupBy("trigger_job_id", "dependency_flag")
            .agg(count("trigger_job_id").alias("count"))
            .where(col("count") > 1)
        )

        # the left anti join drops every job flagged above as not to be triggered
        jobs_to_trigger_df = (
            jobs_with_new_events_df.alias("full_data")
            .join(
                jobs_to_not_trigger_with_new_event_df.alias("jobs_to_not_trigger"),
                (col("jobs_to_not_trigger.trigger_job_id") == full_data_trigger_job_id),
                "left_anti",
            )
            .groupBy("trigger_job_id", "status")
            .agg(count("trigger_job_id").alias("count"))
            .where(col("count") == 1)
        )

        jobs_to_trigger_df = jobs_to_trigger_df.select("trigger_job_id").distinct()
        # the result is brought to the driver as a list of Row objects
        jobs_to_trigger = jobs_to_trigger_df.collect()

        return jobs_to_trigger

    @classmethod
    def get_anchor_job_record(
        cls, heartbeat_sensor_table_df: DataFrame, job_id: str, sensor_source: str
    ) -> DataFrame:
        """Identify anchor jobs from the control table.

        Using trigger_job_id as the partition key, ordered by status_change_timestamp
        in descending order and sensor_id in ascending order, filtered by the
        specific sensor_source.
This method partitions records by trigger_job_id, orders them by status_change_timestamp (descending) and sensor_id (ascending), and filters by the specified sensor_source. Filtering on sensor_source makes sure if current source is eligible for triggering the job and updates or not. This process ensures that only the appropriate single record triggers the job and the control table is updated accordingly. This approach eliminates redundant triggers and unnecessary updates. :param heartbeat_sensor_table_df: Heartbeat sensor control table Dataframe. :param job_id: Trigger job_id from table for which dependency also satisfies. :param sensor_source: source of the heartbeat sensor record. Returns: Control table DataFrame containing anchor job records valid for triggering. """ heartbeat_anchor_records_df = heartbeat_sensor_table_df.filter( col("trigger_job_id") == job_id ).withColumn( "row_no", row_number().over( Window.partitionBy("trigger_job_id").orderBy( col("status_change_timestamp").desc(), col("sensor_id").asc() ) ), ) heartbeat_anchor_records_df = heartbeat_anchor_records_df.filter( f"row_no = 1 AND sensor_source = '{sensor_source}'" ).drop("row_no") return heartbeat_anchor_records_df def heartbeat_sensor_trigger_jobs(self) -> None: """Get heartbeat jobs to trigger. :param self.spec: HeartbeatConfigSpec having config and control table spec. 
""" heartbeat_sensor_db_table = self.spec.heartbeat_sensor_db_table sensor_source = self.spec.sensor_source heartbeat_sensor_delta_table = DeltaTable.forName( ExecEnv.SESSION, heartbeat_sensor_db_table ) heartbeat_sensor_control_table_df = ExecEnv.SESSION.table( heartbeat_sensor_db_table ).filter( f"lower(sensor_source) == '{sensor_source}' and (job_state == 'UNPAUSED')" ) jobs_to_trigger = self.get_heartbeat_jobs_to_trigger( heartbeat_sensor_db_table, heartbeat_sensor_control_table_df ) heartbeat_sensor_table_df = ExecEnv.SESSION.table(heartbeat_sensor_db_table) final_df: Optional[DataFrame] = None for row in jobs_to_trigger: run_id = None exception = None heartbeat_anchor_job_records_df = self.get_anchor_job_record( heartbeat_sensor_table_df, row["trigger_job_id"], sensor_source ) if heartbeat_anchor_job_records_df.take(1): run_id, exception = SensorJobRunManager.run_job( row["trigger_job_id"], self.spec.token, self.spec.domain ) if exception is None and run_id is not None: status_df = ( heartbeat_sensor_table_df.filter( (col("trigger_job_id") == row["trigger_job_id"]) ) .withColumn("job_start_timestamp", current_timestamp()) .withColumn("status", lit(HeartbeatStatus.IN_PROGRESS.value)) .withColumn("status_change_timestamp", current_timestamp()) ) final_df = final_df.union(status_df) if final_df else status_df if final_df is not None: in_progress_merge_condition = """target.sensor_id = src.sensor_id AND target.trigger_job_id = src.trigger_job_id AND target.sensor_source = src.sensor_source""" self.update_heartbeat_control_table( heartbeat_sensor_delta_table, final_df, in_progress_merge_condition ) @classmethod def _read_heartbeat_sensor_data_feed_csv( cls, heartbeat_sensor_data_feed_path: str ) -> DataFrame: """Get rows to insert or delete in heartbeat_sensor table. It reads the CSV file stored from the `heartbeat_sensor_data_feed_path` and perform UPSERT and DELETE in control table. - **heartbeat_sensor_data_feed_path**: path where CSV file is stored. 
""" data_feed_csv_df = ( ExecEnv.SESSION.read.format("csv") .option("header", True) .load(heartbeat_sensor_data_feed_path) ) data_feed_csv_df = data_feed_csv_df.withColumn( "job_state", upper(col("job_state")) ) return data_feed_csv_df @classmethod def merge_control_table_data_feed_records( cls, heartbeat_sensor_control_table: str, heartbeat_sensor_data_feed_csv_df: DataFrame, ) -> None: """Perform merge operation based on the condition. It reads the CSV file stored at `heartbeat_sensor_data_feed_path` folder and perform UPSERT and DELETE in control table. - **heartbeat_sensor_control_table**: Heartbeat sensor control table. - **heartbeat_sensor_data_feed_csv_df**: Dataframe after reading CSV file. """ delta_table = DeltaTable.forName( ExecEnv.SESSION, heartbeat_sensor_control_table ) delta_table.alias("trgt").merge( heartbeat_sensor_data_feed_csv_df.alias("source"), ( """source.sensor_id = trgt.sensor_id and trgt.trigger_job_id = source.trigger_job_id""" ), ).whenNotMatchedInsert( values={ "sensor_source": "source.sensor_source", "sensor_id": "source.sensor_id", "sensor_read_type": "source.sensor_read_type", "asset_description": "source.asset_description", "upstream_key": "source.upstream_key", "preprocess_query": "source.preprocess_query", "latest_event_fetched_timestamp": "null", "trigger_job_id": "source.trigger_job_id", "trigger_job_name": "source.trigger_job_name", "status": "null", "status_change_timestamp": "null", "job_start_timestamp": "null", "job_end_timestamp": "null", "job_state": "source.job_state", "dependency_flag": "source.dependency_flag", } ).whenMatchedUpdate( set={ "sensor_source": "source.sensor_source", "sensor_id": "source.sensor_id", "sensor_read_type": "source.sensor_read_type", "asset_description": "source.asset_description", "upstream_key": "source.upstream_key", "preprocess_query": "source.preprocess_query", "latest_event_fetched_timestamp": "trgt.latest_event_fetched_timestamp", "trigger_job_id": "source.trigger_job_id", 
"trigger_job_name": "source.trigger_job_name", "status": "trgt.status", "status_change_timestamp": "trgt.status_change_timestamp", "job_start_timestamp": "trgt.job_start_timestamp", "job_end_timestamp": "trgt.job_end_timestamp", "job_state": "source.job_state", "dependency_flag": "source.dependency_flag", } ).whenNotMatchedBySourceDelete().execute() @classmethod def heartbeat_sensor_control_table_data_feed( cls, heartbeat_sensor_data_feed_path: str, heartbeat_sensor_control_table: str, ) -> None: """Control table Data feeder. It reads the CSV file stored at `heartbeat_sensor_data_feed_path` and perform UPSERT and DELETE in control table. - **heartbeat_sensor_data_feed_path**: path where CSV file is stored. - **heartbeat_sensor_control_table**: CONTROL table of Heartbeat sensor. """ heartbeat_sensor_data_feed_csv_df = cls._read_heartbeat_sensor_data_feed_csv( heartbeat_sensor_data_feed_path ) cls.merge_control_table_data_feed_records( heartbeat_sensor_control_table, heartbeat_sensor_data_feed_csv_df ) @classmethod def update_sensor_processed_status( cls, sensor_table: str, job_id_filter_control_table_df: DataFrame, ) -> None: """UPDATE sensor PROCESSED_NEW_DATA status. Update sensor control table with PROCESSED_NEW_DATA status and status_change_timestamp for the triggered job. Args: sensor_table: lakehouse engine sensor table name. job_id_filter_control_table_df: Job Id filtered Heartbeat sensor control table dataframe. 
""" sensor_id_df = job_id_filter_control_table_df.withColumn( "sensor_table_sensor_id", concat_ws( "_", regexp_replace(col("sensor_id"), r"[ :\.]", "_"), col("trigger_job_id"), ), ) for row in sensor_id_df.select("sensor_table_sensor_id").collect(): SensorTerminator.update_sensor_status( sensor_id=row["sensor_table_sensor_id"], control_db_table_name=sensor_table, status=SensorStatus.PROCESSED_NEW_DATA.value, assets=None, ) @classmethod def update_heartbeat_sensor_completion_status( cls, heartbeat_sensor_control_table: str, sensor_table: str, job_id: str, ) -> None: """UPDATE heartbeat sensor status. Update heartbeat sensor control table with COMPLETE status and job_end_timestamp for the triggered job. Update sensor control table with PROCESSED_NEW_DATA status and status_change_timestamp for the triggered job. Args: job_id: job_id of the running job. It will refer to trigger_job_id in Control table. sensor_table: lakehouse engine sensor table name. heartbeat_sensor_control_table: Heartbeat sensor control table. 
""" job_id_filter_control_table_df = ( ExecEnv.SESSION.table(heartbeat_sensor_control_table) .filter(col("trigger_job_id") == job_id) .withColumn("status", lit(HeartbeatStatus.COMPLETED.value)) .withColumn("status_change_timestamp", current_timestamp()) .withColumn("job_end_timestamp", current_timestamp()) ) cls.update_sensor_processed_status(sensor_table, job_id_filter_control_table_df) delta_table = DeltaTable.forName( ExecEnv.SESSION, heartbeat_sensor_control_table ) ( delta_table.alias("target") .merge( job_id_filter_control_table_df.alias("source"), ( f"""target.sensor_source = source.sensor_source and target.sensor_id = source.sensor_id and target.trigger_job_id = '{job_id}'""" ), ) .whenMatchedUpdate( set={ "target.status": "source.status", "target.status_change_timestamp": "source.status_change_timestamp", "target.job_end_timestamp": "source.job_end_timestamp", } ) .execute() ) ================================================ FILE: lakehouse_engine/algorithms/sensors/sensor.py ================================================ """Module to define Sensor algorithm behavior.""" from pyspark.sql import DataFrame from lakehouse_engine.algorithms.algorithm import Algorithm from lakehouse_engine.algorithms.exceptions import ( NoNewDataException, SensorAlreadyExistsException, ) from lakehouse_engine.core.definitions import ( SENSOR_ALLOWED_DATA_FORMATS, InputFormat, ReadType, SensorSpec, SensorStatus, ) from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.core.sensor_manager import ( SensorControlTableManager, SensorUpstreamManager, ) from lakehouse_engine.utils.logging_handler import LoggingHandler class Sensor(Algorithm): """Class representing a sensor to check if the upstream has new data.""" _LOGGER = LoggingHandler(__name__).get_logger() def __init__(self, acon: dict): """Construct Sensor instances. Args: acon: algorithm configuration. 
""" self.spec: SensorSpec = SensorSpec.create_from_acon(acon=acon) self._validate_sensor_spec() if self._check_if_sensor_already_exists(): raise SensorAlreadyExistsException( "There's already a sensor registered with same id or assets!" ) def execute(self) -> bool: """Execute the sensor.""" self._LOGGER.info(f"Starting {self.spec.input_spec.data_format} sensor...") new_data_df = SensorUpstreamManager.read_new_data(sensor_spec=self.spec) if self.spec.input_spec.read_type == ReadType.STREAMING.value: Sensor._run_streaming_sensor(sensor_spec=self.spec, new_data_df=new_data_df) elif self.spec.input_spec.read_type == ReadType.BATCH.value: Sensor._run_batch_sensor( sensor_spec=self.spec, new_data_df=new_data_df, ) has_new_data = SensorControlTableManager.check_if_sensor_has_acquired_data( self.spec.sensor_id, self.spec.control_db_table_name, ) self._LOGGER.info( f"Sensor {self.spec.sensor_id} has previously " f"acquired data? {has_new_data}" ) if self.spec.fail_on_empty_result and not has_new_data: raise NoNewDataException( f"No data was acquired by {self.spec.sensor_id} sensor." 
) return has_new_data def _check_if_sensor_already_exists(self) -> bool: """Check if sensor already exists in the table to avoid duplicates.""" row = SensorControlTableManager.read_sensor_table_data( sensor_id=self.spec.sensor_id, control_db_table_name=self.spec.control_db_table_name, ) if row and row.assets != self.spec.assets: return True else: row = SensorControlTableManager.read_sensor_table_data( assets=self.spec.assets, control_db_table_name=self.spec.control_db_table_name, ) return row is not None and row.sensor_id != self.spec.sensor_id @classmethod def _run_streaming_sensor( cls, sensor_spec: SensorSpec, new_data_df: DataFrame ) -> None: """Run sensor in streaming mode (internally runs in batch mode).""" def foreach_batch_check_new_data(df: DataFrame, batch_id: int) -> None: ExecEnv.get_for_each_batch_session(df) Sensor._run_batch_sensor( sensor_spec=sensor_spec, new_data_df=df, ) new_data_df.writeStream.trigger(availableNow=True).option( "checkpointLocation", sensor_spec.checkpoint_location ).foreachBatch(foreach_batch_check_new_data).start().awaitTermination() @classmethod def _run_batch_sensor( cls, sensor_spec: SensorSpec, new_data_df: DataFrame, ) -> None: """Run sensor in batch mode. Args: sensor_spec: sensor spec containing all sensor information. new_data_df: DataFrame possibly containing new data. """ new_data_first_row = SensorUpstreamManager.get_new_data(new_data_df) cls._LOGGER.info( f"Sensor {sensor_spec.sensor_id} has new data from upstream? " f"{new_data_first_row is not None}" ) if new_data_first_row: SensorControlTableManager.update_sensor_status( sensor_spec=sensor_spec, status=SensorStatus.ACQUIRED_NEW_DATA.value, upstream_key=( new_data_first_row.UPSTREAM_KEY if "UPSTREAM_KEY" in new_data_df.columns else None ), upstream_value=( new_data_first_row.UPSTREAM_VALUE if "UPSTREAM_VALUE" in new_data_df.columns else None ), ) cls._LOGGER.info( f"Successfully updated sensor status for sensor " f"{sensor_spec.sensor_id}..." 
def _validate_sensor_spec(self) -> None:
    """Check that the data format is known and allowed for the read type.

    Raises:
        NotImplementedError: if the data format is not an `InputFormat` value,
            or if it is not listed in `SENSOR_ALLOWED_DATA_FORMATS` for the
            configured read type.
    """
    input_spec = self.spec.input_spec

    # Guard clause: unknown formats are rejected before the lookup below.
    if not InputFormat.exists(input_spec.data_format):
        raise NotImplementedError(
            f"Data format {self.spec.input_spec.data_format} isn't implemented yet."
        )

    formats_for_read_type = SENSOR_ALLOWED_DATA_FORMATS[input_spec.read_type]
    if input_spec.data_format in formats_for_read_type:
        return

    raise NotImplementedError(
        f"A sensor has not been implemented yet for this data format or, "
        f"this data format is not available for the read_type"
        f" {self.spec.input_spec.read_type}. "
        f"Check the allowed combinations of read_type and data_formats:"
        f" {SENSOR_ALLOWED_DATA_FORMATS}"
    )
def _get_path(bucket: str, path: str) -> str:
    """Get complete path.

    For s3 path, the bucket (e.g. bucket-example) and path
    (e.g. folder1/folder2) will be filled with part of the path.
    For dbfs path, the path will have the complete path
    (dbfs:/example) and bucket as null.

    Args:
        bucket: bucket for s3 objects. None, an empty string or a
            whitespace-only string all mean "no bucket".
        path: path to access the directory of file.

    Returns:
        The complete path with or without bucket, stripped of surrounding
        whitespace.
    """
    # The documented dbfs usage passes a null bucket, so None must be accepted
    # here instead of failing on `None.strip()`.
    if bucket and bucket.strip():
        return f"s3://{bucket}/{path}".strip()
    return path.strip()
@staticmethod
def _copy_objects(
    source_bucket: str,
    source_object: str,
    destination_bucket: str,
    destination_object: str,
) -> None:
    """Copy an object or 'directory' through the Databricks file system utils.

    Both locations are resolved with `_get_path`. Any failure is logged and
    re-raised.

    Args:
        source_bucket: name of bucket to perform the copy.
        source_object: object/folder to be copied.
        destination_bucket: name of the target bucket to copy.
        destination_object: target object/folder to copy.
    """
    from lakehouse_engine.core.exec_env import ExecEnv

    origin = _get_path(source_bucket, source_object)
    target = _get_path(destination_bucket, destination_object)

    DBFSFileManager._logger.info(f"Copying: {origin} to {target}")

    try:
        file_system = DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs
        # NOTE(review): the third argument is presumably the recursive flag -
        # confirm against the dbutils.fs.cp documentation.
        file_system.cp(origin, target, True)
        DBFSFileManager._logger.info(f"Copied: {origin} to {target}")
    except Exception as error:
        DBFSFileManager._logger.error(
            f"Error copying file {origin} to {target} - {error}"
        )
        raise error
""" source_bucket = self.configs["bucket"] source_object = self.configs["source_object"] destination_bucket = self.configs["destination_bucket"] destination_object = self.configs["destination_object"] dry_run = self.configs["dry_run"] if dry_run: response = _dry_run(bucket=source_bucket, object_paths=[source_object]) self._logger.info("Paths that would be moved:") self._logger.info(response) else: self._move_objects( source_bucket=source_bucket, source_object=source_object, destination_bucket=destination_bucket, destination_object=destination_object, ) @staticmethod def _move_objects( source_bucket: str, source_object: str, destination_bucket: str, destination_object: str, ) -> None: """Moves objects and 'directories'. Args: source_bucket: name of bucket to perform the move. source_object: object/folder to be moved. destination_bucket: name of the target bucket to move. destination_object: target object/folder to move. """ from lakehouse_engine.core.exec_env import ExecEnv move_from = _get_path(source_bucket, source_object) move_to = _get_path(destination_bucket, destination_object) DBFSFileManager._logger.info(f"Moving: {move_from} to {move_to}") try: DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs.mv( move_from, move_to, True ) DBFSFileManager._logger.info(f"Moved: {move_from} to {move_to}") except Exception as e: DBFSFileManager._logger.error( f"Error moving file {move_from} to {move_to} - {e}" ) raise e ================================================ FILE: lakehouse_engine/core/definitions.py ================================================ """Definitions of standard values and structures for core components.""" from dataclasses import dataclass from datetime import datetime from enum import Enum from pathlib import Path from typing import ClassVar, Collection, List, Optional, Tuple from pyspark.sql import DataFrame from pyspark.sql.types import ( ArrayType, BooleanType, StringType, StructField, StructType, TimestampType, ) from lakehouse_engine.io.exceptions 
class CollectEngineUsage(Enum):
    """Switch controlling the collection of Lakehouse Engine usage stats.

    Members:
        ENABLED: usage statistics are collected and stored for any
            environment.
        PROD_ONLY: usage statistics are collected and stored for the
            production environment only.
        DISABLED: usage statistics are not collected nor stored, for all
            environments.
    """

    ENABLED = "enabled"
    PROD_ONLY = "prod_only"
    DISABLED = "disabled"
""" dq_bucket: Optional[str] = None dq_dev_bucket: Optional[str] = None notif_disallowed_email_servers: Optional[list] = None engine_usage_path: Optional[str] = None engine_dev_usage_path: Optional[str] = None collect_engine_usage: str = CollectEngineUsage.ENABLED.value dq_functions_column_list: Optional[list] = None dq_result_sink_columns_to_delete: Optional[list] = None sharepoint_authority: Optional[str] = None sharepoint_company_domain: Optional[str] = None sharepoint_api_domain: Optional[str] = None raise_on_config_not_available: bool = False prod_catalog: Optional[str] = None environment: Optional[str] = None class EngineStats(object): """Definitions for collection of Lakehouse Engine Stats. !!! note whenever the value comes from a key inside a Spark Config that returns an array, it can be specified with a '#' so that it is adequately processed. """ CLUSTER_USAGE_TAGS = "spark.databricks.clusterUsageTags" DEF_SPARK_CONFS = { "dp_name": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#accountName", "environment": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#environment", "workspace_id": f"{CLUSTER_USAGE_TAGS}.orgId", "job_id": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#JobId", "job_name": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#RunName", "run_id": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#ClusterName", } DEF_DATABRICKS_CONTEXT_KEYS = { "environment": "environment", "dp_name": "jobName", "run_id": "runId", "job_id": "jobId", "job_name": "jobName", "workspace_id": "workspaceId", "policy_id": "usagePolicyId", } class InputFormat(Enum): """Formats of algorithm input.""" JDBC = "jdbc" AVRO = "avro" JSON = "json" CSV = "csv" PARQUET = "parquet" DELTAFILES = "delta" CLOUDFILES = "cloudfiles" KAFKA = "kafka" SQL = "sql" SAP_BW = "sap_bw" SAP_B4 = "sap_b4" DATAFRAME = "dataframe" SFTP = "sftp" SHAREPOINT = "sharepoint" @classmethod def values(cls): # type: ignore """Generates a list containing all enum values. Returns: A list with all enum values. 
""" return (c.value for c in cls) @classmethod def exists(cls, input_format: str) -> bool: """Checks if the input format exists in the enum values. Args: input_format: format to check if exists. Returns: If the input format exists in our enum. """ return input_format in cls.values() # Formats of input that are considered files. FILE_INPUT_FORMATS = [ InputFormat.AVRO.value, InputFormat.JSON.value, InputFormat.PARQUET.value, InputFormat.CSV.value, InputFormat.DELTAFILES.value, InputFormat.CLOUDFILES.value, ] SHAREPOINT_SUPPORTED_EXTENSIONS = {".csv", ".xlsx"} @dataclass class SharepointFile: """Represents a file from Sharepoint with metadata and optional content.""" file_name: str time_created: str time_modified: str content: Optional[bytes] = None _folder: Optional[str] = None skip_rename: bool = False _already_archived: bool = False @property def file_extension(self) -> str: """Returns the file extension of the stored file.""" return Path(self.file_name).suffix @property def file_path(self) -> str: """Full Sharepoint path including folder and file name.""" if not self._folder: raise AttributeError("file_path unavailable; _folder not set.") return f"{self._folder}/{self.file_name}" @property def is_csv(self) -> bool: """True if file is a CSV.""" return self.file_extension.lower() == ".csv" @property def is_excel(self) -> bool: """True if file is an Excel file.""" return self.file_extension.lower() == ".xlsx" @property def content_size(self) -> int: """Size of content in bytes.""" return len(self.content) if self.content else 0 @dataclass class SharepointOptions(object): """Options for Sharepoint I/O (used by both reader and writer). This dataclass is shared by the Sharepoint reader and writer. Some fields are required/used only in *read* mode, others only in *write* mode. Use `validate_for_reader()` / `validate_for_writer()` to enforce the correct subsets. Common (reader & writer): - client_id (str): Azure AD application (client) ID. 
- tenant_id (str): Azure AD tenant (directory) ID. - site_name (str): Sharepoint site name. - drive_name (str): Document library/drive name. - secret (str): Client secret. - local_path (str): Local/volume path for staging (read/write temp). - api_version (str): Microsoft Graph API version (default: "v1.0"). - conflict_behaviour (Optional[str]): e.g. 'replace', 'fail'. - allowed_extensions (Optional[Collection[str]]): Defaults to SHAREPOINT_SUPPORTED_EXTENSIONS {".csv", ".xlsx"}. Reader-specific: - folder_relative_path (Optional[str]): Folder (or full file path) to read from. - file_name (Optional[str]): Name of a single file inside the folder to read. If `folder_relative_path` already points to a file, `file_name` must be None. - file_type (Optional[str]): "csv" or "xlsx" when reading a folder. - file_pattern (Optional[str]): Glob (e.g. '*.csv') when reading a folder. - local_options (Optional[dict]): Spark CSV read options (e.g. header, sep). - chunk_size (Optional[int]): Download chunk size (bytes). Writer-specific: - file_name (Optional[str]): Target file name to upload. - local_options (Optional[dict]): Spark CSV write options. - chunk_size (Optional[int]): Upload chunk size (bytes). Archiving (reader): - archive_enabled (bool): Whether to move files after a successful/failed read. Default: True. - archive_success_subfolder (Optional[str]): Success folder (default "done"). Set None to keep in place. - archive_error_subfolder (Optional[str]): Error folder (default "error"). Set None to keep in place. 
""" # Common client_id: str tenant_id: str site_name: str drive_name: str secret: str local_path: str file_name: Optional[str] = None # used by reader (optional) and writer (target) api_version: str = "v1.0" conflict_behaviour: Optional[str] = None allowed_extensions: Optional[Collection[str]] = None # Reader file_type: Optional[str] = None folder_relative_path: Optional[str] = None file_pattern: Optional[str] = None chunk_size: Optional[int] = 100 * 1024 * 1024 # 100 MB (read & write) local_options: Optional[dict] = None # (read & write) # Reader archiving archive_enabled: bool = True archive_success_subfolder: Optional[str] = "done" archive_error_subfolder: Optional[str] = "error" REQUIRED_READER_OPTS: ClassVar[Tuple[str, ...]] = ( "site_name", "drive_name", "folder_relative_path", ) REQUIRED_WRITER_OPTS: ClassVar[Tuple[str, ...]] = ( "site_name", "drive_name", "local_path", ) def __post_init__(self) -> None: """Normalize and validate Sharepoint options (types, extensions, etc).""" allowed_extensions = self._get_allowed_extensions() allowed_file_types = {extension.lstrip(".") for extension in allowed_extensions} self._validate_file_type(allowed_file_types) self._normalize_folder_relative_path() self._validate_folder_relative_path_extension_if_looks_like_file( allowed_extensions ) self._validate_single_file_mode_constraints_if_folder_is_file_path( allowed_extensions ) self._validate_file_name_and_file_pattern_are_not_both_set() def _get_allowed_extensions(self) -> set[str]: """Return the supported file extensions (lowercased).""" return { extension.lower() for extension in ( self.allowed_extensions or SHAREPOINT_SUPPORTED_EXTENSIONS ) } def _validate_file_type(self, allowed_file_types: set[str]) -> None: """Validate that `file_type` is supported when provided.""" if not self.file_type: return if self.file_type.lower() not in allowed_file_types: raise ValueError( f"`file_type` must be one of {sorted(allowed_file_types)}. 
" f"Got: '{self.file_type}'" ) def _normalize_folder_relative_path(self) -> None: """Strip leading and trailing slashes from `folder_relative_path`.""" if self.folder_relative_path: self.folder_relative_path = self.folder_relative_path.strip("/") def _ends_with_supported_extension( self, path_value: str, allowed_extensions: set[str], ) -> bool: """Return True if the path ends with any supported extension.""" lowered_path_value = path_value.lower() return any( lowered_path_value.endswith(extension) for extension in allowed_extensions ) def _validate_single_file_mode_constraints_if_folder_is_file_path( self, allowed_extensions: set[str], ) -> None: """Forbid file name, pattern, and type when folder_relative_path end is file.""" if not self.folder_relative_path: return if not self._ends_with_supported_extension( self.folder_relative_path, allowed_extensions ): return if self.file_name: raise ValueError( "When `folder_relative_path` points to a file, `file_name` must " "be None." ) if self.file_pattern: raise ValueError( "When `folder_relative_path` points to a file, `file_pattern` must " "be None." ) if self.file_type: raise ValueError( "When `folder_relative_path` points to a file, `file_type` must " "be None (it's derived from file_path extension)" ) def _validate_file_name_extension(self, allowed_extensions: set[str]) -> None: """Validate that `file_name` ends with a supported extension when provided.""" if not self.file_name: return if not self._ends_with_supported_extension(self.file_name, allowed_extensions): raise ValueError( f"`file_name` must end with one of {sorted(allowed_extensions)}," f" got: {self.file_name}" ) def _validate_file_name_and_file_pattern_are_not_both_set(self) -> None: """Validate that `file_name` and `file_pattern` are not both set.""" if self.file_name and self.file_pattern: raise ValueError( "Conflicting options: provide either `file_name` or `file_pattern`" ", not both." 
def validate_for_reader(self) -> None:
    """Validate Sharepoint options required for reading.

    Raises:
        InputNotFoundException: if any option listed in
            `REQUIRED_READER_OPTS` is missing or empty.
        ValueError: if `file_name` is set and does not end with one of the
            allowed extensions.
    """
    missing = [opt for opt in self.REQUIRED_READER_OPTS if not getattr(self, opt)]
    if missing:
        raise InputNotFoundException(
            f"Missing required Sharepoint options for reader: {', '.join(missing)}"
        )

    allowed_extensions = self._get_allowed_extensions()
    if self.file_name and not self._ends_with_supported_extension(
        self.file_name, allowed_extensions
    ):
        # Both fragments must be f-strings; otherwise the message shows the
        # literal placeholder instead of the offending file name.
        raise ValueError(
            f"`file_name` must end with one of {sorted(allowed_extensions)}, "
            f"got: {self.file_name}"
        )
""" return (c.value for c in cls) @classmethod def exists(cls, output_format: str) -> bool: """Checks if the output format exists in the enum values. Args: output_format: format to check if exists. Returns: If the output format exists in our enum. """ return output_format in cls.values() # Formats of output that are considered files. FILE_OUTPUT_FORMATS = [ OutputFormat.AVRO.value, OutputFormat.JSON.value, OutputFormat.PARQUET.value, OutputFormat.CSV.value, OutputFormat.DELTAFILES.value, ] class NotifierType(Enum): """Type of notifier available.""" EMAIL = "email" class NotificationRuntimeParameters(Enum): """Parameters to be replaced in runtime.""" DATABRICKS_JOB_NAME = "databricks_job_name" DATABRICKS_WORKSPACE_ID = "databricks_workspace_id" JOB_EXCEPTION = "exception" NOTIFICATION_RUNTIME_PARAMETERS = [ NotificationRuntimeParameters.DATABRICKS_JOB_NAME.value, NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID.value, NotificationRuntimeParameters.JOB_EXCEPTION.value, ] class ReadType(Enum): """Define the types of read operations. - BATCH - read the data in batch mode (e.g., Spark batch). - STREAMING - read the data in streaming mode (e.g., Spark streaming). """ BATCH = "batch" STREAMING = "streaming" class ReadMode(Enum): """Different modes that control how we handle compliance to the provided schema. These read modes map to Spark's read modes at the moment. 
""" PERMISSIVE = "PERMISSIVE" FAILFAST = "FAILFAST" DROPMALFORMED = "DROPMALFORMED" class DQDefaults(Enum): """Defaults used on the data quality process.""" FILE_SYSTEM_STORE = "file_system" FILE_SYSTEM_S3_STORE = "s3" DQ_BATCH_IDENTIFIERS = ["spec_id", "input_id", "timestamp"] DATASOURCE_CLASS_NAME = "Datasource" DATASOURCE_EXECUTION_ENGINE = "SparkDFExecutionEngine" DATA_CONNECTORS_CLASS_NAME = "RuntimeDataConnector" DATA_CONNECTORS_MODULE_NAME = "great_expectations.datasource.data_connector" STORE_BACKEND = "s3" EXPECTATIONS_STORE_PREFIX = "dq/expectations/" VALIDATIONS_STORE_PREFIX = "dq/validations/" CHECKPOINT_STORE_PREFIX = "dq/checkpoints/" CUSTOM_EXPECTATION_LIST = [ "expect_column_values_to_be_date_not_older_than", "expect_column_pair_a_to_be_smaller_or_equal_than_b", "expect_multicolumn_column_a_must_equal_b_or_c", "expect_queried_column_agg_value_to_be", "expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b", "expect_column_pair_a_to_be_not_equal_to_b", "expect_column_values_to_not_be_null_or_empty_string", ] DQ_COLUMNS_TO_KEEP_TYPES = [ "success", "run_time", "validation_results", "expectation_success", "exception_info", "meta", "run_time_year", "run_time_month", "run_time_day", "source_primary_key", "evaluated_expectations", "success_percent", "successful_expectations", "unsuccessful_expectations", "unexpected_index_list", ] DQ_VALIDATIONS_SCHEMA = StructType( [ StructField( "dq_validations", StructType( [ StructField("run_name", StringType()), StructField("run_success", BooleanType()), StructField("raised_exceptions", BooleanType()), StructField("run_row_success", BooleanType()), StructField( "dq_failure_details", ArrayType( StructType( [ StructField("expectation_type", StringType()), StructField("kwargs", StringType()), ] ), ), ), ] ), ) ] ) class WriteType(Enum): """Types of write operations.""" OVERWRITE = "overwrite" COMPLETE = "complete" APPEND = "append" UPDATE = "update" MERGE = "merge" ERROR_IF_EXISTS = "error" IGNORE_IF_EXISTS = 
"ignore" @dataclass class InputSpec(object): """Specification of an algorithm input. This is very aligned with the way the execution environment connects to the sources (e.g., spark sources). - spec_id: spec_id of the input specification - read_type: ReadType type of read operation. - data_format: format of the input. - sftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp directory. - df_name: dataframe name. - db_table: table name in the form of `.`. - location: uri that identifies from where to read data in the specified format. - sharepoint_opts: Options to apply when reading from Sharepoint. - enforce_schema_from_table: if we want to enforce the table schema or not, by providing a table name in the form of `.
`. - query: sql query to execute and return the dataframe. Use it if you do not want to read from a file system nor from a table, but rather from a sql query. - schema: dict representation of a schema of the input (e.g., Spark struct type schema). - schema_path: path to a file with a representation of a schema of the input (e.g., Spark struct type schema). - disable_dbfs_retry: optional flag to disable file storage dbfs. - with_filepath: if we want to include the path of the file that is being read. Only works with the file reader (batch and streaming modes are supported). - options: dict with other relevant options according to the execution environment (e.g., spark) possible sources. - calculate_upper_bound: when to calculate upper bound to extract from SAP BW or not. - calc_upper_bound_schema: specific schema for the calculated upper_bound. - generate_predicates: when to generate predicates to extract from SAP BW or not. - predicates_add_null: if we want to include is null on partition by predicates. - temp_view: optional name of a view to point to the input dataframe to be used to create or replace a temp view on top of the dataframe. """ spec_id: str read_type: str data_format: Optional[str] = None sftp_files_format: Optional[str] = None df_name: Optional[DataFrame] = None db_table: Optional[str] = None location: Optional[str] = None sharepoint_opts: Optional[SharepointOptions] = None query: Optional[str] = None enforce_schema_from_table: Optional[str] = None schema: Optional[dict] = None schema_path: Optional[str] = None disable_dbfs_retry: bool = False with_filepath: bool = False options: Optional[dict] = None jdbc_args: Optional[dict] = None calculate_upper_bound: bool = False calc_upper_bound_schema: Optional[str] = None generate_predicates: bool = False predicates_add_null: bool = True temp_view: Optional[str] = None def __post_init__(self) -> None: """Normalize Sharepoint options if passed as a raw dictionary. 
Args: self: Instance of the class where `sharepoint_opts` attribute may be either a dictionary or a SharepointOptions object. """ if isinstance(self.sharepoint_opts, dict): self.sharepoint_opts = SharepointOptions(**self.sharepoint_opts) @dataclass class TransformerSpec(object): """Transformer Specification, i.e., a single transformation amongst many. - function: name of the function (or callable function) to be executed. - args: (not applicable if using a callable function) dict with the arguments to pass to the function `` pairs with the name of the parameter of the function and the respective value. """ function: str args: dict @dataclass class TransformSpec(object): """Transformation Specification. I.e., the specification that defines the many transformations to be done to the data that was read. - spec_id: id of the terminate specification - input_id: id of the corresponding input specification. - transformers: list of transformers to execute. - force_streaming_foreach_batch_processing: sometimes, when using streaming, we want to force the transform to be executed in the foreachBatch function to ensure non-supported streaming operations can be properly executed. """ spec_id: str input_id: str transformers: List[TransformerSpec] force_streaming_foreach_batch_processing: bool = False class DQType(Enum): """Available data quality tasks.""" VALIDATOR = "validator" PRISMA = "prisma" class DQResultFormat(Enum): """Available data quality result formats.""" COMPLETE = "COMPLETE" class DQExecutionPoint(Enum): """Available data quality execution points.""" IN_MOTION = "in_motion" AT_REST = "at_rest" class DQTableBaseParameters(Enum): """Base parameters for importing DQ rules from a table.""" PRISMA_BASE_PARAMETERS = ["arguments", "dq_tech_function"] @dataclass class DQFunctionSpec(object): """Defines a data quality function specification. - function - name of the data quality function (expectation) to execute. 
@dataclass
class DQSpec(object):
    """Data quality overall specification.

    - spec_id - id of the specification.
    - input_id - id of the input specification.
    - dq_type - type of DQ process to execute (e.g. validator).
    - dq_functions - list of function specifications to execute.
    - dq_db_table - name of table to derive the dq functions from.
    - dq_table_table_filter - name of the table which rules are to be applied in the
        validations (Only used when deriving dq functions).
    - dq_table_extra_filters - extra filters to be used when deriving dq functions.
        This is a sql expression to be applied to the dq_db_table.
    - execution_point - execution point of the dq functions. [at_rest, in_motion].
        This is set during the load_data or dq_validator functions.
    - unexpected_rows_pk - the list of columns composing the primary key of the
        source data to identify the rows failing the DQ validations. Note: only one
        of tbl_to_derive_pk or unexpected_rows_pk arguments need to be provided. It
        is mandatory to provide one of these arguments when using tag_source_data
        as True. When tag_source_data is False, this is not mandatory, but still
        recommended.
    - tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.
        Note: only one of tbl_to_derive_pk or unexpected_rows_pk arguments need to
        be provided. It is mandatory to provide one of these arguments when using
        tag_source_data as True. When tag_source_data is False, this is not
        mandatory, but still recommended.
    - gx_result_format - great expectations result format. Default: "COMPLETE".
    - tag_source_data - when set to true, this will ensure that the DQ process ends by
        tagging the source data with an additional column with information about the
        DQ results. This column makes it possible to identify if the DQ run
        succeeded in general and, if not, it unlocks the insights to know what
        specific rows have made the DQ validations fail and why. Default: False.
        Note: it only works if result_sink_explode is True, gx_result_format is
        COMPLETE, fail_on_error is False (which is done automatically when
        you specify tag_source_data as True) and tbl_to_derive_pk or
        unexpected_rows_pk is configured.
    - store_backend - which store_backend to use (e.g. s3 or file_system).
    - local_fs_root_dir - path of the root directory. Note: only applicable for
        store_backend file_system.
    - bucket - the bucket name to consider for the store_backend (store DQ artefacts).
        Note: only applicable for store_backend s3.
    - expectations_store_prefix - prefix where to store expectations' data. Note: only
        applicable for store_backend s3.
    - validations_store_prefix - prefix where to store validations' data. Note: only
        applicable for store_backend s3.
    - checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only
        applicable for store_backend s3.
    - data_asset_name - name of the data asset to consider when configuring the great
        expectations' data source.
    - expectation_suite_name - name to consider for great expectations' suite.
    - result_sink_db_table - db.table_name indicating the database and table in which
        to save the results of the DQ process.
    - result_sink_location - file system location in which to save the results of the
        DQ process.
    - result_sink_chunk_size - number of records per chunk when writing the results of
        the DQ process. Default: 1000000 records.
    - processed_keys_location - file system location where the keys processed by the
        DQ Process are saved. This is specifically used when the DQ Type is PRISMA.
        Note that this location is always constructed during the process, so any
        value defined in the configuration will be overwritten.
    - data_product_name - name of the data product.
    - result_sink_partitions - the list of partitions to consider.
    - result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
    - result_sink_options - extra spark options for configuring the result sink.
        E.g: can be used to configure a Kafka sink if result_sink_format is kafka.
    - result_sink_explode - flag to determine if the output table/location should have
        the columns exploded (as True) or not (as False). Default: True.
    - result_sink_extra_columns - list of extra columns to be exploded (following
        the pattern ".*") or columns to be selected. It is only used when
        result_sink_explode is set to True.
    - source - name of data source, to be easier to identify in analysis. If not
        specified, it is set to a default value (NOTE(review): the placeholder
        naming that default is missing from this docstring - confirm and restore).
        This will be only used when result_sink_explode is set to True.
    - fail_on_error - whether to fail the algorithm if the validations of your data in
        the DQ process failed.
    - cache_df - whether to cache the dataframe before running the DQ process or not.
    - critical_functions - functions that should not fail. When this argument is
        defined, fail_on_error is nullified.
    - max_percentage_failure - percentage of failure that should be allowed.
        This argument has priority over both fail_on_error and critical_functions.
    - enable_row_condition - flag to determine if the row_conditions should be
        enabled or not. row_conditions allow you to filter the rows that are
        processed by the DQ functions. This is useful when you want to run the
        DQ functions only on a subset of the data. Default: False.
        Note: When using PRISMA, if you enable this flag, bear in mind that the
        number of processed keys will be numerically different from the evaluated
        keys. This happens because the row_conditions limit the number of rows
        that are processed by the DQ functions, but we consider processed keys
        as all the keys that are passed to the dq_spec.
    """

    spec_id: str
    input_id: str
    dq_type: str
    dq_functions: Optional[List[DQFunctionSpec]] = None
    dq_db_table: Optional[str] = None
    dq_table_table_filter: Optional[str] = None
    dq_table_extra_filters: Optional[str] = None
    execution_point: Optional[str] = None
    unexpected_rows_pk: Optional[List[str]] = None
    tbl_to_derive_pk: Optional[str] = None
    gx_result_format: Optional[str] = DQResultFormat.COMPLETE.value
    tag_source_data: Optional[bool] = False
    store_backend: str = DQDefaults.STORE_BACKEND.value
    local_fs_root_dir: Optional[str] = None
    bucket: Optional[str] = None
    expectations_store_prefix: str = DQDefaults.EXPECTATIONS_STORE_PREFIX.value
    validations_store_prefix: str = DQDefaults.VALIDATIONS_STORE_PREFIX.value
    checkpoint_store_prefix: str = DQDefaults.CHECKPOINT_STORE_PREFIX.value
    data_asset_name: Optional[str] = None
    expectation_suite_name: Optional[str] = None
    result_sink_db_table: Optional[str] = None
    result_sink_location: Optional[str] = None
    result_sink_chunk_size: Optional[int] = 1000000
    processed_keys_location: Optional[str] = None
    data_product_name: Optional[str] = None
    result_sink_partitions: Optional[List[str]] = None
    result_sink_format: str = OutputFormat.DELTAFILES.value
    result_sink_options: Optional[dict] = None
    result_sink_explode: bool = True
    result_sink_extra_columns: Optional[List[str]] = None
    source: Optional[str] = None
    fail_on_error: bool = True
    cache_df: bool = False
    critical_functions: Optional[List[DQFunctionSpec]] = None
    max_percentage_failure: Optional[float] = None
    enable_row_condition: bool = False
- update_predicate: predicate to apply to the update operation. - insert_predicate: predicate to apply to the insert operation. - update_column_set: rules to apply to the update operation which allows to set the value for each column to be updated. (e.g. {"data": "new.data", "count": "current.count + 1"} ) - insert_column_set: rules to apply to the insert operation which allows to set the value for each column to be inserted. (e.g. {"date": "updates.date", "count": "1"} ) """ merge_predicate: str insert_only: bool = False delete_predicate: Optional[str] = None update_predicate: Optional[str] = None insert_predicate: Optional[str] = None update_column_set: Optional[dict] = None insert_column_set: Optional[dict] = None @dataclass class OutputSpec(object): """Specification of an algorithm output. This is very aligned with the way the execution environment connects to the output systems (e.g., spark outputs). - spec_id: id of the output specification. - input_id: id of the corresponding input specification. - write_type: type of write operation. - data_format: format of the output. Defaults to DELTA. - db_table: table name in the form of `.
`. - location: uri that identifies from where to write data in the specified format. - sharepoint_opts: options to apply on writing on Sharepoint operations. - partitions: list of partition input_col names. - merge_opts: options to apply to the merge operation. - streaming_micro_batch_transformers: transformers to invoke for each streaming micro batch, before writing (i.e., in Spark's foreachBatch structured streaming function). Note: the lakehouse engine manages this for you, so you don't have to manually specify streaming transformations here, so we don't advise you to manually specify transformations through this parameter. Supply them as regular transformers in the transform_specs sections of an ACON. - streaming_once: if the streaming query is to be executed just once, or not, generating just one micro batch. - streaming_processing_time: if streaming query is to be kept alive, this indicates the processing time of each micro batch. - streaming_available_now: if set to True, set a trigger that processes all available data in multiple batches then terminates the query. When using streaming, this is the default trigger that the lakehouse-engine will use, unless you configure a different one. - streaming_continuous: set a trigger that runs a continuous query with a given checkpoint interval. - streaming_await_termination: whether to wait (True) for the termination of the streaming query (e.g. timeout or exception) or not (False). Default: True. - streaming_await_termination_timeout: a timeout to set to the streaming_await_termination. Default: None. - with_batch_id: whether to include the streaming batch id in the final data, or not. It only takes effect in streaming mode. - options: dict with other relevant options according to the execution environment (e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for streaming, etc. 
@dataclass
class ReconciliatorSpec(object):
    """Reconciliator Specification.

    - metrics: list of metrics in the form of:
        [{
            metric: name of the column present in both truth and current datasets,
            aggregation: sum, avg, max, min, ...,
            type: percentage or absolute,
            yellow: value,
            red: value
        }].
    - recon_type: reconciliation type (percentage or absolute). Percentage calculates
        the difference between truth and current results as a percentage
        ((x - y) / x), and absolute calculates the raw difference (x - y).
        NOTE(review): this dataclass declares no `recon_type` attribute; the type
        appears to be supplied per metric through the `type` key described above -
        confirm against the reconciliator algorithm.
    - truth_input_spec: input specification of the truth data.
    - current_input_spec: input specification of the current results data.
    - truth_preprocess_query: additional query on top of the truth input data to
        preprocess the truth data before it gets fueled into the reconciliation
        process. Important note: you need to assume that the data out of
        the truth_input_spec is referencable by a table called 'truth'.
    - truth_preprocess_query_args: optional list of dicts having the
        functions/transformations to apply on top of the truth_preprocess_query
        and respective arguments. Note: cache is being applied on the Dataframe,
        by default. For turning the default behavior off, pass
        `"truth_preprocess_query_args": []`.
    - current_preprocess_query: additional query on top of the current results input
        data to preprocess the current results data before it gets fueled into the
        reconciliation process. Important note: you need to assume that the data out
        of the current_results_input_spec is referencable by a table called
        'current'.
    - current_preprocess_query_args: optional list of dicts having the
        functions/transformations to apply on top of the current_preprocess_query
        and respective arguments. Note: cache is being applied on the Dataframe,
        by default. For turning the default behavior off, pass
        `"current_preprocess_query_args": []`.
    - ignore_empty_df: optional boolean to ignore the recon process when the source
        and target dataframes are empty; in that case the recon exits with a
        success code (passed). Default: False.
    """

    metrics: List[dict]
    truth_input_spec: InputSpec
    current_input_spec: InputSpec
    truth_preprocess_query: Optional[str] = None
    truth_preprocess_query_args: Optional[List[dict]] = None
    current_preprocess_query: Optional[str] = None
    current_preprocess_query_args: Optional[List[dict]] = None
    ignore_empty_df: Optional[bool] = False
@dataclass
class SensorSpec(object):
    """Specification of a sensor that checks an upstream for new data.

    - sensor_id: identifier of the sensor.
    - assets: list of assets considered available for downstream consumption once
        the sensor reaches the status PROCESSED_NEW_DATA.
    - control_db_table_name: db.table in which the sensor metadata is stored.
    - input_spec: input specification of the source checked for new data.
    - preprocess_query: SQL query used to transform/filter the upstream result.
        The sensor input must be referred to as 'new_data'.
        E.g.: "SELECT dummy_col FROM new_data WHERE ..."
    - checkpoint_location: optional location where checkpoints are stored so the
        sensor can resume from them (same strategy as Spark checkpoints). For Spark
        readers without checkpoint support, filter the upstream result through
        preprocess_query instead.
    - fail_on_empty_result: whether the sensor should raise an error when the
        upstream has no new data. Default: True.
    """

    sensor_id: str
    assets: List[str]
    control_db_table_name: str
    input_spec: InputSpec
    preprocess_query: Optional[str]
    checkpoint_location: Optional[str]
    fail_on_empty_result: bool = True

    @classmethod
    def create_from_acon(cls, acon: dict):  # type: ignore
        """Build a SensorSpec out of a sensor ACON.

        When the ACON carries a non-empty `base_checkpoint_location`, the sensor
        specific checkpoint path is derived from it by appending
        `lakehouse_engine/sensors/` followed by the sensor id; otherwise the value
        found in the ACON (or None when absent) is passed through untouched.

        Args:
            acon: sensor ACON.

        Returns:
            The SensorSpec described by the ACON.
        """
        base_location = acon.get("base_checkpoint_location")
        if not base_location:
            # Nothing to derive: keep whatever falsy value the ACON provided.
            sensor_checkpoint = base_location
        else:
            # Drop trailing slashes so the joined path has no doubled separator.
            trimmed_base = base_location.rstrip("/")
            sensor_checkpoint = (
                f"{trimmed_base}/lakehouse_engine/" f"sensors/{acon['sensor_id']}"
            )

        spec_kwargs = {
            "sensor_id": acon["sensor_id"],
            "assets": acon["assets"],
            "control_db_table_name": acon["control_db_table_name"],
            "input_spec": InputSpec(**acon["input_spec"]),
            "preprocess_query": acon.get("preprocess_query"),
            "checkpoint_location": sensor_checkpoint,
            "fail_on_empty_result": acon.get("fail_on_empty_result", True),
        }
        return cls(**spec_kwargs)
class GABStartOfWeek(Enum):
    """Start of week options supported by GAB."""

    SUNDAY = "S"
    MONDAY = "M"

    @classmethod
    def get_start_of_week(cls) -> dict:
        """Expose the start of week options as a dictionary.

        Returns:
            dict with one `{name: value}` entry per enum member, in declaration
            order.
        """
        week_starts = {}
        for option in cls:
            week_starts[option.name] = option.value
        return week_starts

    @classmethod
    def get_values(cls) -> set[str]:
        """Expose the start of week values as a set.

        Returns:
            set holding the value of every enum member.
        """
        return set(cls.get_start_of_week().values())
class GABCadence(Enum):
    """Representation of the supported cadences on GAB.

    The member value is the rank of the cadence, from the finest (DAY) to the
    coarsest (YEAR) granularity.
    """

    DAY = 1
    WEEK = 2
    MONTH = 3
    QUARTER = 4
    YEAR = 5

    @classmethod
    def get_ordered_cadences(cls) -> dict:
        """Get the cadences ordered by the value.

        Returns:
            dict containing ordered cadences as `{name:value}`.
        """
        return {
            cadence.name: cadence.value
            for cadence in sorted(cls, key=lambda gab_cadence: gab_cadence.value)
        }

    @classmethod
    def get_cadences(cls) -> set[str]:
        """Get the cadence names as set.

        Returns:
            set containing all possible cadence names as `{name}`.
        """
        return {cadence.name for cadence in cls}

    @classmethod
    def order_cadences(cls, cadences_to_order: list[str]) -> list[str]:
        """Order a list of cadence names by the cadence value.

        Args:
            cadences_to_order: cadence names (e.g. `["YEAR", "DAY"]`) to order.

        Returns:
            new list containing the received cadences ordered from the lowest to
            the highest cadence value.
        """
        # Build the name -> value lookup once instead of once per compared item.
        cadence_rank = cls.get_ordered_cadences()
        # Names that are not a cadence rank as None, exactly as before, so sorting
        # them together with valid names still raises a TypeError.
        return sorted(
            cadences_to_order,
            key=lambda item: cadence_rank.get(item),  # type: ignore
        )
""" _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE = ( "date(date_trunc('${cad}',${date_column}))" ) _DEFAULT_PROJECT_START = "df_cal.cadence_start_date" _DEFAULT_PROJECT_END = "df_cal.cadence_end_date" COMBINED_CONFIGURATION = { # Combination of: # - cadence: `DAY` # - reconciliation_window: `DAY`, `WEEK`, `MONTH`, `QUARTER`, `YEAR` # - week_start: `S`, `M` # - snapshot_flag: `Y`, `N` 1: { "cadence": GABCadence.DAY.name, "recon": GABCadence.get_cadences(), "week_start": GABStartOfWeek.get_values(), "snap_flag": {"Y", "N"}, "join_select": "", "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE, "project_end": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE, }, # Combination of: # - cadence: `WEEK` # - reconciliation_window: `DAY` # - week_start: `S`, `M` # - snapshot_flag: `Y` 2: { "cadence": GABCadence.WEEK.name, "recon": GABCadence.DAY.name, "week_start": GABStartOfWeek.get_values(), "snap_flag": "Y", "join_select": """ select distinct case when '${config_week_start}' = 'Monday' then weekstart_mon when '${config_week_start}' = 'Sunday' then weekstart_sun end as cadence_start_date, calendar_date as cadence_end_date """, "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, # Combination of: # - cadence: `WEEK` # - reconciliation_window: `DAY, `MONTH`, `QUARTER`, `YEAR` # - week_start: `M` # - snapshot_flag: `Y`, `N` 3: { "cadence": GABCadence.WEEK.name, "recon": { GABCadence.DAY.name, GABCadence.MONTH.name, GABCadence.QUARTER.name, GABCadence.YEAR.name, }, "week_start": "M", "snap_flag": {"Y", "N"}, "join_select": """ select distinct case when '${config_week_start}' = 'Monday' then weekstart_mon when '${config_week_start}' = 'Sunday' then weekstart_sun end as cadence_start_date, case when '${config_week_start}' = 'Monday' then weekend_mon when '${config_week_start}' = 'Sunday' then weekend_sun end as cadence_end_date""", "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 4: { "cadence": GABCadence.MONTH.name, 
"recon": GABCadence.DAY.name, "week_start": GABStartOfWeek.get_values(), "snap_flag": "Y", "join_select": """ select distinct month_start as cadence_start_date, calendar_date as cadence_end_date """, "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 5: { "cadence": GABCadence.MONTH.name, "recon": GABCadence.WEEK.name, "week_start": GABStartOfWeek.MONDAY.value, "snap_flag": "Y", "join_select": """ select distinct month_start as cadence_start_date, case when date( date_trunc('MONTH',add_months(calendar_date, 1)) )-1 < weekend_mon then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1 else weekend_mon end as cadence_end_date""", "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 6: { "cadence": GABCadence.MONTH.name, "recon": GABCadence.WEEK.name, "week_start": GABStartOfWeek.SUNDAY.value, "snap_flag": "Y", "join_select": """ select distinct month_start as cadence_start_date, case when date( date_trunc('MONTH',add_months(calendar_date, 1)) )-1 < weekend_sun then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1 else weekend_sun end as cadence_end_date""", "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 7: { "cadence": GABCadence.MONTH.name, "recon": GABCadence.get_cadences(), "week_start": GABStartOfWeek.get_values(), "snap_flag": {"Y", "N"}, "join_select": "", "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE, "project_end": "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1", }, 8: { "cadence": GABCadence.QUARTER.name, "recon": GABCadence.DAY.name, "week_start": GABStartOfWeek.get_values(), "snap_flag": "Y", "join_select": """ select distinct quarter_start as cadence_start_date, calendar_date as cadence_end_date """, "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 9: { "cadence": GABCadence.QUARTER.name, "recon": GABCadence.WEEK.name, "week_start": GABStartOfWeek.MONDAY.value, "snap_flag": "Y", "join_select": 
""" select distinct quarter_start as cadence_start_date, case when weekend_mon > date( date_trunc('QUARTER',add_months(calendar_date, 3)) )-1 then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1 else weekend_mon end as cadence_end_date""", "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 10: { "cadence": GABCadence.QUARTER.name, "recon": GABCadence.WEEK.name, "week_start": GABStartOfWeek.SUNDAY.value, "snap_flag": "Y", "join_select": """ select distinct quarter_start as cadence_start_date, case when weekend_sun > date( date_trunc('QUARTER',add_months(calendar_date, 3)) )-1 then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1 else weekend_sun end as cadence_end_date""", "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 11: { "cadence": GABCadence.QUARTER.name, "recon": GABCadence.MONTH.name, "week_start": GABStartOfWeek.get_values(), "snap_flag": "Y", "join_select": """ select distinct quarter_start as cadence_start_date, month_end as cadence_end_date """, "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 12: { "cadence": GABCadence.QUARTER.name, "recon": GABCadence.YEAR.name, "week_start": GABStartOfWeek.get_values(), "snap_flag": "N", "join_select": "", "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE, "project_end": """ date( date_trunc( '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3) ) )-1 """, }, 13: { "cadence": GABCadence.QUARTER.name, "recon": GABCadence.get_cadences(), "week_start": GABStartOfWeek.get_values(), "snap_flag": "N", "join_select": "", "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE, "project_end": """ date( date_trunc( '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3) ) )-1 """, }, 14: { "cadence": GABCadence.YEAR.name, "recon": GABCadence.WEEK.name, "week_start": GABStartOfWeek.MONDAY.value, "snap_flag": "Y", "join_select": """ select distinct year_start as 
cadence_start_date, case when weekend_mon > date( date_trunc('YEAR',add_months(calendar_date, 12)) )-1 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1 else weekend_mon end as cadence_end_date""", "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 15: { "cadence": GABCadence.YEAR.name, "recon": GABCadence.WEEK.name, "week_start": GABStartOfWeek.SUNDAY.value, "snap_flag": "Y", "join_select": """ select distinct year_start as cadence_start_date, case when weekend_sun > date( date_trunc('YEAR',add_months(calendar_date, 12)) )-1 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1 else weekend_sun end as cadence_end_date""", "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 16: { "cadence": GABCadence.YEAR.name, "recon": GABCadence.get_cadences(), "week_start": GABStartOfWeek.get_values(), "snap_flag": "N", "inverse_flag": "Y", "join_select": "", "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE, "project_end": """ date( date_trunc( '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12) ) )-1 """, }, 17: { "cadence": GABCadence.YEAR.name, "recon": { GABCadence.DAY.name, GABCadence.MONTH.name, GABCadence.QUARTER.name, }, "week_start": GABStartOfWeek.get_values(), "snap_flag": "Y", "join_select": """ select distinct year_start as cadence_start_date, case when '${rec_cadence}' = 'DAY' then calendar_date when '${rec_cadence}' = 'MONTH' then month_end when '${rec_cadence}' = 'QUARTER' then quarter_end end as cadence_end_date """, "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, 18: { "cadence": GABCadence.get_cadences(), "recon": GABCadence.get_cadences(), "week_start": GABStartOfWeek.get_values(), "snap_flag": {"Y", "N"}, "join_select": """ select distinct case when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday' then weekstart_mon when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday' then weekstart_sun else 
date(date_trunc('${cad}',calendar_date)) end as cadence_start_date, case when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday' then weekend_mon when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday' then weekend_sun when '${cad}' = 'DAY' then date(date_trunc('${cad}',calendar_date)) when '${cad}' = 'MONTH' then date( date_trunc( 'MONTH', add_months(date(date_trunc('${cad}',calendar_date)), 1) ) )-1 when '${cad}' = 'QUARTER' then date( date_trunc( 'QUARTER', add_months(date(date_trunc('${cad}',calendar_date)) , 3) ) )-1 when '${cad}' = 'YEAR' then date( date_trunc( 'YEAR', add_months(date(date_trunc('${cad}',calendar_date)), 12) ) )-1 end as cadence_end_date """, "project_start": _DEFAULT_PROJECT_START, "project_end": _DEFAULT_PROJECT_END, }, } @dataclass class HeartbeatConfigSpec(object): """Heartbeat Configurations and control table specifications. This provides the way in which the Heartbeat can pass environment and specific quantum related config information to sensor acon. - sensor_source: specifies the source system of sensor, for e.g. sap_b4, sap_bw, delta_table, kafka, lmu_delta_table, trigger_file etc. It is also a part of heartbeat control table, Therefore it is useful for filtering out data from Heartbeat control table based on template source system. - data_format: format of the input source, e.g jdbc, delta, kafka, cloudfiles etc. - heartbeat_sensor_db_table: heartbeat control table along with database from config. - lakehouse_engine_sensor_db_table: Control table along with database(config). - options: dict with other relevant options for reading data from specified input data_format. This can vary for each source system. For e.g. For sap systems, DRIVER, URL, USERNAME, PASSWORD are required which are all being read from config file of quantum. - jdbc_db_table: schema and table name of JDBC sources. - token: token to access Databricks Job API(read from config). - domain: workspace domain url for quantum(read from config). 
class HeartbeatSensorSource(Enum):
    """Source systems a Heartbeat sensor can read from."""

    SAP_BW = "sap_bw"
    SAP_B4 = "sap_b4"
    DELTA_TABLE = "delta_table"
    KAFKA = "kafka"
    LMU_DELTA_TABLE = "lmu_delta_table"
    TRIGGER_FILE = "trigger_file"

    @classmethod
    def values(cls):  # type: ignore
        """Generates a list containing all enum values.

        A real list is returned (not a generator) so that the result can be
        iterated or membership-tested more than once, as the documented
        contract promises.

        Returns:
            A list with all enum values, in declaration order.
        """
        return [member.value for member in cls]
{"type": "str", "mandatory": False}, "advanced_parser": {"type": "bool", "mandatory": False}, }, "truncate": {"table_or_view": {"type": "str", "mandatory": True}}, "vacuum": { "table_or_view": {"type": "str", "mandatory": False}, "path": {"type": "str", "mandatory": False}, "vacuum_hours": {"type": "int", "mandatory": False}, }, "describe": {"table_or_view": {"type": "str", "mandatory": True}}, "optimize": { "table_or_view": {"type": "str", "mandatory": False}, "path": {"type": "str", "mandatory": False}, "where_clause": {"type": "str", "mandatory": False}, "optimize_zorder_col_list": {"type": "str", "mandatory": False}, }, "show_tbl_properties": {"table_or_view": {"type": "str", "mandatory": True}}, "get_tbl_pk": {"table_or_view": {"type": "str", "mandatory": True}}, "repair_table": { "table_or_view": {"type": "str", "mandatory": True}, "sync_metadata": {"type": "bool", "mandatory": True}, }, "delete_where": { "table_or_view": {"type": "str", "mandatory": True}, "where_clause": {"type": "str", "mandatory": True}, }, } FILE_MANAGER_OPERATIONS = { "delete_objects": { "bucket": {"type": "str", "mandatory": True}, "object_paths": {"type": "list", "mandatory": True}, "dry_run": {"type": "bool", "mandatory": True}, }, "copy_objects": { "bucket": {"type": "str", "mandatory": True}, "source_object": {"type": "str", "mandatory": True}, "destination_bucket": {"type": "str", "mandatory": True}, "destination_object": {"type": "str", "mandatory": True}, "dry_run": {"type": "bool", "mandatory": True}, }, "move_objects": { "bucket": {"type": "str", "mandatory": True}, "source_object": {"type": "str", "mandatory": True}, "destination_bucket": {"type": "str", "mandatory": True}, "destination_object": {"type": "str", "mandatory": True}, "dry_run": {"type": "bool", "mandatory": True}, }, "request_restore": { "bucket": {"type": "str", "mandatory": True}, "source_object": {"type": "str", "mandatory": True}, "restore_expiration": {"type": "int", "mandatory": True}, "retrieval_tier": 
{"type": "str", "mandatory": True}, "dry_run": {"type": "bool", "mandatory": True}, }, "check_restore_status": { "bucket": {"type": "str", "mandatory": True}, "source_object": {"type": "str", "mandatory": True}, }, "request_restore_to_destination_and_wait": { "bucket": {"type": "str", "mandatory": True}, "source_object": {"type": "str", "mandatory": True}, "destination_bucket": {"type": "str", "mandatory": True}, "destination_object": {"type": "str", "mandatory": True}, "restore_expiration": {"type": "int", "mandatory": True}, "retrieval_tier": {"type": "str", "mandatory": True}, "dry_run": {"type": "bool", "mandatory": True}, }, } ================================================ FILE: lakehouse_engine/core/exec_env.py ================================================ """Module to take care of creating a singleton of the execution environment class.""" from dataclasses import replace from pyspark.sql import DataFrame, SparkSession from lakehouse_engine.core.definitions import EngineConfig from lakehouse_engine.utils.configs.config_utils import ConfigUtils from lakehouse_engine.utils.databricks_utils import DatabricksUtils from lakehouse_engine.utils.logging_handler import LoggingHandler class ExecEnv(object): """Represents the basic resources regarding the engine execution environment. Currently, it is used to encapsulate both the logic to get the Spark session and the engine configurations. """ SESSION: SparkSession _LOGGER = LoggingHandler(__name__).get_logger() ENGINE_CONFIG: EngineConfig = EngineConfig(**ConfigUtils.get_config()) IS_SERVERLESS = DatabricksUtils.is_serverless_workload() @classmethod def set_default_engine_config( cls, package: str = "lakehouse_engine.configs", custom_configs_dict: dict = None, custom_configs_file_path: str = None, ) -> None: """Set default engine configurations. 
The function set the default engine configurations by reading them from a specified package and overwrite them if the user pass a dictionary or a file path with new configurations. Args: package: package where the engine default configurations can be found. custom_configs_dict: a dictionary with custom configurations to overwrite the default ones. custom_configs_file_path: path for the file with custom configurations to overwrite the default ones. """ cls.ENGINE_CONFIG = EngineConfig(**ConfigUtils.get_config(package)) if custom_configs_dict: cls.ENGINE_CONFIG = replace(cls.ENGINE_CONFIG, **custom_configs_dict) if custom_configs_file_path: cls.ENGINE_CONFIG = replace( cls.ENGINE_CONFIG, **ConfigUtils.get_config_from_file(custom_configs_file_path), ) @classmethod def get_or_create( cls, session: SparkSession = None, enable_hive_support: bool = True, app_name: str = None, config: dict = None, ) -> None: """Get or create an execution environment session (currently Spark). It instantiates a singleton session that can be accessed anywhere from the lakehouse engine. By default, if there is an existing Spark Session in the environment (getActiveSession()), this function re-uses it. It can be further extended in the future to support forcing the creation of new isolated sessions even when a Spark Session is already active. Args: session: spark session. enable_hive_support: whether to enable hive support or not. app_name: application name. config: extra spark configs to supply to the spark session. 
""" if not cls.IS_SERVERLESS: default_config = { "spark.databricks.delta.optimizeWrite.enabled": True, "spark.sql.adaptive.enabled": True, "spark.databricks.delta.merge.enableLowShuffle": True, } cls._LOGGER.info( f"Using the following default configs you may want to override them " f"for your job: {default_config}" ) else: default_config = {} final_config: dict = {**default_config, **(config if config else {})} cls._LOGGER.info(f"Final config is: {final_config}") if session: cls.SESSION = session elif SparkSession.getActiveSession(): cls.SESSION = SparkSession.getActiveSession() cls._set_spark_configs(final_config) else: cls._LOGGER.info("Creating a new Spark Session") session_builder = SparkSession.builder.appName(app_name) cls._set_spark_configs(final_config, session_builder) if enable_hive_support: session_builder = session_builder.enableHiveSupport() cls.SESSION = session_builder.getOrCreate() @classmethod def get_for_each_batch_session(cls, df: DataFrame) -> None: """Get the execution environment session for foreachBatch operations. For Spark connect scenarios, spark is not able to re-use the Spark session from an external scope as it cannot serialise it, so the session needs to be retrieved and stored again in the ExecEnv class. """ cls.SESSION = df.sparkSession.getActiveSession() @classmethod def _set_spark_configs( cls, final_config: dict, session_builder: SparkSession.Builder = None ) -> None: """Set Spark session configurations based on final_config. This method attempts to set each configuration key-value pair in the provided final_config dictionary to the Spark session. If a configuration key is not available in the current environment, it logs a warning and skips that key. Args: final_config: dictionary with spark configurations to set. session_builder: spark session builder. 
""" for key, value in final_config.items(): try: if session_builder: session_builder.config(key, value) else: cls.SESSION.conf.set(key, value) except Exception as e: if ( "[CONFIG_NOT_AVAILABLE]" in str(e) and not ExecEnv.ENGINE_CONFIG.raise_on_config_not_available ): cls._LOGGER.warning( f"Spark config '{key}' is not available in this " f"environment and will be skipped." ) else: raise e @classmethod def get_environment(cls) -> str: """Get the environment where the process is running. Returns: Name of the environment. """ if cls.ENGINE_CONFIG.environment: return cls.ENGINE_CONFIG.environment catalog = cls.SESSION.sql("SELECT current_catalog()").collect()[0][0] if catalog.lower() == cls.ENGINE_CONFIG.prod_catalog: return "prod" else: return "dev" ================================================ FILE: lakehouse_engine/core/executable.py ================================================ """Module representing an executable lakehouse engine component.""" from abc import ABC, abstractmethod from typing import Any, Optional class Executable(ABC): """Abstract class defining the behaviour of an executable component.""" @abstractmethod def execute(self) -> Optional[Any]: """Define the executable component behaviour. E.g., the behaviour of an algorithm inheriting from this. """ pass ================================================ FILE: lakehouse_engine/core/file_manager.py ================================================ """Module for abstract representation of a file manager system.""" from abc import ABC, abstractmethod from typing import Any from lakehouse_engine.algorithms.exceptions import RestoreTypeNotFoundException from lakehouse_engine.utils.storage.file_storage_functions import FileStorageFunctions class FileManager(ABC): # noqa: B024 """Abstract file manager class. {{ get_file_manager_operations() }} """ def __init__(self, configs: dict): """Construct FileManager algorithm instances. Args: configs: configurations for the FileManager algorithm. 
""" self.configs = configs self.function = self.configs["function"] @abstractmethod def delete_objects(self) -> None: """Delete objects and 'directories'. If dry_run is set to True the function will print a dict with all the paths that would be deleted based on the given keys. """ pass @abstractmethod def copy_objects(self) -> None: """Copies objects and 'directories'. If dry_run is set to True the function will print a dict with all the paths that would be copied based on the given keys. """ pass @abstractmethod def move_objects(self) -> None: """Moves objects and 'directories'. If dry_run is set to True the function will print a dict with all the paths that would be moved based on the given keys. """ pass class FileManagerFactory(ABC): # noqa: B024 """Class for file manager factory.""" @staticmethod def execute_function(configs: dict) -> Any: """Get a specific File Manager and function to execute.""" from lakehouse_engine.core.dbfs_file_manager import DBFSFileManager from lakehouse_engine.core.s3_file_manager import S3FileManager disable_dbfs_retry = ( configs["disable_dbfs_retry"] if "disable_dbfs_retry" in configs.keys() else False ) if disable_dbfs_retry: S3FileManager(configs).get_function() elif FileStorageFunctions.is_boto3_configured(): try: S3FileManager(configs).get_function() except (ValueError, NotImplementedError, RestoreTypeNotFoundException): raise except Exception: DBFSFileManager(configs).get_function() else: DBFSFileManager(configs).get_function() ================================================ FILE: lakehouse_engine/core/gab_manager.py ================================================ """Module to define GAB Manager classes.""" import calendar from datetime import datetime, timedelta from typing import Tuple, cast import pendulum from pendulum import DateTime from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import GABCadence, GABDefaults from lakehouse_engine.core.gab_sql_generator import GABViewGenerator from 
lakehouse_engine.utils.gab_utils import GABUtils from lakehouse_engine.utils.logging_handler import LoggingHandler class GABCadenceManager(object): """Class to control the GAB Cadence Window.""" _LOGGER = LoggingHandler(__name__).get_logger() def extended_window_calculator( self, cadence: str, reconciliation_cadence: str, current_date: datetime, start_date_str: str, end_date_str: str, query_type: str, rerun_flag: str, snapshot_flag: str, ) -> tuple[datetime, datetime, datetime, datetime]: """extended_window_calculator function. Calculates the extended window of any cadence despite the user providing custom dates which are not the exact start and end dates of a cadence. Args: cadence: cadence to process reconciliation_cadence: reconciliation to process. current_date: current date. start_date_str: start date of the period to process. end_date_str: end date of the period to process. query_type: use case query type. rerun_flag: flag indicating if it's a rerun or a normal run. snapshot_flag: flag indicating if for this cadence the snapshot is enabled. 
""" cad_order = GABCadence.get_ordered_cadences() derived_cadence = self._get_reconciliation_cadence( cad_order, rerun_flag, cadence, reconciliation_cadence, snapshot_flag ) self._LOGGER.info(f"cadence passed to extended window: {derived_cadence}") start_date = datetime.strptime(start_date_str, GABDefaults.DATE_FORMAT.value) end_date = datetime.strptime(end_date_str, GABDefaults.DATE_FORMAT.value) bucket_start_date, bucket_end_date = self.get_cadence_start_end_dates( cadence, derived_cadence, start_date, end_date, query_type, current_date ) self._LOGGER.info(f"bucket dates: {bucket_start_date} - {bucket_end_date}") filter_start_date, filter_end_date = self.get_cadence_start_end_dates( cadence, ( reconciliation_cadence if cad_order[cadence] < cad_order[reconciliation_cadence] else cadence ), start_date, end_date, query_type, current_date, ) self._LOGGER.info(f"filter dates: {filter_start_date} - {filter_end_date}") return bucket_start_date, bucket_end_date, filter_start_date, filter_end_date @classmethod def _get_reconciliation_cadence( cls, cadence_order: dict, rerun_flag: str, cadence: str, reconciliation_cadence: str, snapshot_flag: str, ) -> str: """Get bigger cadence when rerun_flag or snapshot. Args: cadence_order: ordered cadences. rerun_flag: flag indicating if it's a rerun or a normal run. cadence: cadence to process. reconciliation_cadence: reconciliation to process. snapshot_flag: flag indicating if for this cadence the snapshot is enabled. 
""" derived_cadence = reconciliation_cadence if rerun_flag == "Y": if cadence_order[cadence] > cadence_order[reconciliation_cadence]: derived_cadence = cadence elif cadence_order[cadence] < cadence_order[reconciliation_cadence]: derived_cadence = reconciliation_cadence else: if ( cadence_order[cadence] > cadence_order[reconciliation_cadence] and snapshot_flag == "Y" ) or (cadence_order[cadence] < cadence_order[reconciliation_cadence]): derived_cadence = reconciliation_cadence elif ( cadence_order[cadence] > cadence_order[reconciliation_cadence] and snapshot_flag == "N" ): derived_cadence = cadence return derived_cadence def get_cadence_start_end_dates( self, cadence: str, derived_cadence: str, start_date: datetime, end_date: datetime, query_type: str, current_date: datetime, ) -> tuple[datetime, datetime]: """Generate the new set of extended start and end dates based on the cadence. Running week cadence again to extend to correct week start and end date in case of recon window for Week cadence is present. For end_date 2012-12-31,in case of Quarter Recon window present for Week cadence, start and end dates are recalculated to 2022-10-01 to 2022-12-31. But these are not start and end dates of week. Hence, to correct this, new dates are passed again to get the correct dates. Args: cadence: cadence to process. derived_cadence: cadence reconciliation to process. start_date: start date of the period to process. end_date: end date of the period to process. query_type: use case query type. current_date: current date to be used in the end date, in case the end date is greater than current date so the end date should be the current date. 
""" new_start_date = self._get_cadence_calculated_date( derived_cadence=derived_cadence, base_date=start_date, is_start=True ) new_end_date = self._get_cadence_calculated_date( derived_cadence=derived_cadence, base_date=end_date, is_start=False ) if cadence.upper() == "WEEK": new_start_date = ( pendulum.datetime( int(new_start_date.strftime("%Y")), int(new_start_date.strftime("%m")), int(new_start_date.strftime("%d")), ) .start_of("week") .replace(tzinfo=None) ) new_end_date = ( pendulum.datetime( int(new_end_date.strftime("%Y")), int(new_end_date.strftime("%m")), int(new_end_date.strftime("%d")), ) .end_of("week") .replace(hour=0, minute=0, second=0, microsecond=0) .replace(tzinfo=None) ) new_end_date = new_end_date + timedelta(days=1) if new_end_date >= current_date: new_end_date = current_date if query_type == "NAM": new_end_date = new_end_date + timedelta(days=1) return new_start_date, new_end_date @classmethod def _get_cadence_calculated_date( cls, derived_cadence: str, base_date: datetime, is_start: bool ) -> datetime | DateTime: # type: ignore cadence_base_date = cls._get_cadence_base_date(derived_cadence, base_date) cadence_date_calculated: DateTime | datetime if derived_cadence.upper() == "WEEK": cadence_date_calculated = cls._get_calculated_week_date( cast(DateTime, cadence_base_date), is_start ) elif derived_cadence.upper() == "MONTH": cadence_date_calculated = cls._get_calculated_month_date( cast(datetime, cadence_base_date), is_start ) elif derived_cadence.upper() in ["QUARTER", "YEAR"]: cadence_date_calculated = cls._get_calculated_quarter_or_year_date( cast(DateTime, cadence_base_date), is_start, derived_cadence ) else: cadence_date_calculated = cadence_base_date # type: ignore return cadence_date_calculated # type: ignore @classmethod def _get_cadence_base_date( cls, derived_cadence: str, base_date: datetime ) -> datetime | DateTime | str: # type: ignore """Get start date for the selected cadence. 
Args: derived_cadence: cadence reconciliation to process. base_date: base date used to compute the start date of the cadence. """ if derived_cadence.upper() in ["DAY", "MONTH"]: cadence_date_calculated = base_date elif derived_cadence.upper() in ["WEEK", "QUARTER", "YEAR"]: cadence_date_calculated = pendulum.datetime( int(base_date.strftime("%Y")), int(base_date.strftime("%m")), int(base_date.strftime("%d")), ) else: cadence_date_calculated = "0" # type: ignore return cadence_date_calculated @classmethod def _get_calculated_week_date( cls, cadence_date_calculated: DateTime, is_start: bool ) -> DateTime: """Get WEEK start/end date. Args: cadence_date_calculated: base date to compute the week date. is_start: flag indicating if we should get the start or end for the cadence. """ if is_start: cadence_date_calculated = cadence_date_calculated.start_of("week").replace( tzinfo=None ) else: cadence_date_calculated = ( cadence_date_calculated.end_of("week") .replace(hour=0, minute=0, second=0, microsecond=0) .replace(tzinfo=None) ) return cadence_date_calculated @classmethod def _get_calculated_month_date( cls, cadence_date_calculated: datetime, is_start: bool ) -> datetime: """Get MONTH start/end date. Args: cadence_date_calculated: base date to compute the month date. is_start: flag indicating if we should get the start or end for the cadence. """ if is_start: cadence_date_calculated = cadence_date_calculated - timedelta( days=(int(cadence_date_calculated.strftime("%d")) - 1) ) else: cadence_date_calculated = datetime( int(cadence_date_calculated.strftime("%Y")), int(cadence_date_calculated.strftime("%m")), calendar.monthrange( int(cadence_date_calculated.strftime("%Y")), int(cadence_date_calculated.strftime("%m")), )[1], ) return cadence_date_calculated @classmethod def _get_calculated_quarter_or_year_date( cls, cadence_date_calculated: DateTime, is_start: bool, cadence: str ) -> DateTime: """Get QUARTER/YEAR start/end date. 
def generate_use_case_views(self) -> None:
    """Generate all the use case views.

    Reads the "recon_window" and "mappings" JSON columns of the use case from
    the lookup query builder, derives which cadences run with snapshot, and
    generates one view per entry of the mappings configuration.
    """
    recon_window_config = GABUtils.get_json_column_as_dict(
        self.lookup_query_builder, self.query_id, "recon_window"
    )
    snapshot_status_by_cadence = self._get_cadence_snapshot_status(
        recon_window_config
    )
    snapshot_cadences, non_snapshot_cadences = self._split_cadence_by_snapshot(
        snapshot_status_by_cadence
    )

    view_mappings = GABUtils.get_json_column_as_dict(
        self.lookup_query_builder, self.query_id, "mappings"
    )

    for view_name in view_mappings:
        self._generate_use_case_view(
            mappings=view_mappings,
            view_name=view_name,
            cadence_snapshot_status=snapshot_status_by_cadence,
            cadences_with_snapshot=snapshot_cadences,
            cadences_without_snapshot=non_snapshot_cadences,
            target_database=self.target_database,
            target_table=self.target_table,
            query_id=self.query_id,
        )
""" view_configuration = mappings[view_name] view_dimensions = view_configuration["dimensions"] view_metrics = view_configuration["metric"] custom_filter = view_configuration["filter"] view_filter = " " if custom_filter: view_filter = " AND " + custom_filter ( dimensions, dimensions_and_metrics, dimensions_and_metrics_with_alias, ) = cls._get_dimensions_and_metrics_from_use_case_view( view_dimensions, view_metrics ) ( final_cols, final_calculated_script, final_calculated_script_snapshot, ) = cls._get_calculated_and_derived_metrics_from_use_case_view( view_metrics, view_dimensions, cadence_snapshot_status ) GABViewGenerator( cadence_snapshot_status=cadence_snapshot_status, target_database=target_database, view_name=view_name, final_cols=final_cols, target_table=target_table, dimensions_and_metrics_with_alias=dimensions_and_metrics_with_alias, dimensions=dimensions, dimensions_and_metrics=dimensions_and_metrics, final_calculated_script=final_calculated_script, query_id=query_id, view_filter=view_filter, final_calculated_script_snapshot=final_calculated_script_snapshot, without_snapshot_cadences=cadences_without_snapshot, with_snapshot_cadences=cadences_with_snapshot, ).generate_sql() @classmethod def _get_dimensions_and_metrics_from_use_case_view( cls, view_dimensions: dict, view_metrics: dict ) -> Tuple[str, str, str]: """Get dimensions and metrics from use case. Args: view_dimensions: use case configured dimensions. view_metrics: use case configured metrics. 
""" ( extracted_dimensions_with_alias, extracted_dimensions_without_alias, ) = GABUtils.extract_columns_from_mapping( columns=view_dimensions, is_dimension=True, extract_column_without_alias=True, table_alias="a", is_extracted_value_as_name=False, ) dimensions_without_default_columns = [ extracted_dimension for extracted_dimension in extracted_dimensions_without_alias if extracted_dimension not in GABDefaults.DIMENSIONS_DEFAULT_COLUMNS.value ] dimensions = ",".join(dimensions_without_default_columns) dimensions_with_alias = ",".join(extracted_dimensions_with_alias) ( extracted_metrics_with_alias, extracted_metrics_without_alias, ) = GABUtils.extract_columns_from_mapping( columns=view_metrics, is_dimension=False, extract_column_without_alias=True, table_alias="a", is_extracted_value_as_name=False, ) metrics = ",".join(extracted_metrics_without_alias) metrics_with_alias = ",".join(extracted_metrics_with_alias) dimensions_and_metrics_with_alias = ( dimensions_with_alias + "," + metrics_with_alias ) dimensions_and_metrics = dimensions + "," + metrics return dimensions, dimensions_and_metrics, dimensions_and_metrics_with_alias @classmethod def _get_calculated_and_derived_metrics_from_use_case_view( cls, view_metrics: dict, view_dimensions: dict, cadence_snapshot_status: dict ) -> Tuple[str, str, str]: """Get calculated and derived metrics from use case. Args: view_dimensions: use case configured dimensions. view_metrics: use case configured metrics. cadence_snapshot_status: cadences to execute with the information if it has snapshot. 
""" calculated_script = [] calculated_script_snapshot = [] derived_script = [] for metric_key, metric_value in view_metrics.items(): ( calculated_metrics_script, calculated_metrics_script_snapshot, derived_metrics_script, ) = cls._get_calculated_metrics( metric_key, metric_value, view_dimensions, cadence_snapshot_status ) calculated_script += [*calculated_metrics_script] calculated_script_snapshot += [*calculated_metrics_script_snapshot] derived_script += [*derived_metrics_script] joined_calculated_script = cls._join_list_to_string_when_present( calculated_script ) joined_calculated_script_snapshot = cls._join_list_to_string_when_present( calculated_script_snapshot ) joined_derived = cls._join_list_to_string_when_present( to_join=derived_script, starting_value="*,", default_value="*" ) return ( joined_derived, joined_calculated_script, joined_calculated_script_snapshot, ) @classmethod def _join_list_to_string_when_present( cls, to_join: list[str], separator: str = ",", starting_value: str = ",", default_value: str = "", ) -> str: """Join list to string when has values, otherwise return the default value. Args: to_join: values to join. separator: separator to be used in the join. starting_value: value to be started before the join. default_value: value to be returned if the list is empty. """ return starting_value + separator.join(to_join) if to_join else default_value @classmethod def _get_cadence_snapshot_status(cls, result: dict) -> dict: cadence_snapshot_status = {} for k, v in result.items(): cadence_snapshot_status[k] = next( ( next( ( snap_list["snapshot"] for snap_list in loop_outer_cad.values() if snap_list["snapshot"] == "Y" ), "N", ) for loop_outer_cad in v.values() if v ), "N", ) return cadence_snapshot_status @classmethod def _split_cadence_by_snapshot( cls, cadence_snapshot_status: dict ) -> tuple[list[str], list[str]]: """Split cadences by the snapshot value. Args: cadence_snapshot_status: cadences to be split by snapshot status. 
""" with_snapshot_cadences = [] without_snapshot_cadences = [] for key_snap_status, value_snap_status in cadence_snapshot_status.items(): if value_snap_status == "Y": with_snapshot_cadences.append(key_snap_status) else: without_snapshot_cadences.append(key_snap_status) return with_snapshot_cadences, without_snapshot_cadences @classmethod def _get_calculated_metrics( cls, metric_key: str, metric_value: dict, view_dimensions: dict, cadence_snapshot_status: dict, ) -> tuple[list[str], list[str], list[str]]: """Get calculated metrics from use case. Args: metric_key: use case metric name. metric_value: use case metric value. view_dimensions: use case configured dimensions. cadence_snapshot_status: cadences to execute with the information if it has snapshot. """ dim_partition = ",".join([str(i) for i in view_dimensions.keys()][2:]) dim_partition = "cadence," + dim_partition calculated_metrics = metric_value["calculated_metric"] derived_metrics = metric_value["derived_metric"] calculated_metrics_script: list[str] = [] calculated_metrics_script_snapshot: list[str] = [] derived_metrics_script: list[str] = [] if calculated_metrics: ( calculated_metrics_script, calculated_metrics_script_snapshot, ) = cls._get_calculated_metric( metric_key, calculated_metrics, dim_partition, cadence_snapshot_status ) if derived_metrics: derived_metrics_script = cls._get_derived_metrics(derived_metrics) return ( calculated_metrics_script, calculated_metrics_script_snapshot, derived_metrics_script, ) @classmethod def _get_derived_metrics(cls, derived_metric: dict) -> list[str]: """Get derived metrics from use case. Args: derived_metric: use case derived metrics. 
""" derived_metric_script = [] for i in range(0, len(derived_metric)): derived_formula = str(derived_metric[i]["formula"]) derived_label = derived_metric[i]["label"] derived_metric_script.append(derived_formula + " AS " + derived_label) return derived_metric_script @classmethod def _get_calculated_metric( cls, metric_key: str, calculated_metric: dict, dimension_partition: str, cadence_snapshot_status: dict, ) -> tuple[list[str], list[str]]: """Get calculated metrics from use case. Args: metric_key: use case metric name. calculated_metric: use case calculated metrics. dimension_partition: dimension partition. cadence_snapshot_status: cadences to execute with the information if it has snapshot. """ last_cadence_script: list[str] = [] last_year_cadence_script: list[str] = [] window_script: list[str] = [] last_cadence_script_snapshot: list[str] = [] last_year_cadence_script_snapshot: list[str] = [] window_script_snapshot: list[str] = [] if "last_cadence" in calculated_metric: ( last_cadence_script, last_cadence_script_snapshot, ) = cls._get_cadence_calculated_metric( metric_key, dimension_partition, calculated_metric, cadence_snapshot_status, "last_cadence", ) if "last_year_cadence" in calculated_metric: ( last_year_cadence_script, last_year_cadence_script_snapshot, ) = cls._get_cadence_calculated_metric( metric_key, dimension_partition, calculated_metric, cadence_snapshot_status, "last_year_cadence", ) if "window_function" in calculated_metric: window_script, window_script_snapshot = cls._get_window_calculated_metric( metric_key, dimension_partition, calculated_metric, cadence_snapshot_status, ) calculated_script = [ *last_cadence_script, *last_year_cadence_script, *window_script, ] calculated_script_snapshot = [ *last_cadence_script_snapshot, *last_year_cadence_script_snapshot, *window_script_snapshot, ] return calculated_script, calculated_script_snapshot @classmethod def _get_window_calculated_metric( cls, metric_key: str, dimension_partition: str, 
calculated_metric: dict, cadence_snapshot_status: dict, ) -> tuple[list, list]: """Get window calculated metrics from use case. Args: metric_key: use case metric name. dimension_partition: dimension partition. calculated_metric: use case calculated metrics. cadence_snapshot_status: cadences to execute with the information if it has snapshot. """ calculated_script = [] calculated_script_snapshot = [] for i in range(0, len(calculated_metric["window_function"])): window_function = calculated_metric["window_function"][i]["agg_func"] window_function_start = calculated_metric["window_function"][i]["window"][0] window_function_end = calculated_metric["window_function"][i]["window"][1] window_label = calculated_metric["window_function"][i]["label"] calculated_script.append( f""" NVL( {window_function}({metric_key}) OVER ( PARTITION BY {dimension_partition} order by from_date ROWS BETWEEN {str(window_function_start)} PRECEDING AND {str(window_function_end)} PRECEDING ), 0 ) AS {window_label} """ ) if "Y" in cadence_snapshot_status.values(): calculated_script_snapshot.append( f""" NVL( {window_function}({metric_key}) OVER ( PARTITION BY {dimension_partition} ,rn order by from_date ROWS BETWEEN {str(window_function_start)} PRECEDING AND {str(window_function_end)} PRECEDING ), 0 ) AS {window_label} """ ) return calculated_script, calculated_script_snapshot @classmethod def _get_cadence_calculated_metric( cls, metric_key: str, dimension_partition: str, calculated_metric: dict, cadence_snapshot_status: dict, cadence: str, ) -> tuple[list, list]: """Get cadence calculated metrics from use case. Args: metric_key: use case metric name. calculated_metric: use case calculated metrics. dimension_partition: dimension partition. cadence_snapshot_status: cadences to execute with the information if it has snapshot. cadence: cadence to process. 
""" calculated_script = [] calculated_script_snapshot = [] for i in range(0, len(calculated_metric[cadence])): cadence_lag = cls._get_cadence_item_lag(calculated_metric, cadence, i) cadence_label = calculated_metric[cadence][i]["label"] calculated_script.append( cls._get_cadence_lag_statement( metric_key, cadence_lag, dimension_partition, cadence_label, snapshot=False, cadence=cadence, ) ) if "Y" in cadence_snapshot_status.values(): calculated_script_snapshot.append( cls._get_cadence_lag_statement( metric_key, cadence_lag, dimension_partition, cadence_label, snapshot=True, cadence=cadence, ) ) return calculated_script, calculated_script_snapshot @classmethod def _get_cadence_item_lag( cls, calculated_metric: dict, cadence: str, item: int ) -> str: """Get calculated metric item lag. Args: calculated_metric: use case calculated metrics. cadence: cadence to process. item: metric item. """ return str(calculated_metric[cadence][item]["window"]) @classmethod def _get_cadence_lag_statement( cls, metric_key: str, cadence_lag: str, dimension_partition: str, cadence_label: str, snapshot: bool, cadence: str, ) -> str: """Get cadence lag statement. Args: metric_key: use case metric name. cadence_lag: cadence window lag. dimension_partition: dimension partition. cadence_label: cadence name. snapshot: indicate if the snapshot is enabled. cadence: cadence to process. 
""" cadence_lag_statement = "" if cadence == "last_cadence": cadence_lag_statement = ( "NVL(LAG(" + metric_key + "," + cadence_lag + ") OVER(PARTITION BY " + dimension_partition + (",rn" if snapshot else "") + " order by from_date),0) AS " + cadence_label ) elif cadence == "last_year_cadence": cadence_lag_statement = ( "NVL(LAG(" + metric_key + "," + cadence_lag + ") OVER(PARTITION BY " + dimension_partition + (",rn" if snapshot else "") + """, case when cadence in ('DAY','MONTH','QUARTER') then struct(month(from_date), day(from_date)) when cadence in('WEEK') then struct(weekofyear(from_date+1),1) end order by from_date),0) AS """ + cadence_label ) else: cls._LOGGER.error(f"Cadence {cadence} not implemented yet") return cadence_lag_statement ================================================ FILE: lakehouse_engine/core/gab_sql_generator.py ================================================ """Module to define GAB SQL classes.""" import ast import json from abc import ABC, abstractmethod from typing import Any, Callable, Optional from pyspark.sql import DataFrame from pyspark.sql.functions import col, lit, struct, to_json from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.gab_utils import GABUtils from lakehouse_engine.utils.logging_handler import LoggingHandler def _execute_sql(func) -> Callable: # type: ignore """Execute the SQL resulting from the function. This function is protected to be used just in this module. It's used to decorate functions that returns a SQL statement. Args: func: function that will return the sql to execute """ def inner(*args: Any) -> None: generated_sql = func(*args) if generated_sql: ExecEnv.SESSION.sql(generated_sql) return inner class GABSQLGenerator(ABC): """Abstract class defining the behaviour of a GAB SQL Generator.""" @abstractmethod def generate_sql(self) -> Optional[str]: """Define the generate sql command. E.g., the behaviour of gab generate sql inheriting from this. 
""" pass class GABInsertGenerator(GABSQLGenerator): """GAB insert generator. Creates the insert statement based on the dimensions and metrics provided in the configuration table. """ _LOGGER = LoggingHandler(__name__).get_logger() def __init__( self, query_id: str, cadence: str, final_stage_table: str, lookup_query_builder: DataFrame, target_database: str, target_table: str, ): """Construct GABInsertGenerator instances. Args: query_id: gab configuration table use case identifier. cadence: inputted cadence to process. final_stage_table: stage view name. lookup_query_builder: gab configuration data. target_database: target database to write. target_table: target table to write. """ self.query_id = query_id self.cadence = cadence self.final_stage_table = final_stage_table self.lookup_query_builder = lookup_query_builder self.target_database = target_database self.target_table = target_table def generate_sql(self) -> Optional[str]: """Generate insert sql statement to the insights table.""" insert_sql_statement = self._insert_statement_generator() return insert_sql_statement def _insert_statement_generator(self) -> str: """Generate GAB insert statement. Creates the insert statement based on the dimensions and metrics provided in the configuration table. """ result = GABUtils.get_json_column_as_dict( self.lookup_query_builder, self.query_id, "mappings" ) for result_key in result.keys(): joined_dimensions, joined_metrics = self._get_mapping_columns( mapping=result[result_key] ) gen_ins = f""" INSERT INTO {self.target_database}.{self.target_table} SELECT {self.query_id} as query_id, '{self.cadence}' as cadence, {joined_dimensions}, {joined_metrics}, current_timestamp() as lh_created_on FROM {self.final_stage_table} """ # nosec: B608 return gen_ins @classmethod def _get_mapping_columns(cls, mapping: dict) -> tuple[str, str]: """Get mapping columns(dimensions and metrics) as joined string. Args: mapping: use case mappings configuration. 
""" dimensions_mapping = mapping["dimensions"] metrics_mapping = mapping["metric"] joined_dimensions = cls._join_extracted_column_with_filled_columns( columns=dimensions_mapping, is_dimension=True ) joined_metrics = cls._join_extracted_column_with_filled_columns( columns=metrics_mapping, is_dimension=False ) return joined_dimensions, joined_metrics @classmethod def _join_extracted_column_with_filled_columns( cls, columns: dict, is_dimension: bool ) -> str: """Join extracted columns with empty filled columns. Args: columns: use case columns and values. is_dimension: flag identifying if is a dimension or a metric. """ extracted_columns_with_alias = ( GABUtils.extract_columns_from_mapping( # type: ignore columns=columns, is_dimension=is_dimension ) ) filled_columns = cls._fill_empty_columns( extracted_columns=extracted_columns_with_alias, # type: ignore is_dimension=is_dimension, ) joined_columns = [*extracted_columns_with_alias, *filled_columns] return ",".join(joined_columns) @classmethod def _fill_empty_columns( cls, extracted_columns: list[str], is_dimension: bool ) -> list[str]: """Fill empty columns as null. As the data is expected to have 40 columns we have to fill the unused columns. Args: extracted_columns: use case extracted columns. is_dimension: flag identifying if is a dimension or a metric. """ filled_columns = [] for ins in range( ( len(extracted_columns) - 1 if is_dimension else len(extracted_columns) + 1 ), 41, ): filled_columns.append( " null as {}{}".format("d" if is_dimension else "m", ins) ) return filled_columns class GABViewGenerator(GABSQLGenerator): """GAB view generator. Creates the use case view statement to be consumed. 
""" _LOGGER = LoggingHandler(__name__).get_logger() def __init__( self, cadence_snapshot_status: dict, target_database: str, view_name: str, final_cols: str, target_table: str, dimensions_and_metrics_with_alias: str, dimensions: str, dimensions_and_metrics: str, final_calculated_script: str, query_id: str, view_filter: str, final_calculated_script_snapshot: str, without_snapshot_cadences: list[str], with_snapshot_cadences: list[str], ): """Construct GABViewGenerator instances. Args: cadence_snapshot_status: each cadence with the corresponding snapshot status. target_database: target database to write. view_name: name of the view to be generated. final_cols: columns to return in the view. target_table: target table to write. dimensions_and_metrics_with_alias: configured dimensions and metrics with alias to compute in the view. dimensions: use case configured dimensions. dimensions_and_metrics: use case configured dimensions and metrics. final_calculated_script: use case calculated metrics. query_id: gab configuration table use case identifier. view_filter: filter to add in the view. final_calculated_script_snapshot: use case calculated metrics with snapshot. without_snapshot_cadences: cadences without snapshot. with_snapshot_cadences: cadences with snapshot. 
""" self.cadence_snapshot_status = cadence_snapshot_status self.target_database = target_database self.result_key = view_name self.final_cols = final_cols self.target_table = target_table self.dimensions_and_metrics_with_alias = dimensions_and_metrics_with_alias self.dimensions = dimensions self.dimensions_and_metrics = dimensions_and_metrics self.final_calculated_script = final_calculated_script self.query_id = query_id self.view_filter = view_filter self.final_calculated_script_snapshot = final_calculated_script_snapshot self.without_snapshot_cadences = without_snapshot_cadences self.with_snapshot_cadences = with_snapshot_cadences @_execute_sql def generate_sql(self) -> Optional[str]: """Generate use case view sql statement.""" consumption_view_sql = self._create_consumption_view() return consumption_view_sql def _create_consumption_view(self) -> str: """Create consumption view.""" final_view_query = self._generate_consumption_view_statement( self.cadence_snapshot_status, self.target_database, self.final_cols, self.target_table, self.dimensions_and_metrics_with_alias, self.dimensions, self.dimensions_and_metrics, self.final_calculated_script, self.query_id, self.view_filter, self.final_calculated_script_snapshot, without_snapshot_cadences=",".join( f'"{w}"' for w in self.without_snapshot_cadences ), with_snapshot_cadences=",".join( f'"{w}"' for w in self.with_snapshot_cadences ), ) rendered_query = """ CREATE OR REPLACE VIEW {database}.{view_name} AS {final_view_query} """.format( database=self.target_database, view_name=self.result_key, final_view_query=final_view_query, ) self._LOGGER.info(f"Consumption view statement: {rendered_query}") return rendered_query @classmethod def _generate_consumption_view_statement( cls, cadence_snapshot_status: dict, target_database: str, final_cols: str, target_table: str, dimensions_and_metrics_with_alias: str, dimensions: str, dimensions_and_metrics: str, final_calculated_script: str, query_id: str, view_filter: str, 
final_calculated_script_snapshot: str, without_snapshot_cadences: str, with_snapshot_cadences: str, ) -> str: """Generate consumption view. Args: cadence_snapshot_status: cadences to execute with the information if it has snapshot. target_database: target database to write. final_cols: use case columns exposed in the consumption view. target_table: target table to write. dimensions_and_metrics_with_alias: dimensions and metrics as string columns with alias. dimensions: dimensions as string columns. dimensions_and_metrics: dimensions and metrics as string columns without alias. final_calculated_script: final calculated metrics script. query_id: gab configuration table use case identifier. view_filter: filter to execute on the view. final_calculated_script_snapshot: final calculated metrics with snapshot script. without_snapshot_cadences: cadences without snapshot. with_snapshot_cadences: cadences with snapshot. """ cls._LOGGER.info("Generating consumption view statement...") cls._LOGGER.info( f""" {{ target_database: {target_database}, target_table: {target_table}, query_id: {query_id}, cadence_and_snapshot_status: {cadence_snapshot_status}, cadences_without_snapshot: [{without_snapshot_cadences}], cadences_with_snapshot: [{with_snapshot_cadences}], final_cols: {final_cols}, dimensions_and_metrics_with_alias: {dimensions_and_metrics_with_alias}, dimensions: {dimensions}, dimensions_with_metrics: {dimensions_and_metrics}, final_calculated_script: {final_calculated_script}, final_calculated_script_snapshot: {final_calculated_script_snapshot}, view_filter: {view_filter} }}""" ) if ( "Y" in cadence_snapshot_status.values() and "N" in cadence_snapshot_status.values() ): consumption_view_query = f""" WITH TEMP1 AS ( SELECT a.cadence, {dimensions_and_metrics_with_alias}{final_calculated_script} FROM {target_database}.{target_table} a WHERE a.query_id = {query_id} AND cadence IN ({without_snapshot_cadences}) {view_filter} ), TEMP_RN AS ( SELECT a.cadence, a.from_date, 
class GABDeleteGenerator(GABSQLGenerator):
    """GAB delete generator.

    Creates the delete statement to clean the use case base data on the insights
    table.
    """

    _LOGGER = LoggingHandler(__name__).get_logger()

    def __init__(
        self,
        query_id: str,
        cadence: str,
        temp_stage_view_name: str,
        lookup_query_builder: DataFrame,
        target_database: str,
        target_table: str,
    ):
        """Construct GABDeleteGenerator instances.

        Args:
            query_id: gab configuration table use case identifier.
            cadence: inputted cadence to process.
            temp_stage_view_name: stage view name.
            lookup_query_builder: gab configuration data.
            target_database: target database to write.
            target_table: target table to write.
        """
        self.query_id = query_id
        self.cadence = cadence
        self.temp_stage_view_name = temp_stage_view_name
        self.lookup_query_builder = lookup_query_builder
        self.target_database = target_database
        self.target_table = target_table

    @_execute_sql
    def generate_sql(self) -> Optional[str]:
        """Generate delete sql statement.

        This statement is to clean the insights table for the corresponding use
        case.
        """
        return self._delete_statement_generator()

    def _delete_statement_generator(self) -> str:
        """Generate the delete statement for the use case and cadence.

        The from/to date columns are read from the use case `mappings`
        configuration and their minimum and maximum values in the stage view
        delimit the rows to delete from the target table.

        Returns:
            The delete statement for the target table.

        Raises:
            ValueError: if the use case has no mapping configured.
        """
        df_filtered = self.lookup_query_builder.filter(
            col("query_id") == lit(self.query_id)
        )
        df_map = df_filtered.select(col("mappings"))
        view_df = df_map.select(
            to_json(struct([df_map[x] for x in df_map.columns]))
        ).collect()[0][0]
        line = json.loads(view_df)

        # The last configured mapping defines the date columns, as every
        # iteration overrides the previous values.
        result: dict = {}
        for line_v in line.values():
            result = ast.literal_eval(line_v)

        dim_from_date = None
        dim_to_date = None
        for result_new in result.values():
            dim_from_date = result_new["dimensions"]["from_date"]
            dim_to_date = result_new["dimensions"]["to_date"]

        if dim_from_date is None or dim_to_date is None:
            # Fail with an explicit message instead of an unbound local
            # variable error when there is nothing to read the columns from.
            raise ValueError(
                f"No mappings found for query_id {self.query_id}: "
                "unable to generate the delete statement."
            )

        self._LOGGER.info(f"temp stage view name: {self.temp_stage_view_name}")

        # One aggregation over the stage view returns the four boundaries, so
        # the view is evaluated once instead of once per boundary.
        date_boundaries = ExecEnv.SESSION.sql(
            """
            SELECT
                MIN({from_date}) as min_from_date,
                MAX({from_date}) as max_from_date,
                MIN({to_date}) as min_to_date,
                MAX({to_date}) as max_to_date
            FROM {iter_stages}""".format(  # nosec: B608
                iter_stages=self.temp_stage_view_name,
                from_date=dim_from_date,
                to_date=dim_to_date,
            )
        ).collect()[0]
        min_from_date, max_from_date, min_to_date, max_to_date = date_boundaries

        gen_del = """
        DELETE FROM {target_database}.{target_table} a
            WHERE query_id = {query_id}
            AND cadence = '{cadence}'
            AND from_date BETWEEN '{min_from_date}' AND '{max_from_date}'
            AND to_date BETWEEN '{min_to_date}' AND '{max_to_date}'
        """.format(  # nosec: B608
            target_database=self.target_database,
            target_table=self.target_table,
            query_id=self.query_id,
            cadence=self.cadence,
            min_from_date=min_from_date,
            max_from_date=max_from_date,
            min_to_date=min_to_date,
            max_to_date=max_to_date,
        )

        return gen_del
""" object_list = [] if not paginator: list_response = s3_client.list_objects_v2(Bucket=bucket, Prefix=path) else: list_response = s3_client.list_objects_v2( Bucket=bucket, Prefix=path, ContinuationToken=paginator, ) if FileManagerAPIKeys.CONTENTS.value in list_response: for obj in list_response[FileManagerAPIKeys.CONTENTS.value]: object_list.append(obj[FileManagerAPIKeys.KEY.value]) if FileManagerAPIKeys.CONTINUATION.value in list_response: pagination = list_response[FileManagerAPIKeys.CONTINUATION.value] else: pagination = "" return object_list, pagination def _list_objects_recursively(bucket: str, path: str) -> list: """Recursively list all objects given a prefix in s3. Args: bucket: name of bucket to perform the list. path: path to be used as a prefix. Returns: A list of object names fetched recursively. """ object_list = [] more_objects = True paginator = "" s3 = boto3.client("s3") while more_objects: temp_list, paginator = _list_objects(s3, bucket, path, paginator) object_list.extend(temp_list) if not paginator: more_objects = False return object_list def _check_directory(bucket: str, path: str) -> bool: """Checks if the object is a 'directory' in s3. Args: bucket: name of bucket to perform the check. path: path to be used as a prefix. Returns: If path represents a 'directory'. 
""" s3 = boto3.client("s3") objects, _ = _list_objects(s3, bucket, path) return len(objects) > 1 class S3FileManager(FileManager): """Set of actions to manipulate s3 files in several ways.""" _logger = LoggingHandler(__name__).get_logger() def get_function(self) -> None: """Get a specific function to execute.""" available_functions = { "delete_objects": self.delete_objects, "copy_objects": self.copy_objects, "request_restore": self.request_restore, "check_restore_status": self.check_restore_status, "request_restore_to_destination_and_wait": ( self.request_restore_to_destination_and_wait ), } self._logger.info("Function being executed: {}".format(self.function)) if self.function in available_functions.keys(): func = available_functions[self.function] func() else: raise NotImplementedError( f"The requested function {self.function} is not implemented." ) def _delete_objects(self, bucket: str, objects_paths: list) -> None: """Delete objects recursively in s3. Params: bucket: name of bucket to perform the delete operation. objects_paths: objects to be deleted. """ s3 = boto3.client("s3") for path in objects_paths: if _check_directory(bucket, path): path = get_directory_path(path) else: path = path.strip() more_objects = True paginator = "" objects_to_delete = [] while more_objects: objects_found, paginator = _list_objects( s3_client=s3, bucket=bucket, path=path, paginator=paginator ) for obj in objects_found: objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj}) if not paginator: more_objects = False response = s3.delete_objects( Bucket=bucket, Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete}, ) self._logger.info(response) objects_to_delete = [] def delete_objects(self) -> None: """Delete objects and 'directories'. If dry_run is set to True the function will print a dict with all the paths that would be deleted based on the given keys. 
""" bucket = self.configs["bucket"] objects_paths = self.configs["object_paths"] dry_run = self.configs["dry_run"] if dry_run: response = _dry_run(bucket=bucket, object_paths=objects_paths) self._logger.info("Paths that would be deleted:") self._logger.info(response) else: self._delete_objects(bucket, objects_paths) def copy_objects(self) -> None: """Copies objects and 'directories'. If dry_run is set to True the function will print a dict with all the paths that would be copied based on the given keys. """ source_bucket = self.configs["bucket"] source_object = self.configs["source_object"] destination_bucket = self.configs["destination_bucket"] destination_object = self.configs["destination_object"] dry_run = self.configs["dry_run"] S3FileManager._copy_objects( source_bucket=source_bucket, source_object=source_object, destination_bucket=destination_bucket, destination_object=destination_object, dry_run=dry_run, ) def move_objects(self) -> None: """Moves objects and 'directories'. If dry_run is set to True the function will print a dict with all the paths that would be moved based on the given keys. 
""" pass def request_restore(self) -> None: """Request the restore of archived data.""" source_bucket = self.configs["bucket"] source_object = self.configs["source_object"] restore_expiration = self.configs["restore_expiration"] retrieval_tier = self.configs["retrieval_tier"] dry_run = self.configs["dry_run"] ArchiveFileManager.request_restore( source_bucket, source_object, restore_expiration, retrieval_tier, dry_run, ) def check_restore_status(self) -> None: """Check the restore status of archived data.""" source_bucket = self.configs["bucket"] source_object = self.configs["source_object"] restore_status = ArchiveFileManager.check_restore_status( source_bucket, source_object ) self._logger.info( f""" Restore status: - Not Started: {restore_status.get('not_started_objects')} - Ongoing: {restore_status.get('ongoing_objects')} - Restored: {restore_status.get('restored_objects')} Total objects in this restore process: {restore_status.get('total_objects')} """ ) def request_restore_to_destination_and_wait(self) -> None: """Request and wait for the restore to complete, polling the restore status. 
After the restore is done, copy the restored files to destination """ source_bucket = self.configs["bucket"] source_object = self.configs["source_object"] destination_bucket = self.configs["destination_bucket"] destination_object = self.configs["destination_object"] restore_expiration = self.configs["restore_expiration"] retrieval_tier = self.configs["retrieval_tier"] dry_run = self.configs["dry_run"] ArchiveFileManager.request_restore_and_wait( source_bucket=source_bucket, source_object=source_object, restore_expiration=restore_expiration, retrieval_tier=retrieval_tier, dry_run=dry_run, ) S3FileManager._logger.info( f"Restoration complete for {source_bucket} and {source_object}" ) S3FileManager._logger.info( f"Starting to copy data from {source_bucket}/{source_object} to " f"{destination_bucket}/{destination_object}" ) S3FileManager._copy_objects( source_bucket=source_bucket, source_object=source_object, destination_bucket=destination_bucket, destination_object=destination_object, dry_run=dry_run, ) S3FileManager._logger.info( f"Finished copying data, data should be available on {destination_bucket}/" f"{destination_object}" ) @staticmethod def _copy_objects( source_bucket: str, source_object: str, destination_bucket: str, destination_object: str, dry_run: bool, ) -> None: """Copies objects and 'directories' in s3. Args: source_bucket: name of bucket to perform the copy. source_object: object/folder to be copied. destination_bucket: name of the target bucket to copy. destination_object: target object/folder to copy. dry_run: if dry_run is set to True the function will print a dict with all the paths that would be deleted based on the given keys. 
""" s3 = boto3.client("s3") if dry_run: response = _dry_run(bucket=source_bucket, object_paths=[source_object]) S3FileManager._logger.info("Paths that would be copied:") S3FileManager._logger.info(response) else: original_object_name = source_object.split("/")[-1] if _check_directory(source_bucket, source_object): source_object = get_directory_path(source_object) copy_object = _list_objects_recursively( bucket=source_bucket, path=source_object ) for obj in copy_object: S3FileManager._logger.info(f"Copying obj: {obj}") final_path = obj.replace(source_object, "") response = s3.copy_object( Bucket=destination_bucket, CopySource={ FileManagerAPIKeys.BUCKET.value: source_bucket, FileManagerAPIKeys.KEY.value: obj, }, Key=f"{destination_object}/{original_object_name}/{final_path}", ) S3FileManager._logger.info(response) else: S3FileManager._logger.info(f"Copying obj: {source_object}") response = s3.copy_object( Bucket=destination_bucket, CopySource={ FileManagerAPIKeys.BUCKET.value: source_bucket, FileManagerAPIKeys.KEY.value: source_object, }, Key=f"""{destination_object}/{original_object_name}""", ) S3FileManager._logger.info(response) class ArchiveFileManager(object): """Set of actions to restore archives.""" _logger = LoggingHandler(__name__).get_logger() @staticmethod def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]: """Get the archived object if it's an object. Args: bucket: name of bucket to check get the object. object_key: object to get. Returns: S3 Object if it's an archived object, otherwise None. """ s3 = boto3.resource("s3") object_to_restore = s3.Object(bucket, object_key) if ( object_to_restore.storage_class is not None and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS ): return object_to_restore else: return None @staticmethod def _check_object_restore_status( bucket: str, object_key: str ) -> Optional[RestoreStatus]: """Check the restore status of the archive. Args: bucket: name of bucket to check the restore status. 
@staticmethod
def check_restore_status(source_bucket: str, source_object: str) -> dict:
    """Count the archived objects under a path by restore status.

    Objects for which no restore status is found are logged with a warning and
    left out of every count.

    Args:
        source_bucket: name of bucket to check the restore status.
        source_object: object to check the restore status.

    Returns:
        A dict containing the amount of objects in each status.
    """
    counters = {
        "total_objects": 0,
        "not_started_objects": 0,
        "ongoing_objects": 0,
        "restored_objects": 0,
    }

    if _check_directory(source_bucket, source_object):
        source_object = get_directory_path(source_object)

    for obj in _list_objects_recursively(bucket=source_bucket, path=source_object):
        ArchiveFileManager._logger.info(f"Checking restore status for: {obj}")

        restore_status = ArchiveFileManager._check_object_restore_status(
            source_bucket, obj
        )
        if not restore_status:
            ArchiveFileManager._logger.warning(
                f"Restore status not found for {source_bucket}/{obj}"
            )
            continue

        if restore_status == RestoreStatus.NOT_STARTED:
            counter_name = "not_started_objects"
        elif restore_status == RestoreStatus.ONGOING:
            counter_name = "ongoing_objects"
        else:
            counter_name = "restored_objects"

        counters["total_objects"] += 1
        counters[counter_name] += 1

        ArchiveFileManager._logger.info(
            f"{obj} restore status is {restore_status.value}"
        )

    return counters
Args: bucket: name of bucket to perform the restore. object_key: object to be restored. expiration: restore expiration. retrieval_tier: type of restore, possible values are: Bulk, Standard or Expedited. """ if not RestoreType.exists(retrieval_tier): raise RestoreTypeNotFoundException( f"Restore type {retrieval_tier} not supported." ) if _check_directory(bucket, object_key): object_key = get_directory_path(object_key) archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) if archived_object and archived_object.restore is None: ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.") archived_object.restore_object( RestoreRequest={ "Days": expiration, "GlacierJobParameters": {"Tier": retrieval_tier}, } ) else: ArchiveFileManager._logger.info( f"Restore request for {bucket}/{object_key} not performed." ) @staticmethod def request_restore( source_bucket: str, source_object: str, restore_expiration: int, retrieval_tier: str, dry_run: bool, ) -> None: """Request the restore of archived data. Args: source_bucket: name of bucket to perform the restore. source_object: object to be restored. restore_expiration: restore expiration in days. retrieval_tier: type of restore, possible values are: Bulk, Standard or Expedited. dry_run: if dry_run is set to True the function will print a dict with all the paths that would be deleted based on the given keys. 
""" if _check_directory(source_bucket, source_object): source_object = get_directory_path(source_object) if dry_run: response = _dry_run(bucket=source_bucket, object_paths=[source_object]) ArchiveFileManager._logger.info("Paths that would be restored:") ArchiveFileManager._logger.info(response) else: objects_to_restore = _list_objects_recursively( bucket=source_bucket, path=source_object ) for obj in objects_to_restore: ArchiveFileManager._request_restore_object( source_bucket, obj, restore_expiration, retrieval_tier, ) @staticmethod def request_restore_and_wait( source_bucket: str, source_object: str, restore_expiration: int, retrieval_tier: str, dry_run: bool, ) -> None: """Request and wait for the restore to complete, polling the restore status. Args: source_bucket: name of bucket to perform the restore. source_object: object to be restored. restore_expiration: restore expiration in days. retrieval_tier: type of restore, possible values are: Bulk, Standard or Expedited. dry_run: if dry_run is set to True the function will print a dict with all the paths that would be deleted based on the given keys. """ if retrieval_tier != RestoreType.EXPEDITED.value: ArchiveFileManager._logger.error( f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " "kind of restore should be used just with `Expedited` retrieval tier " "to save cluster costs." ) raise ValueError( f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " "kind of restore should be used just with `Expedited` retrieval tier " "to save cluster costs." 
class SensorControlTableManager(object):
    """Read and update the sensor state kept in the sensor control delta table."""

    _LOGGER = LoggingHandler(__name__).get_logger()

    @classmethod
    def check_if_sensor_has_acquired_data(
        cls,
        sensor_id: str,
        control_db_table_name: str,
    ) -> bool:
        """Tell whether the sensor is currently flagged as having acquired new data.

        Args:
            sensor_id: sensor id.
            control_db_table_name: `db.table` to control sensor runs.

        Returns:
            True if the control table has a row for the sensor whose status is
            ACQUIRED_NEW_DATA, otherwise False.
        """
        control_row = cls.read_sensor_table_data(
            sensor_id=sensor_id, control_db_table_name=control_db_table_name
        )
        cls._LOGGER.info(f"sensor_table_data = {control_row}")

        if control_row is None:
            return False
        return control_row.status == SensorStatus.ACQUIRED_NEW_DATA.value

    @classmethod
    def update_sensor_status(
        cls,
        sensor_spec: SensorSpec,
        status: str,
        upstream_key: str = None,
        upstream_value: str = None,
    ) -> None:
        """Store the sensor execution data in the control delta table.

        Args:
            sensor_spec: sensor spec containing all sensor information
                we need to update the control status.
            status: status of the sensor.
            upstream_key: upstream key (e.g., used to store an attribute
                name from the upstream so that new data can be detected
                automatically).
            upstream_value: upstream value (e.g., used to store the max
                attribute value from the upstream so that new data can be
                detected automatically).
        """
        cls._LOGGER.info(
            f"Updating sensor status for sensor {sensor_spec.sensor_id}..."
        )

        # Keyword arguments are evaluated left to right: the row to merge is
        # built first, then the set of columns to update.
        cls._update_sensor_control(
            data=cls._convert_sensor_to_data(
                spec=sensor_spec,
                status=status,
                upstream_key=upstream_key,
                upstream_value=upstream_value,
            ),
            sensor_update_set=cls._get_sensor_update_set(
                assets=sensor_spec.assets,
                checkpoint_location=sensor_spec.checkpoint_location,
                upstream_key=upstream_key,
                upstream_value=upstream_value,
            ),
            sensor_control_table=sensor_spec.control_db_table_name,
            sensor_id=sensor_spec.sensor_id,
        )

    @classmethod
    def _update_sensor_control(
        cls,
        data: List[dict],
        sensor_update_set: dict,
        sensor_control_table: str,
        sensor_id: str,
    ) -> None:
        """Merge the sensor data into the sensor control delta table.

        Matching rows are updated with `sensor_update_set`; when no row
        matches, the whole record is inserted.

        Args:
            data: to be updated.
            sensor_update_set: columns to update on matching rows.
            sensor_control_table: control table name.
            sensor_id: sensor_id to be updated.
        """
        control_table = DeltaTable.forName(
            ExecEnv.SESSION,
            sensor_control_table,
        )
        updates_df = ExecEnv.SESSION.createDataFrame(data, SENSOR_SCHEMA)

        merge_condition = (
            f"sensors.sensor_id = '{sensor_id}' AND "
            "sensors.sensor_id = updates.sensor_id"
        )
        merge_builder = control_table.alias("sensors").merge(
            updates_df.alias("updates"), merge_condition
        )
        merge_builder = merge_builder.whenMatchedUpdate(set=sensor_update_set)
        merge_builder.whenNotMatchedInsertAll().execute()

    @classmethod
    def _convert_sensor_to_data(
        cls,
        spec: SensorSpec,
        status: str,
        upstream_key: str,
        upstream_value: str,
        status_change_timestamp: Optional[datetime] = None,
    ) -> List[dict]:
        """Build the single-record input used to create the updates dataframe.

        Args:
            spec: sensor spec containing sensor identifier data.
            status: new sensor data status.
            upstream_key: key used to acquired data from the upstream.
            upstream_value: max value from the upstream_key
                acquired from the upstream.
            status_change_timestamp: timestamp we commit
                this change in the sensor control table. Defaults to the
                current time when not provided.

        Returns:
            Sensor data as list[dict], used to create a
                dataframe to store the data into the sensor_control_table.
        """
        if status_change_timestamp is None:
            status_change_timestamp = datetime.now()

        record = {
            "sensor_id": spec.sensor_id,
            "assets": spec.assets,
            "status": status,
            "status_change_timestamp": status_change_timestamp,
            "checkpoint_location": spec.checkpoint_location,
            # Both upstream fields are stringified, including None.
            "upstream_key": str(upstream_key),
            "upstream_value": str(upstream_value),
        }
        return [record]

    @classmethod
    def _get_sensor_update_set(cls, **kwargs: Optional[str] | List[str]) -> dict:
        """Get the sensor update set.

        Starts from a copy of SENSOR_UPDATE_SET and adds one
        `sensors.<key> -> updates.<key>` entry per truthy keyword argument.

        Args:
            kwargs: Containing the following keys:
                - assets
                - checkpoint_location
                - upstream_key
                - upstream_value

        Returns:
            A set containing the fields to update in the control_table.
        """
        update_set = dict(SENSOR_UPDATE_SET)
        update_set.update(
            {
                f"sensors.{name}": f"updates.{name}"
                for name, provided in kwargs.items()
                if provided
            }
        )
        return update_set

    @classmethod
    def read_sensor_table_data(
        cls,
        control_db_table_name: str,
        sensor_id: str = None,
        assets: list = None,
    ) -> Optional[Row]:
        """Read data from delta table containing sensor status info.

        Args:
            sensor_id: sensor id. If this parameter is defined the search
                only considers it. Otherwise, the sensor assets are used.
            control_db_table_name: db.table to control sensor runs.
            assets: list of assets that are fueled by the pipeline
                where this sensor is.

        Returns:
            First row matching the provided sensor_id or assets, None if no
            row matches.

        Raises:
            ValueError: if neither sensor_id nor assets is provided.
        """
        control_df = DeltaTable.forName(
            ExecEnv.SESSION,
            control_db_table_name,
        ).toDF()

        if sensor_id:
            condition = col("sensor_id") == sensor_id
        elif assets:
            condition = col("assets") == array(*[lit(asset) for asset in assets])
        else:
            raise ValueError(
                "Either sensor_id or assets need to be provided as arguments."
            )

        return control_df.where(condition).first()
class SensorUpstreamManager(object):
    """Class to deal with Sensor Upstream data."""

    _LOGGER = LoggingHandler(__name__).get_logger()

    @classmethod
    def generate_filter_exp_query(
        cls,
        sensor_id: str,
        filter_exp: str,
        control_db_table_name: str = None,
        upstream_key: str = None,
        upstream_value: str = None,
        upstream_table_name: str = None,
    ) -> str:
        """Generates a sensor preprocess query based on timestamp logic.

        Args:
            sensor_id: sensor id.
            filter_exp: expression to filter incoming new data.
                You can use the placeholders `?upstream_key` and
                `?upstream_value`, which are replaced by the upstream_key and
                by the upstream value to compare against (the one stored in
                the control_db_table_name for this specific sensor_id, when
                present). Placeholders are only replaced when
                control_db_table_name is defined.
            control_db_table_name: db.table to retrieve the last status change
                timestamp. This is only relevant for the jdbc sensor.
            upstream_key: the key of custom sensor information
                to control how to identify new data from the
                upstream (e.g., a time column in the upstream).
            upstream_value: value for custom sensor
                to identify new data from the upstream
                (e.g., the value of a time present in the upstream)
                If none we will set the default value.
                Note: This parameter is used just to override the
                default value `-2147483647`.
            upstream_table_name: value for custom sensor
                to query new data from the upstream.
                If none we will set the default value,
                our `sensor_new_data` view.

        Returns:
            The query string.

        Raises:
            ValueError: if control_db_table_name is defined without
                upstream_key.
        """
        source_table = upstream_table_name if upstream_table_name else "sensor_new_data"
        select_exp = "SELECT COUNT(1) as count"
        if control_db_table_name:
            if not upstream_key:
                raise ValueError(
                    "If control_db_table_name is defined, upstream_key should "
                    "also be defined!"
                )

            default_upstream_value: str = "-2147483647"
            trigger_name = upstream_key
            trigger_value = (
                default_upstream_value if upstream_value is None else upstream_value
            )
            sensor_table_data = SensorControlTableManager.read_sensor_table_data(
                sensor_id=sensor_id, control_db_table_name=control_db_table_name
            )

            # A value already stored for this sensor wins over the argument.
            if sensor_table_data and sensor_table_data.upstream_value:
                trigger_value = sensor_table_data.upstream_value

            # str.replace only accepts strings: coerce so that non-string
            # values (e.g. a number) do not fail with a TypeError.
            filter_exp = filter_exp.replace(
                "?upstream_key", str(trigger_name)
            ).replace("?upstream_value", str(trigger_value))
            select_exp = (
                f"SELECT COUNT(1) as count, '{trigger_name}' as UPSTREAM_KEY, "
                f"max({trigger_name}) as UPSTREAM_VALUE"
            )

        query = (
            f"{select_exp} "
            f"FROM {source_table} "
            f"WHERE {filter_exp} "
            f"HAVING COUNT(1) > 0"
        )

        return query

    @classmethod
    def generate_sensor_table_preprocess_query(
        cls,
        sensor_id: str,
    ) -> str:
        """Generates a query to be used for a sensor having other sensor as upstream.

        The query selects, from the `sensor_new_data` view, the inserted or
        updated rows of the given sensor having the PROCESSED_NEW_DATA status.

        Args:
            sensor_id: sensor id.

        Returns:
            The query string.
        """
        query = (
            f"SELECT * "  # nosec
            f"FROM sensor_new_data "
            f"WHERE"
            f" _change_type in ('insert', 'update_postimage')"
            f" and sensor_id = '{sensor_id}'"
            f" and status = '{SensorStatus.PROCESSED_NEW_DATA.value}'"
        )

        return query

    @classmethod
    def read_new_data(cls, sensor_spec: SensorSpec) -> DataFrame:
        """Read new data from the upstream into the sensor 'new_data_df'.

        When the spec defines a preprocess query, the data is exposed as the
        `sensor_new_data` temp view and the query result is returned instead.

        Args:
            sensor_spec: sensor spec containing all sensor information.

        Returns:
            An empty dataframe if it doesn't have new data otherwise the new data
        """
        new_data_df = ReaderFactory.get_data(sensor_spec.input_spec)

        if sensor_spec.preprocess_query:
            new_data_df.createOrReplaceTempView("sensor_new_data")
            new_data_df = ExecEnv.SESSION.sql(sensor_spec.preprocess_query)

        return new_data_df

    @classmethod
    def get_new_data(
        cls,
        new_data_df: DataFrame,
    ) -> Optional[Row]:
        """Get new data from upstream df if it's present.

        Args:
            new_data_df: DataFrame possibly containing new data.

        Returns:
            Optional row, present if there is new data in the upstream,
            absent otherwise.
        """
        return new_data_df.first()

    @classmethod
    def generate_sensor_sap_logchain_query(
        cls,
        chain_id: str,
        dbtable: str = SAPLogchain.DBTABLE.value,
        status: str = SAPLogchain.GREEN_STATUS.value,
        engine_table_name: str = SAPLogchain.ENGINE_TABLE.value,
    ) -> str:
        """Generates a sensor query based in the SAP Logchain table.

        Args:
            chain_id: chain id to query the status on SAP.
            dbtable: db.table to retrieve the data to
                check if the sap chain is already finished.
            status: analyzed status the chain must have to be selected
                (compared case-insensitively with ANALYZED_STATUS).
            engine_table_name: table name exposed with the SAP LOGCHAIN data.
                This table will be used in the jdbc query.

        Returns:
            The query string, a `WITH <engine_table_name> AS (...)` clause.

        Raises:
            ValueError: if chain_id is not defined.
        """
        if not chain_id:
            raise ValueError(
                "To query on log chain SAP table the chain id should be defined!"
            )

        select_exp = (
            "SELECT CHAIN_ID, CONCAT(DATUM, ZEIT) AS LOAD_DATE, ANALYZED_STATUS"
        )
        filter_exp = (
            f"UPPER(CHAIN_ID) = UPPER('{chain_id}') "
            f"AND UPPER(ANALYZED_STATUS) = UPPER('{status}')"
        )

        query = (
            f"WITH {engine_table_name} AS ("
            f"{select_exp} "
            f"FROM {dbtable} "
            f"WHERE {filter_exp}"
            ")"
        )

        return query
class SensorJobRunManager(object):
    """Class to manage triggering of Jobs via Job Run API."""

    _LOGGER = LoggingHandler(__name__).get_logger()

    @classmethod
    def run_job(
        cls, job_id: str, token: str, host: str
    ) -> Tuple[Optional[int], Optional[str]]:
        """Trigger the job based on its id.

        Args:
            job_id: the id of the job to trigger.
            token: token required to access Databricks API.
            host: host for workspace.

        Returns:
            A tuple `(run_id, error)`: on success (HTTP 200) `run_id` is the id
            of the triggered run and `error` is None; otherwise `run_id` is
            None and `error` describes what the API answered.
        """
        run_id = None
        ex = None

        headers = {"Authorization": f"Bearer {token}"}
        body = json.dumps(
            {
                "job_id": job_id,
                "notebook_params": {"msg": "triggered via heartbeat sensor"},
            }
        )

        res = requests.post(
            f"https://{host}/api/2.1/jobs/run-now",
            data=body,
            headers=headers,
            timeout=3600,
        )

        if res.status_code == 200:
            run_id = res.json()["run_id"]
            cls._LOGGER.info(
                f"Job : {str(job_id)} triggered successfully... "
                f"RUN ID : {str(run_id)}"
            )
        else:
            try:
                error_payload = res.json()
                ex = f"{error_payload['error_code']} {error_payload['message']}"
            except (ValueError, KeyError, TypeError):
                # The error body is not JSON or lacks the expected fields
                # (e.g. an HTML page from a proxy): report the raw answer
                # instead of crashing while building the error message.
                ex = f"{res.status_code} {res.text}"
            cls._LOGGER.error(f"An error has occurred: {ex}")

        return run_id, ex
class TableManager(object):
    """Set of actions to manipulate tables/views in several ways.

    {{ get_table_manager_operations() }}
    """

    def __init__(self, configs: dict):
        """Construct TableManager algorithm instances.

        Args:
            configs: configurations for the TableManager algorithm. The
                `function` key is mandatory and selects the operation to run.
        """
        self._logger = LoggingHandler(__name__).get_logger()
        self.configs = configs
        self.function = self.configs["function"]

    def get_function(self) -> None:
        """Look up the function named in the configs and execute it.

        Raises:
            NotImplementedError: if the requested function is not available.
        """
        available_functions = {
            "compute_table_statistics": self.compute_table_statistics,
            "create_table": self.create,
            "create_tables": self.create_many,
            "create_view": self.create,
            "drop_table": self.drop_table,
            "drop_view": self.drop_view,
            "execute_sql": self.execute_sql,
            "truncate": self.truncate,
            "vacuum": self.vacuum,
            "describe": self.describe,
            "optimize": self.optimize,
            "show_tbl_properties": self.show_tbl_properties,
            "get_tbl_pk": self.get_tbl_pk,
            "repair_table": self.repair_table,
            "delete_where": self.delete_where,
        }

        self._logger.info("Function being executed: {}".format(self.function))

        func = available_functions.get(self.function)
        if func is None:
            raise NotImplementedError(
                f"The requested function {self.function} is not implemented."
            )
        func()

    def _run_sql_commands(self, sql: str) -> None:
        """Split sql into commands and execute every non-blank one.

        The `delimiter` (default `;`) and `advanced_parser` (default False)
        configs control how the commands are split.

        Args:
            sql: text with one or more sql commands.
        """
        sql_commands = SQLParserUtils().split_sql_commands(
            sql_commands=sql,
            delimiter=self.configs.get("delimiter", ";"),
            advanced_parser=self.configs.get("advanced_parser", False),
        )
        for command in sql_commands:
            if command.strip():
                self._logger.info(f"sql command: {command}")
                ExecEnv.SESSION.sql(command)

    def create(self) -> None:
        """Create a new table or view on metastore.

        The sql is read from the file given in the `path` config.
        """
        disable_dbfs_retry = self.configs.get("disable_dbfs_retry", False)
        sql = ConfigUtils.read_sql(self.configs["path"], disable_dbfs_retry)
        try:
            self._run_sql_commands(sql)
            self._logger.info(f"{self.function} successfully executed!")
        except Exception as e:
            self._logger.error(e)
            raise

    def create_many(self) -> None:
        """Create multiple tables or views on metastore.

        In this function the path to the ddl files can be separated by comma.
        """
        self.execute_multiple_sql_files()

    def compute_table_statistics(self) -> None:
        """Compute table statistics."""
        sql = SQLDefinitions.compute_table_stats.value.format(
            self.configs["table_or_view"]
        )
        try:
            self._logger.info(f"sql command: {sql}")
            ExecEnv.SESSION.sql(sql)
            self._logger.info(f"{self.function} successfully executed!")
        except Exception as e:
            self._logger.error(e)
            raise

    def drop_table(self) -> None:
        """Drop the table given in the `table_or_view` config."""
        drop_stmt = "{} {}".format(
            SQLDefinitions.drop_table_stmt.value,
            self.configs["table_or_view"],
        )

        self._logger.info(f"sql command: {drop_stmt}")
        ExecEnv.SESSION.sql(drop_stmt)
        self._logger.info("Table successfully dropped!")

    def drop_view(self) -> None:
        """Drop the view given in the `table_or_view` config."""
        drop_stmt = "{} {}".format(
            SQLDefinitions.drop_view_stmt.value,
            self.configs["table_or_view"],
        )

        self._logger.info(f"sql command: {drop_stmt}")
        ExecEnv.SESSION.sql(drop_stmt)
        self._logger.info("View successfully dropped!")

    def truncate(self) -> None:
        """Truncate function erases all data but keeps metadata."""
        truncate_stmt = "{} {}".format(
            SQLDefinitions.truncate_stmt.value,
            self.configs["table_or_view"],
        )

        self._logger.info(f"sql command: {truncate_stmt}")
        ExecEnv.SESSION.sql(truncate_stmt)
        self._logger.info("Table successfully truncated!")

    def vacuum(self) -> None:
        """Vacuum function erases older versions from Delta Lake tables or locations.

        The table in `table_or_view` is vacuumed when configured, otherwise the
        location in `path`. Retention comes from `vacuum_hours` (default 168).
        """
        table_or_view = self.configs.get("table_or_view", None)
        if not table_or_view:
            delta_table = DeltaTable.forPath(ExecEnv.SESSION, self.configs["path"])
            self._logger.info(f"Vacuuming location: {self.configs['path']}")
        else:
            delta_table = DeltaTable.forName(ExecEnv.SESSION, table_or_view)
            self._logger.info(f"Vacuuming table: {table_or_view}")

        delta_table.vacuum(self.configs.get("vacuum_hours", 168))

    def describe(self) -> None:
        """Describe function describes metadata from some table or view."""
        describe_stmt = "{} {}".format(
            SQLDefinitions.describe_stmt.value,
            self.configs["table_or_view"],
        )

        self._logger.info(f"sql command: {describe_stmt}")
        output = ExecEnv.SESSION.sql(describe_stmt)
        self._logger.info(output)

    def optimize(self) -> None:
        """Optimize function optimizes the layout of Delta Lake data.

        Targets `table_or_view` when configured, otherwise the delta location
        in `path`. `where_clause` and `optimize_zorder_col_list` are optional.
        """
        if self.configs.get("where_clause", None):
            where_exp = "WHERE {}".format(self.configs["where_clause"].strip())
        else:
            where_exp = ""

        if self.configs.get("optimize_zorder_col_list", None):
            zorder_exp = "ZORDER BY ({})".format(
                self.configs["optimize_zorder_col_list"].strip()
            )
        else:
            zorder_exp = ""

        optimize_stmt = "{} {} {} {}".format(
            SQLDefinitions.optimize_stmt.value,
            (
                f"delta.`{self.configs.get('path', None)}`"
                if not self.configs.get("table_or_view", None)
                else self.configs.get("table_or_view", None)
            ),
            where_exp,
            zorder_exp,
        )

        self._logger.info(f"sql command: {optimize_stmt}")
        output = ExecEnv.SESSION.sql(optimize_stmt)
        self._logger.info(output)

    def execute_multiple_sql_files(self) -> None:
        """Execute multiple statements in multiple sql files.

        In this function the path to the files is separated by comma.
        """
        disable_dbfs_retry = self.configs.get("disable_dbfs_retry", False)
        for table_metadata_file in self.configs["path"].split(","):
            sql = ConfigUtils.read_sql(table_metadata_file.strip(), disable_dbfs_retry)
            self._run_sql_commands(sql)
            self._logger.info("sql file successfully executed!")

    def execute_sql(self) -> None:
        """Execute the commands of the `sql` config, split by the delimiter (`;`)."""
        self._run_sql_commands(self.configs.get("sql"))
        self._logger.info("sql successfully executed!")

    def show_tbl_properties(self) -> DataFrame:
        """Show Table Properties.

        Returns:
            A dataframe with the table properties.
        """
        show_tbl_props_stmt = "{} {}".format(
            SQLDefinitions.show_tbl_props_stmt.value,
            self.configs["table_or_view"],
        )

        self._logger.info(f"sql command: {show_tbl_props_stmt}")
        output = ExecEnv.SESSION.sql(show_tbl_props_stmt)
        self._logger.info(output)
        return output

    def get_tbl_pk(self) -> List[str]:
        """Get the primary key of a particular table.

        The key is read from the `lakehouse.primary_key` table property, with
        spaces and backticks removed, and split by comma.

        Returns:
            The list of columns that are part of the primary key.

        Raises:
            ValueError: if the table has no `lakehouse.primary_key` property.
        """
        pk_row = (
            self.show_tbl_properties()
            .filter("key == 'lakehouse.primary_key'")
            .select("value")
            .withColumn("value", translate("value", " `", ""))
            .first()
        )
        if pk_row is None:
            # Fail with an explicit message instead of an opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(
                f"Table {self.configs['table_or_view']} does not define the "
                "'lakehouse.primary_key' table property."
            )

        output: List[str] = pk_row[0].split(",")
        self._logger.info(output)

        return output

    def repair_table(self) -> None:
        """Run the repair table command.

        `SYNC METADATA` is appended when the optional `sync_metadata` config
        is truthy.
        """
        table_name = self.configs["table_or_view"]
        sync_metadata = self.configs.get("sync_metadata", False)

        repair_stmt = (
            f"MSCK REPAIR TABLE {table_name} "
            f"{'SYNC METADATA' if sync_metadata else ''}"
        )

        self._logger.info(f"sql command: {repair_stmt}")
        output = ExecEnv.SESSION.sql(repair_stmt)
        self._logger.info(output)

    def delete_where(self) -> None:
        """Run the delete where command."""
        table_name = self.configs["table_or_view"]
        delete_where = self.configs["where_clause"].strip()

        delete_stmt = SQLDefinitions.delete_where_stmt.value.format(
            table_name, delete_where
        )

        self._logger.info(f"sql command: {delete_stmt}")
        output = ExecEnv.SESSION.sql(delete_stmt)
        self._logger.info(output)
class ColumnPairCustom(ColumnPairMapMetricProvider):
    """Metric asserting that column 'A' is different from column 'B'.

    Rows where both columns are null do not satisfy the condition.
    """

    condition_metric_name = "column_pair_values.a_not_equal_to_b"
    condition_domain_keys = (
        "batch_id",
        "table",
        "column_A",
        "column_B",
        "ignore_row_if",
    )
    condition_value_keys = ()

    @column_pair_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: ColumnPairMapMetricProvider,
        column_A: Any,
        column_B: Any,
        **kwargs: dict,
    ) -> Any:
        """Build the Spark condition of the expectation.

        Args:
            column_A: Value of the row of column_A.
            column_B: Value of the row of column_B.
            kwargs: dict with additional parameters.

        Returns:
            If the condition is met.
        """
        at_least_one_present = (column_A.isNotNull()) | (column_B.isNotNull())
        values_differ = column_A != column_B
        # NOTE(review): when exactly one side is null, `!=` yields null in
        # Spark SQL, so the whole condition is null rather than true; confirm
        # this is the intended classification for such rows.
        return at_least_one_present & values_differ
class ExpectColumnPairAToBeNotEqualToB(ColumnPairMapExpectation):
    """Expect values in column A to be not equal to column B.

    The check is backed by the `column_pair_values.a_not_equal_to_b` metric.

    Args:
        column_A: The first column name.
        column_B: The second column name.

    Keyword Args:
        allow_cross_type_comparisons: If True, allow
            comparisons between types (e.g. integer and string).
            Otherwise, attempting such comparisons will raise an exception.
        ignore_row_if: "both_values_are_missing",
            "either_value_is_missing", "neither" (default).
        result_format: Which output mode to use:
            `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
        include_config: If True (default), then include the expectation config
            as part of the result object.
        catch_exceptions: If True, then catch exceptions and
            include them as part of the result object. Default: False.
        meta: A JSON-serializable dictionary (nesting allowed)
            that will be included in the output without modification.

    Returns:
        An ExpectationSuiteValidationResult.
    """

    # Default values of the expectation arguments.
    mostly: float = 1.0
    ignore_row_if: str = "neither"
    result_format: dict = {"result_format": "BASIC"}
    include_config: bool = True
    catch_exceptions: bool = False
    column_A: Any = None
    column_B: Any = None

    # Sample dataset plus one failing (a vs b, identical values) and one
    # passing (a vs c, all values differ) test case.
    examples = [
        {
            "dataset_name": "Test Dataset",
            "data": [
                {
                    "data": {
                        "a": ["IE4019", "IM6092", "IE1405"],
                        "b": ["IE4019", "IM6092", "IE1405"],
                        "c": ["IE1404", "IN6192", "842075"],
                    },
                    "schemas": {
                        "spark": {
                            "a": "StringType",
                            "b": "StringType",
                            "c": "StringType",
                        }
                    },
                }
            ],
            "tests": [
                {
                    "title": "negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_A": "a",
                        "column_B": "b",
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["b"],
                        },
                    },
                    "out": {
                        "success": False,
                        "unexpected_index_list": [
                            {"b": "IE4019", "a": "IE4019"},
                            {"b": "IM6092", "a": "IM6092"},
                            {"b": "IE1405", "a": "IE1405"},
                        ],
                    },
                },
                {
                    "title": "positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_A": "a",
                        "column_B": "c",
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["a"],
                        },
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [],
                    },
                },
            ],
        },
    ]

    # Must match `condition_metric_name` of the metric provider above.
    map_metric = "column_pair_values.a_not_equal_to_b"
    success_keys = (
        "column_A",
        "column_B",
        "ignore_row_if",
        "mostly",
    )

    def _validate(
        self,
        metrics: Dict,
        runtime_configuration: Optional[dict] = None,
        execution_engine: Optional[ExecutionEngine] = None,
    ) -> Any:
        """Custom implementation of the GE _validate method.

        This method is used on the tests to validate both the result
        of the tests themselves and if the unexpected index list
        is correctly generated.
        The GE test logic does not do this validation, and thus
        we need to make it manually.

        Args:
            metrics: Test result metrics.
            runtime_configuration: Configuration used when running the expectation.
            execution_engine: Execution Engine where the expectation was run.

        Returns:
            Dictionary with the result of the validation.
        """
        # Project-level check, run before delegating to the default GE
        # validation.
        validate_result(
            self,
            metrics,
        )

        return super()._validate(metrics, runtime_configuration, execution_engine)
class ColumnPairCustom(ColumnPairMapMetricProvider):
    """Metric asserting that column 'A' is lower or equal than column 'B'.

    An optional 'margin' can be given to relax the comparison:
    'A' <= 'B' + 'margin'.
    """

    condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"
    condition_domain_keys = (
        "batch_id",
        "table",
        "column_A",
        "column_B",
        "ignore_row_if",
    )
    condition_value_keys = ("margin",)

    @column_pair_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: ColumnPairMapMetricProvider,
        column_A: Any,
        column_B: Any,
        **kwargs: dict,
    ) -> Any:
        """Build the Spark condition of the expectation.

        Args:
            column_A: Value of the row of column_A.
            column_B: Value of the row of column_B.
            kwargs: dict with additional parameters; `margin` is read from it.

        Returns:
            If the condition is met.

        Raises:
            TypeError: if a truthy margin is not an int, float or complex.
        """
        margin = kwargs.get("margin")

        if not margin:
            # A missing or falsy margin means no tolerance at all.
            approx = 0
        elif isinstance(margin, (int, float, complex)):
            approx = margin  # type: ignore
        else:
            raise TypeError(
                f"margin must be one of int, float, complex."
                f" Found: {margin} as {type(margin)}"
            )

        return column_A <= column_B + approx  # type: ignore
class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):
    """Expect values in column A to be lower or equal than column B.

    The check is backed by the `column_pair_values.a_smaller_or_equal_than_b`
    metric.

    Args:
        column_A: The first column name.
        column_B: The second column name.
        margin: additional approximation to column B value.

    Keyword Args:
        allow_cross_type_comparisons: If True, allow
            comparisons between types (e.g. integer and string).
            Otherwise, attempting such comparisons will raise an exception.
        ignore_row_if: "both_values_are_missing",
            "either_value_is_missing", "neither" (default).
        result_format: Which output mode to use:
            `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
        include_config: If True (default), then include the expectation config
            as part of the result object.
        catch_exceptions: If True, then catch exceptions and
            include them as part of the result object. Default: False.
        meta: A JSON-serializable dictionary (nesting allowed)
            that will be included in the output without modification.

    Returns:
        An ExpectationSuiteValidationResult.
    """

    # Default values of the expectation arguments.
    mostly: float = 1.0
    ignore_row_if: str = "neither"
    result_format: dict = {"result_format": "BASIC"}
    include_config: bool = True
    catch_exceptions: bool = False
    margin: Any = None
    column_A: Any = None
    column_B: Any = None

    # Sample dataset plus one failing (a vs c, no margin) and one passing
    # (a vs b with a margin of 1) test case.
    examples = [
        {
            "dataset_name": "Test Dataset",
            "data": [
                {
                    "data": {
                        "a": [11, 22, 50],
                        "b": [10, 21, 100],
                        "c": [9, 21, 30],
                    },
                    "schemas": {
                        "spark": {
                            "a": "IntegerType",
                            "b": "IntegerType",
                            "c": "IntegerType",
                        }
                    },
                }
            ],
            "tests": [
                {
                    "title": "negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_A": "a",
                        "column_B": "c",
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["c"],
                        },
                    },
                    "out": {
                        "success": False,
                        "unexpected_index_list": [
                            {"c": 9, "a": 11},
                            {"c": 21, "a": 22},
                            {"c": 30, "a": 50},
                        ],
                    },
                },
                {
                    "title": "positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_A": "a",
                        "column_B": "b",
                        "margin": 1,
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["a"],
                        },
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [],
                    },
                },
            ],
        },
    ]

    # Must match `condition_metric_name` of the metric provider above.
    map_metric = "column_pair_values.a_smaller_or_equal_than_b"
    success_keys = (
        "column_A",
        "column_B",
        "ignore_row_if",
        "margin",
        "mostly",
    )

    def _validate(
        self,
        metrics: Dict,
        runtime_configuration: Optional[dict] = None,
        execution_engine: Optional[ExecutionEngine] = None,
    ) -> Any:
        """Custom implementation of the GE _validate method.

        This method is used on the tests to validate both the result
        of the tests themselves and if the unexpected index list
        is correctly generated.
        The GE test logic does not do this validation, and thus
        we need to make it manually.

        Args:
            metrics: Test result metrics.
            runtime_configuration: Configuration used when running the expectation.
            execution_engine: Execution Engine where the expectation was run.

        Returns:
            Dictionary with the result of the validation.
        """
        # Project-level check, run before delegating to the default GE
        # validation.
        validate_result(
            self,
            metrics,
        )

        return super()._validate(metrics, runtime_configuration, execution_engine)
class ColumnPairDateAToBeGreaterOrEqualToDateB(ColumnPairMapMetricProvider):
    """Metric asserting that date column 'A' is greater or equal to date column 'B'.

    The metric is registered under the name stored in `condition_metric_name`,
    which is the same id the matching expectation refers to in `map_metric`.
    """

    # Id string used to refer to this metric.
    condition_metric_name = "column_pair_values.date_a_greater_or_equal_to_date_b"
    condition_domain_keys = (
        "batch_id",
        "table",
        "column_A",
        "column_B",
        "ignore_row_if",
    )

    @column_pair_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: ColumnPairMapMetricProvider,
        column_A: Any,
        column_B: Any,
        **kwargs: dict,
    ) -> Any:
        """Build the row-level condition for the Spark execution engine.

        A row satisfies the condition only when both values are present and
        the value of column_A is greater than or equal to the one of column_B.

        Args:
            column_A: Value of the row of column_A.
            column_B: Value of the row of column_B.
            kwargs: dict with additional parameters (not used here).

        Returns:
            Boolean column expression with the outcome of the condition.
        """
        both_present = column_A.isNotNull() & column_B.isNotNull()
        a_not_before_b = column_A >= column_B
        return both_present & a_not_before_b  # type: ignore
"unexpected_index_list": [ { "a": datetime.date(2024, 11, 21), "b": datetime.date(2014, 12, 22), "c": datetime.date(2015, 12, 22), }, { "a": datetime.date(2022, 1, 1), "b": datetime.date(2012, 9, 9), "c": datetime.date(2022, 9, 9), }, ], }, }, ], } ] map_metric = "column_pair_values.date_a_greater_or_equal_to_date_b" success_keys = ( "column_A", "column_B", "ignore_row_if", "mostly", ) def _validate( self, metrics: Dict, runtime_configuration: Optional[dict] = None, execution_engine: Optional[ExecutionEngine] = None, ) -> Any: """Custom implementation of the GE _validate method. This method is used on the tests to validate both the result of the tests themselves and if the unexpected index list is correctly generated. The GE test logic does not do this validation, and thus we need to make it manually. Args: metrics: Test result metrics. runtime_configuration: Configuration used when running the expectation. execution_engine: Execution Engine where the expectation was run. Returns: Dictionary with the result of the validation. """ validate_result( self, metrics, ) return super()._validate(metrics, runtime_configuration, execution_engine) """Mandatory block of code. 
class ColumnValuesDateNotOlderThan(ColumnMapMetricProvider):
    """Asserts that column values are a date that isn't older than a given date."""

    condition_metric_name = "column_values.date_is_not_older_than"
    condition_domain_keys = (
        "batch_id",
        "table",
        "column",
        "ignore_row_if",
    )  # type: ignore
    condition_value_keys = ("timeframe",)

    @column_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: ColumnMapMetricProvider,
        column: Any,
        **kwargs: dict,
    ) -> Any:
        """Implementation of the expectation's logic.

        Since timedelta can only define an interval up to weeks, a month is defined
        as 4 weeks and a year is defined as 52 weeks.

        The accepted timeframe keys are: years, months, weeks, days, hours,
        minutes, seconds, milliseconds and microseconds. Every key that is not
        supplied counts as 0.

        Args:
            column: Name of column to validate.
            kwargs: dict with additional parameters. The "timeframe" entry holds
                the dict that defines the allowed age of the values.

        Returns:
            If the condition is met, i.e. the time elapsed between the column
            value and now is strictly smaller than the timeframe.
        """
        # A missing, None or empty timeframe is treated as an empty dict so that
        # every unit falls back to 0, instead of failing with AttributeError on
        # `None.get(...)`.
        timeframe: Any = kwargs.get("timeframe") or {}

        # timedelta has no notion of months/years, so both are folded into weeks.
        weeks = (
            timeframe.get("weeks", 0)
            + (timeframe.get("months", 0) * 4)
            + (timeframe.get("years", 0) * 52)
        )

        delta = timedelta(
            days=timeframe.get("days", 0),
            seconds=timeframe.get("seconds", 0),
            microseconds=timeframe.get("microseconds", 0),
            milliseconds=timeframe.get("milliseconds", 0),
            minutes=timeframe.get("minutes", 0),
            hours=timeframe.get("hours", 0),
            weeks=weeks,
        )

        # datetime.now() is a naive timestamp taken where this code runs.
        # NOTE(review): confirm it lines up with the timezone of the column values.
        return delta > (datetime.datetime.now() - column)
""" mostly: float = 1.0 ignore_row_if: str = "neither" result_format: dict = {"result_format": "BASIC"} include_config: bool = True catch_exceptions: bool = False timeframe: Any = {} column: Any = None examples = [ { "dataset_name": "Test Dataset", "data": [ { "data": { "a": [ datetime.datetime(2023, 6, 1, 12, 0, 0), datetime.datetime(2023, 6, 2, 12, 0, 0), datetime.datetime(2023, 6, 3, 12, 0, 0), ], "b": [ datetime.datetime(1800, 6, 1, 12, 0, 0), datetime.datetime(2023, 6, 2, 12, 0, 0), datetime.datetime(1800, 6, 3, 12, 0, 0), ], } } ], "schemas": {"spark": {"a": "TimestampType", "b": "TimestampType"}}, "tests": [ { "title": "positive_test", "exact_match_out": False, "include_in_gallery": True, "in": { "column": "a", "timeframe": {"years": 100}, "result_format": { "result_format": "BASIC", "unexpected_index_column_names": ["b"], }, }, "out": { "success": True, "unexpected_index_list": [], }, }, { "title": "negative_test", "exact_match_out": False, "include_in_gallery": True, "in": { "column": "b", "timeframe": {"years": 100}, "result_format": { "result_format": "COMPLETE", "unexpected_index_column_names": ["a"], }, }, "out": { "success": False, "unexpected_index_list": [ { "a": datetime.datetime(2023, 6, 1, 12, 0), "b": datetime.datetime(1800, 6, 1, 12, 0), }, { "a": datetime.datetime(2023, 6, 3, 12, 0), "b": datetime.datetime(1800, 6, 3, 12, 0), }, ], }, }, ], }, ] map_metric = "column_values.date_is_not_older_than" success_keys = ("column", "ignore_row_if", "timeframe", "mostly") def _validate( self, metrics: Dict, runtime_configuration: Optional[dict] = None, execution_engine: Optional[ExecutionEngine] = None, ) -> Any: """Custom implementation of the GE _validate method. This method is used on the tests to validate both the result of the tests themselves and if the unexpected index list is correctly generated. The GE test logic does not do this validation, and thus we need to make it manually. Args: metrics: Test result metrics. 
class ColumnValuesNotNullOrEpmtyString(ColumnMapMetricProvider):
    """Metric asserting that column values are neither null nor an empty string."""

    condition_metric_name = "column_values.not_null_or_empty_string"
    filter_column_isnull = False
    condition_domain_keys = (
        "batch_id",
        "table",
        "column",
        "ignore_row_if",
    )  # type: ignore
    condition_value_keys = ()

    @column_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: ColumnMapMetricProvider,
        column: Any,
        **kwargs: dict,
    ) -> Any:
        """Build the row-level condition for the Spark execution engine.

        Args:
            column: Name of column to validate.
            kwargs: dict with additional parameters (not used here).

        Returns:
            Boolean column expression that holds when the value is present and
            different from the empty string.
        """
        is_present = column.isNotNull()
        is_not_blank = column != ""
        return is_present & is_not_blank
""" mostly: float = 1.0 ignore_row_if: str = "neither" result_format: dict = {"result_format": "BASIC"} include_config: bool = True catch_exceptions: bool = False column: Any = None examples = [ { "dataset_name": "Test Dataset", "data": [ { "data": { "a": [ "4061622965678", "4061622965679", "4061622965680", ], "b": [ "4061622965678", "", "4061622965680", ], } } ], "schemas": {"spark": {"a": "StringType", "b": "StringType"}}, "tests": [ { "title": "positive_test", "exact_match_out": False, "include_in_gallery": True, "in": { "column": "a", "result_format": { "result_format": "BASIC", "unexpected_index_column_names": ["b"], }, }, "out": { "success": True, "unexpected_index_list": [], }, }, { "title": "negative_test", "exact_match_out": False, "include_in_gallery": True, "in": { "column": "b", "result_format": { "result_format": "COMPLETE", "unexpected_index_column_names": ["a"], }, }, "out": { "success": False, "unexpected_index_list": [ { "a": "4061622965679", "b": "", } ], }, }, ], }, ] map_metric = "column_values.not_null_or_empty_string" success_keys = ("column", "ignore_row_if", "mostly") def _validate( self, metrics: Dict, runtime_configuration: Optional[dict] = None, execution_engine: Optional[ExecutionEngine] = None, ) -> Any: """Custom implementation of the GE _validate method. This method is used on the tests to validate both the result of the tests themselves and if the unexpected index list is correctly generated. The GE test logic does not do this validation, and thus we need to make it manually. Args: metrics: Test result metrics. runtime_configuration: Configuration used when running the expectation. execution_engine: Execution Engine where the expectation was run. Returns: Dictionary with the result of the validation. """ validate_result( self, metrics, ) return super()._validate(metrics, runtime_configuration, execution_engine) """Mandatory block of code. 
class MulticolumnCustomMetric(MulticolumnMapMetricProvider):
    """Expectation metric definition.

    This expectation asserts that column 'a' must equal to column 'b' or column 'c'.
    In addition to this it is possible to validate that column 'b' or 'c' match a
    regex.
    """

    condition_metric_name = "multicolumn_values.column_a_must_equal_b_or_c"
    condition_domain_keys = (
        "batch_id",
        "table",
        "column_list",
        "ignore_row_if",
    )
    condition_value_keys = ("validation_regex_b", "validation_regex_c")

    @multicolumn_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: MulticolumnMapMetricProvider, column_list: list, **kwargs: dict
    ) -> Any:
        """Implementation of the expectation's logic.

        The first three entries of column_list are used, in this order: 'a', 'b'
        and 'c'. A row is valid when 'a' is not null and either:
            - 'b' is not null, matches validation_regex_b and equals 'a'; or
            - 'b' is null, and 'c' matches validation_regex_c and equals 'a'.

        Args:
            column_list: columns to evaluate ('a', 'b' and 'c').
            kwargs: dict with additional parameters. "validation_regex_b" and
                "validation_regex_c" are read from it.

        Returns:
            Boolean column expression with the outcome of the condition.
        """
        # Fall back to a match-everything pattern not only when the key is
        # absent, but also when it is present without a usable value (None or
        # empty), so that rlike never receives None.
        regex_b: Any = kwargs.get("validation_regex_b") or ".*"
        regex_c: Any = kwargs.get("validation_regex_c") or ".*"

        col_a, col_b, col_c = column_list[0], column_list[1], column_list[2]

        equals_b = col_b.isNotNull() & (col_b.rlike(regex_b)) & (col_a == col_b)
        equals_c = (col_b.isNull()) & (col_c.rlike(regex_c)) & (col_a == col_c)

        return (col_a.isNotNull()) & (equals_b | equals_c)
""" # noqa: E501 ignore_row_if: Literal[ "all_values_are_missing", "any_value_is_missing", "never" ] = "never" result_format: dict = {"result_format": "BASIC"} include_config: bool = True catch_exceptions: bool = False mostly: float = 1.0 column_list: Any = None validation_regex_c: Any = None examples = [ { "dataset_name": "Test Dataset", "data": [ { "data": { "a": ["d001", "1000", "1001"], "b": [None, "1000", "1001"], "c": ["d001", "d002", "d002"], "d": ["d001", "d002", "1001"], }, "schemas": { "spark": { "a": "StringType", "b": "StringType", "c": "StringType", "d": "StringType", } }, } ], "tests": [ { "title": "negative_test", "exact_match_out": False, "include_in_gallery": True, "in": { "column_list": ["d", "b", "c"], "validation_regex_c": "d[0-9]{3}$", "result_format": { "result_format": "COMPLETE", "unexpected_index_column_names": ["d", "b", "c"], }, }, "out": { "success": False, "unexpected_index_list": [ { "d": "d002", "b": "1000", "c": "d002", } ], }, }, { "title": "positive_test", "exact_match_out": False, "include_in_gallery": True, "in": { "column_list": ["a", "b", "c"], "validation_regex_c": "d[0-9]{3}$", "result_format": { "result_format": "COMPLETE", "unexpected_index_column_names": ["a", "b", "c"], }, }, "out": {"success": True}, }, ], }, ] map_metric = "multicolumn_values.column_a_must_equal_b_or_c" success_keys = ( "validation_regex_b", "validation_regex_c", "mostly", ) # type: ignore def _validate( self, metrics: Dict, runtime_configuration: Optional[dict] = None, execution_engine: Optional[ExecutionEngine] = None, ) -> Any: """Custom implementation of the GE _validate method. This method is used on the tests to validate both the result of the tests themselves and if the unexpected index list is correctly generated. The GE test logic does not do this validation, and thus we need to make it manually. Args: metrics: Test result metrics. runtime_configuration: Configuration used when running the expectation. 
class ExpectQueriedColumnAggValueToBe(QueryExpectation):
    """Expect agg of column to satisfy the condition specified.

    The query groups the batch by `group_column_list`, aggregates `column` with
    `agg_type` and every resulting group value is checked against the condition.

    Args:
        template_dict: dict with the following keys:
            - column (column to check sum).
            - group_column_list (group by column names to be listed).
            - condition (how to validate the aggregated value eg: between,
                greater, lesser).
            - max_value (maximum allowed value).
            - min_value (minimum allowed value).
            - agg_type (sum/count/max/min).
    """

    metric_dependencies = ("query.template_values",)

    query_temp = """
            SELECT {group_column_list}, {agg_type}({column})
            FROM {batch}
            GROUP BY {group_column_list}
            """

    include_config: bool = True
    mostly: float = 1.0
    result_format: dict = {"result_format": "BASIC"}
    catch_exceptions: bool = False
    meta: Any = None
    query: str = query_temp
    template_dict: Any = None

    success_keys = ("template_dict", "query")
    condition_domain_keys = (
        "query",
        "template_dict",
        "batch_id",
        "row_condition",
        "condition_parser",
    )

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration] = None
    ) -> None:
        """Validates that a configuration has been set.

        Args:
            configuration (OPTIONAL[ExpectationConfiguration]):
                An optional Expectation Configuration entry.

        Returns:
            None. Raises InvalidExpectationConfigurationError
        """
        super().validate_configuration(configuration)

    @staticmethod
    def _validate_between(
        x: str, y: int, expected_max_value: int, expected_min_value: int
    ) -> dict:
        """Method to check whether value satisfy the between condition.

        Both boundaries are inclusive.

        Args:
            x: contains key of dict(query_result).
            y: contains value of dict(query_result).
            expected_max_value: max value passed.
            expected_min_value: min value passed.

        Returns:
            dict with the results after being validated.
        """
        # The messages are built with implicit string concatenation: a backslash
        # line continuation inside the literal would embed the indentation of
        # the following source line into the message.
        if expected_min_value <= y <= expected_max_value:
            return {
                "info": "Value is within range "
                f"{expected_min_value} and {expected_max_value}",
                "success": True,
            }

        return {
            "success": False,
            "result": {
                "info": "Value not in range "
                f"{expected_min_value} and {expected_max_value}",
                "observed_value": (x, y),
            },
        }

    @staticmethod
    def _validate_lesser(x: str, y: int, expected_max_value: int) -> dict:
        """Method to check whether value satisfy the less condition.

        The comparison is strict: a value equal to the maximum fails.

        Args:
            x: contains key of dict(query_result).
            y: contains value of dict(query_result).
            expected_max_value: max value passed.

        Returns:
            dict with the results after being validated.
        """
        if y < expected_max_value:
            return {
                "info": f"Value is lesser than {expected_max_value}",
                "success": True,
            }

        return {
            "success": False,
            "result": {
                "info": f"Value is greater than {expected_max_value}",
                "observed_value": (x, y),
            },
        }

    @staticmethod
    def _validate_greater(x: str, y: int, expected_min_value: int) -> dict:
        """Method to check whether value satisfy the greater condition.

        The comparison is strict: a value equal to the minimum fails.

        Args:
            x: contains key of dict(query_result).
            y: contains value of dict(query_result).
            expected_min_value: min value passed.

        Returns:
            dict with the results after being validated.
        """
        if y > expected_min_value:
            return {
                "info": f"Value is greater than {expected_min_value}",
                "success": True,
            }

        return {
            "success": False,
            "result": {
                "info": f"Value is less than {expected_min_value}",
                "observed_value": (x, y),
            },
        }

    def _validate_condition(self, query_result: dict, template_dict: dict) -> dict:
        """Method to check whether value satisfy the expected result.

        Every group is validated and the first failing group is reported. Only
        when all the groups satisfy the condition a successful result is
        returned. Any condition other than "between" or "lesser" is handled as
        "greater".

        Args:
            query_result: contains dict of key and value.
            template_dict: contains dict of input provided.

        Returns:
            dict with the results after being validated.
        """
        # NOTE(review): an empty query_result yields an empty dict (without the
        # "success" key) - confirm whether that is the desired outcome.
        result: Dict[Any, Any] = {}

        for group_key, agg_value in query_result.items():
            condition_check = template_dict["condition"]

            if condition_check == "between":
                result = self._validate_between(
                    group_key,
                    agg_value,
                    template_dict["max_value"],
                    template_dict["min_value"],
                )
            elif condition_check == "lesser":
                result = self._validate_lesser(
                    group_key, agg_value, template_dict["max_value"]
                )
            else:
                result = self._validate_greater(
                    group_key, agg_value, template_dict["min_value"]
                )

            # Stop at the first failure, otherwise a later group that passes
            # would overwrite (and hide) the failure of an earlier group.
            if not result["success"]:
                return result

        return result

    @staticmethod
    def _generate_dict(query_result: list) -> dict:
        """Generate a dict from a list of dicts and merge the group by columns values.

        For each row, the last value is the aggregated value and all the values
        before it are the group by values, which are joined with "|" to build
        the key.

        Args:
            query_result: contains list of dict values obtained from query.

        Returns: Dict

        Example:
            input: [dict_values(['Male', 25, 3500]),
                    dict_values(['Female', 25, 6200]),
                    dict_values(['Female', 20, 3500]),
                    dict_values(['Male', 20, 6900])].
            output: {'Male|25': 3500, 'Female|25': 6200,
                    'Female|20': 3500, 'Male|20': 6900}.
        """
        new_result = {}

        for row in query_result:
            *group_values, agg_value = row
            key = "|".join(map(str, group_values))
            # The value goes through its string form before being converted, so
            # anything whose text is not an integer (e.g. a float or None)
            # raises ValueError.
            new_result[key] = int(str(agg_value))

        return new_result

    def _validate(
        self,
        metrics: dict,
        runtime_configuration: Optional[dict] = None,
        execution_engine: Optional[ExecutionEngine] = None,
    ) -> ExpectationValidationResult | dict:
        """Implementation of the GE _validate method.

        This method is used on the tests to validate the result
        of the query output.

        Args:
            metrics: Test result metrics.
            runtime_configuration: Configuration used when running the expectation.
            execution_engine: Execution Engine where the expectation was run.

        Returns:
            Dictionary with the result of the validation.
        """
        query_result = metrics.get("query.template_values")
        query_result = [element.values() for element in query_result]
        query_result = self._generate_dict(query_result)
        template_dict = self._validate_template_dict(self)
        output = self._validate_condition(query_result, template_dict)

        return output

    @staticmethod
    def _validate_template_dict(self: Any) -> dict:
        """Validate the template dict.

        Args:
            self: object holding the `template_dict` attribute to validate. It
                is passed explicitly because this is a static method.

        Returns:
            Dict. Raises TypeError and KeyError
        """
        template_dict = self.template_dict

        if not isinstance(template_dict, dict):
            raise TypeError("template_dict must be supplied as a dict")

        required_keys = ("column", "group_column_list", "agg_type", "condition")
        if any(key not in template_dict for key in required_keys):
            raise KeyError(
                "The following keys have to be in the "
                "template dict: column, group_column_list, condition, agg_type"
            )

        return template_dict

    examples = [
        {
            "dataset_name": "Test Dataset",
            "data": [
                {
                    "data": {
                        "ID": [1, 2, 3, 4, 5, 6],
                        "Names": [
                            "Ramesh",
                            "Nasser",
                            "Jessica",
                            "Komal",
                            "Jude",
                            "Muffy",
                        ],
                        "Age": [25, 25, 25, 20, 20, 25],
                        "Gender": [
                            "Male",
                            "Male",
                            "Female",
                            "Female",
                            "Male",
                            "Female",
                        ],
                        "Salary": [1000, 2500, 5000, 3500, 6900, 1200],
                    },
                    "schemas": {
                        "spark": {
                            "ID": "IntegerType",
                            "Names": "StringType",
                            "Age": "IntegerType",
                            "Gender": "StringType",
                            "Salary": "IntegerType",
                        }
                    },
                }
            ],
            "tests": [
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "template_dict": {
                            "column": "Salary",
                            "group_column_list": "Gender",
                            "agg_type": "sum",
                            "condition": "greater",
                            "min_value": 2000,
                        },
                        "result_format": {
                            "result_format": "COMPLETE",
                        },
                    },
                    "out": {"success": True},
                    "only_for": ["spark"],
                },
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "template_dict": {
                            "column": "Salary",
                            "group_column_list": "Gender,Age",
                            "agg_type": "sum",
                            "condition": "between",
                            "max_value": 7000,
                            "min_value": 2000,
                        },
                        "result_format": {
                            "result_format": "COMPLETE",
                        },
                    },
                    "out": {"success": True},
                    "only_for": ["spark"],
                },
                {
                    "title": "basic_positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "template_dict": {
                            "column": "Salary",
                            "group_column_list": "Age",
                            "agg_type": "max",
                            "condition": "lesser",
                            "max_value": 10000,
                        },
                        "result_format": {
                            "result_format": "COMPLETE",
                        },
                    },
                    "out": {"success": True},
                    "only_for": ["spark"],
                },
                {
                    "title": "basic_negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "template_dict": {
                            "column": "Salary",
                            "group_column_list": "Gender",
                            "agg_type": "count",
                            "condition": "greater",
                            "min_value": 4,
                        },
                        "result_format": {
                            "result_format": "COMPLETE",
                        },
                    },
                    "out": {"success": False},
                    "only_for": ["sqlite", "spark"],
                },
                {
                    "title": "basic_negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "template_dict": {
                            "column": "Salary",
                            "group_column_list": "Gender,Age",
                            "agg_type": "sum",
                            "condition": "between",
                            "max_value": 2000,
                            "min_value": 1000,
                        },
                        "result_format": {
                            "result_format": "COMPLETE",
                        },
                    },
                    "out": {"success": False},
                    "only_for": ["spark"],
                },
            ],
        },
    ]

    library_metadata = {
        "tags": ["query-based"],
    }
import FloatType, StringType from lakehouse_engine.core.definitions import ( DQDefaults, DQFunctionSpec, DQResultFormat, DQSpec, DQType, OutputSpec, WriteType, ) from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.core.table_manager import TableManager from lakehouse_engine.dq_processors.exceptions import DQValidationsFailedException from lakehouse_engine.dq_processors.validator import Validator from lakehouse_engine.io.writer_factory import WriterFactory from lakehouse_engine.utils.logging_handler import LoggingHandler class DQFactory(object): """Class for the Data Quality Factory.""" _LOGGER = LoggingHandler(__name__).get_logger() _TIMESTAMP = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S") @classmethod def _add_critical_function_tag(cls, args: dict) -> dict: """Add tags to function considered critical. Adds a tag to each of the functions passed on the dq_specs to denote that they are critical_functions. This means that if any of them fails, the dq process will fail, even if the threshold is not surpassed. This is done by adding a tag to the meta dictionary of the expectation configuration. Args: args: arguments passed on the dq_spec Returns: A dictionary with the args with the critical function tag. """ if "meta" in args.keys(): meta = args["meta"] if isinstance(meta["notes"], str): meta["notes"] = meta["notes"] + " **Critical function**." else: meta["notes"]["content"] = ( meta["notes"]["content"] + " **Critical function**." ) args["meta"] = meta return args else: args["meta"] = { "notes": { "format": "markdown", "content": "**Critical function**.", } } return args @classmethod def _configure_checkpoint( cls, context: EphemeralDataContext, dataframe_bd: BatchDefinition, suite: ExpectationSuite, dq_spec: DQSpec, data: DataFrame, checkpoint_run_time: str, ) -> Tuple[CheckpointResult, Optional[list]]: """Create and configure the validation checkpoint. 
Creates and configures a validation definition based on the suite and then creates, configures and runs the checkpoint returning, at the end, the result as well as the primary key from the dq_specs. Args: context: The data context from GX dataframe_bd: The dataframe with the batch definition to validate suite: A group of expectations to validate dq_spec: The arguments directly passed from the acon in the dq_spec key data: Input dataframe to run the dq process on. checkpoint_run_time: A string with the time in miliseconds Returns: A tuple with the result from the checkpoint run and the primary key from the dq_spec. """ validation_definition = context.validation_definitions.add( gx.ValidationDefinition( data=dataframe_bd, suite=suite, name=f"{dq_spec.spec_id}-{dq_spec.input_id}" f"-validation-{checkpoint_run_time}", ) ) source_pk = cls._get_unexpected_rows_pk(dq_spec) result_format: dict = { "result_format": DQResultFormat.COMPLETE.value, } # If the source primary key is defined, we add it to the result format # so that it is included in the results from GX. if source_pk: result_format = { **result_format, "unexpected_index_column_names": source_pk, } checkpoint = context.checkpoints.add( gx.Checkpoint( name=f"{dq_spec.spec_id}-{dq_spec.input_id}" f"-checkpoint-{checkpoint_run_time}" f"-{str(random.randint(1, 100))}", # nosec B311 validation_definitions=[validation_definition], actions=[], result_format=result_format, ) ) result = checkpoint.run( batch_parameters={"dataframe": data}, run_id=RunIdentifier( run_name=f"{checkpoint_run_time}" f"-{dq_spec.spec_id}-{dq_spec.input_id}" f"-{str(random.randint(1, 100))}-checkpoint", # nosec B311 run_time=datetime.strptime(checkpoint_run_time, "%Y%m%d-%H%M%S%f"), ), ) return result, source_pk @classmethod def _check_row_condition( cls, dq_spec: DQSpec, dq_function: DQFunctionSpec ) -> DQFunctionSpec: """Enables/disables row_conditions. 
Checks for row_codition arguments in the definition of expectations and enables/disables their usage based on the enable_row_condition argument. row_conditions allow you to filter the rows that are processed by the DQ functions. This is useful when you want to run the DQ functions only on a subset of the data. Args: dq_spec: The arguments directly passed from the acon in the dq_spec key dq_function: A DQFunctionSpec with the definition of a dq function. Returns: The definition of a dq_function with or without the row_condition key. """ if ( not dq_spec.enable_row_condition and "row_condition" in dq_function.args.keys() ): del dq_function.args["row_condition"] cls._LOGGER.info( f"Disabling row_condition for function: {dq_function.function}" ) return dq_function @classmethod def _add_suite( cls, context: EphemeralDataContext, dq_spec: DQSpec, checkpoint_run_time: str ) -> ExpectationSuite: """Create and configure an ExpectationSuite. Creates and configures an expectation suite, adding the dq functions passed on the dq_spec as well as the dq_critical_functions also passed on the dq_spec, if they exist. Finally return the configured suite. Args: context: The data context from GX dq_spec: The arguments directly passed from the acon in the dq_spec key checkpoint_run_time: A string with the time in miliseconds Returns: A configured ExpectationSuite object. 
""" expectation_suite_name = ( dq_spec.expectation_suite_name if dq_spec.expectation_suite_name else f"{dq_spec.spec_id}-{dq_spec.input_id}" f"-{dq_spec.dq_type}-{checkpoint_run_time}" ) suite = context.suites.add(gx.ExpectationSuite(name=expectation_suite_name)) for dq_function in dq_spec.dq_functions: dq_function = cls._check_row_condition(dq_spec, dq_function) suite.add_expectation_configuration( ExpectationConfiguration( type=dq_function.function, kwargs=dq_function.args if dq_function.args else {}, meta=dq_function.args.get("meta") if dq_function.args else {}, ) ) if dq_spec.critical_functions: for critical_function in dq_spec.critical_functions: meta_args = cls._add_critical_function_tag(critical_function.args) suite.add_expectation_configuration( ExpectationConfiguration( type=critical_function.function, kwargs=( critical_function.args if critical_function.args else {} ), meta=meta_args, ) ) suite.save() return suite @classmethod def _check_expectation_result(cls, result_dict: dict) -> dict: """Add an empty dict if the unexpected_index_list key is empty. Checks if the unexpected_index_list key has any element, if it doesn't, add an empty dictionary to the result key. This is needed due to some edge cases that appeared due to the GX update to version 1.3.13 where the unexpected_index_list would sometimes exist even for successful validation runs. Args: result_dict: A dict with the result_dict from a checkpoint run. Returns: The configured result_dict """ for expectation_result in result_dict["results"]: if "unexpected_index_list" in expectation_result["result"].keys(): if len(expectation_result["result"]["unexpected_index_list"]) < 1: expectation_result["result"] = {} return result_dict @classmethod def run_dq_process(cls, dq_spec: DQSpec, data: DataFrame) -> DataFrame: """Run the specified data quality process on a dataframe. 
Based on the dq_specs we apply the defined expectations on top of the dataframe in order to apply the necessary validations and then output the result of the data quality process. The logic of the function is as follows: 1. Import the custom expectations defined in the engine. 2. Create the context based on the dq_spec. - The context is the base class for the GX, an ephemeral context means that it does not store/load the configuration of the environment in a configuration file. 3. Add the data source to the context. - This is the data source that will be used to run the dq process, in our case Spark. 4. Create the dataframe asset and batch definition. - The asset represents the data where the expectations are applied and the batch definition is the way how the data should be split, in the case of dataframes it is always the whole dataframe. 5. Create the expectation suite. - This is the group of expectations that will be applied to the data. 6. Create the checkpoint and run it. - The checkpoint is the object that will run the expectations on the data and return the results. 7. Transform the results and write them to the result sink. - The results are transformed to a more readable format and then written to the result sink. 8. Log the results and raise an exception if needed. - The results are logged and if there are any failed expectations the process will raise an exception based on the dq_spec. 9. Tag the source data if needed. - If the dq_spec has the tag_source_data argument set to True, the source data will be tagged with the dq results. Args: dq_spec: data quality specification. data: input dataframe to run the dq process on. Returns: The DataFrame containing the results of the DQ process. """ # Creating the context if dq_spec.dq_type == "validator" or dq_spec.dq_type == "prisma": for expectation in DQDefaults.CUSTOM_EXPECTATION_LIST.value: importlib.__import__( "lakehouse_engine.dq_processors.custom_expectations." 
+ expectation ) context = gx.get_context( cls._get_data_context_config(dq_spec), mode="ephemeral" ) # Adding data source to context dataframe_data_source = context.data_sources.add_spark( name=f"{dq_spec.spec_id}-{dq_spec.input_id}-datasource", persist=False, ) dataframe_asset = dataframe_data_source.add_dataframe_asset( name=f"{dq_spec.spec_id}-{dq_spec.input_id}-asset" ) dataframe_bd = dataframe_asset.add_batch_definition_whole_dataframe( name=f"{dq_spec.spec_id}-{dq_spec.input_id}-batch" ) checkpoint_run_time = datetime.today().strftime("%Y%m%d-%H%M%S%f") suite = cls._add_suite(context, dq_spec, checkpoint_run_time) result, source_pk = cls._configure_checkpoint( context, dataframe_bd, suite, dq_spec, data, checkpoint_run_time ) expectation_result_key = list(result.run_results.keys())[0] result_dict = result.run_results[expectation_result_key].to_json_dict() result_dict = cls._check_expectation_result(result_dict) data = cls._transform_checkpoint_results( data, source_pk, result_dict, dq_spec ) # Processed keys are only added for the PRISMA dq type # because they are being used to calculate the good # records that were processed in a run. if dq_spec.dq_type == DQType.PRISMA.value: keys = data.select( [col(c).cast(StringType()).alias(c) for c in source_pk] ) keys = keys.withColumn( "run_name", lit(result_dict["meta"]["run_id"]["run_name"]) ) cls._write_to_location(dq_spec, keys, processed_keys=True) else: raise TypeError( f"Type of Data Quality '{dq_spec.dq_type}' is not supported." 
) return data @classmethod def _check_critical_functions_tags(cls, failed_expectations: dict) -> list: critical_failure = [] for expectation in failed_expectations.values(): meta = expectation["meta"] if meta and ( ("notes" in meta.keys() and "Critical function" in meta["notes"]) or ( "content" in meta["notes"].keys() and "Critical function" in meta["notes"]["content"] ) ): critical_failure.append(expectation["type"]) return critical_failure @classmethod def _check_chunk_usage(cls, results_dict: dict, dq_spec: DQSpec) -> bool: """Check if the results should be split into chunks. If the size of the results dictionary is too big, we will split it into smaller chunks. This is needed to avoid memory issues when processing large datasets. Args: results_dict: The results dictionary to be checked. dq_spec: data quality specification. Returns: True if the results dictionary is too big, False otherwise. """ for ele in results_dict["results"]: if ( "unexpected_index_list" in ele["result"].keys() and len(ele["result"]["unexpected_index_list"]) > dq_spec.result_sink_chunk_size ): return True return False @classmethod def _explode_results( cls, df: DataFrame, dq_spec: DQSpec, ) -> DataFrame: """Transform dq results dataframe exploding a set of columns. Args: df: dataframe with dq results to be exploded. dq_spec: data quality specification. 
""" df = df.withColumn("validation_results", explode("results")).withColumn( "source", lit(dq_spec.source) ) if ( not df.schema["validation_results"] .dataType.fieldNames() # type: ignore .__contains__("result") ): df = df.withColumn( "validation_results", col("validation_results").withField( "result", struct(lit(None).alias("observed_value")) ), ) kwargs_columns = [ f"validation_results.expectation_config.kwargs.{col_name}" for col_name in df.select( "validation_results.expectation_config.kwargs.*" ).columns ] cols_to_cast = ["max_value", "min_value", "sum_total"] for col_name in kwargs_columns: if col_name.split(".")[-1] in cols_to_cast: df = df.withColumn( "validation_results", col("validation_results").withField( "expectation_config", col("validation_results.expectation_config").withField( "kwargs", col( "validation_results.expectation_config.kwargs" ).withField( col_name.split(".")[-1], col(col_name).cast(FloatType()), ), ), ), ) new_columns = [ "validation_results.expectation_config.kwargs.*", "validation_results.expectation_config.type as expectation_type", "validation_results.success as expectation_success", "validation_results.exception_info", "statistics.*", ] + dq_spec.result_sink_extra_columns df_exploded = df.selectExpr(*df.columns, *new_columns).drop( *[c.replace(".*", "").split(" as")[0] for c in new_columns] ) df_exploded = df_exploded.drop( "statistics", "id", "results", "meta", "suite_name" ) if ( "meta" in df_exploded.select("validation_results.expectation_config.*").columns ): df_exploded = df_exploded.withColumn( "meta", col("validation_results.expectation_config.meta") ) schema = df_exploded.schema.simpleString() if ( dq_spec.gx_result_format.upper() == DQResultFormat.COMPLETE.value and "unexpected_index_list" in schema ): df_exploded = df_exploded.withColumn( "unexpected_index_list", transform( col("validation_results.result.unexpected_index_list"), lambda y: y.withField("run_success", lit(False)), ), ) if "observed_value" in schema: 
df_exploded = df_exploded.withColumn( "observed_value", col("validation_results.result.observed_value") ) return ( df_exploded.withColumn("run_time_year", year(to_timestamp("run_time"))) .withColumn("run_time_month", month(to_timestamp("run_time"))) .withColumn("run_time_day", dayofmonth(to_timestamp("run_time"))) .withColumn( "kwargs", to_json(col("validation_results.expectation_config.kwargs")) ) .withColumn("validation_results", to_json(col("validation_results"))) ) @classmethod def _get_data_context_config(cls, dq_spec: DQSpec) -> DataContextConfig: """Get the configuration of the data context. Based on the configuration it is possible to define the backend to be the file system (e.g. local file system) or S3, meaning that the DQ artefacts will be stored according to this configuration. Args: dq_spec: data quality process specification. Returns: The DataContextConfig object configuration. """ store_backend: FilesystemStoreBackendDefaults | S3StoreBackendDefaults if dq_spec.store_backend == DQDefaults.FILE_SYSTEM_STORE.value: store_backend = FilesystemStoreBackendDefaults( root_directory=dq_spec.local_fs_root_dir ) elif dq_spec.store_backend == DQDefaults.FILE_SYSTEM_S3_STORE.value: store_backend = S3StoreBackendDefaults( default_bucket_name=dq_spec.bucket, validation_results_store_prefix=dq_spec.validations_store_prefix, checkpoint_store_prefix=dq_spec.checkpoint_store_prefix, expectations_store_prefix=dq_spec.expectations_store_prefix, ) return DataContextConfig( store_backend_defaults=store_backend, analytics_enabled=False, ) @classmethod def _get_data_source_defaults(cls, dq_spec: DQSpec) -> dict: """Get the configuration for a datasource. Args: dq_spec: data quality specification. Returns: The python dictionary with the datasource configuration. 
""" return { "name": f"{dq_spec.spec_id}-{dq_spec.input_id}-datasource", "class_name": DQDefaults.DATASOURCE_CLASS_NAME.value, "execution_engine": { "class_name": DQDefaults.DATASOURCE_EXECUTION_ENGINE.value, "persist": False, }, "data_connectors": { f"{dq_spec.spec_id}-{dq_spec.input_id}-data_connector": { "module_name": DQDefaults.DATA_CONNECTORS_MODULE_NAME.value, "class_name": DQDefaults.DATA_CONNECTORS_CLASS_NAME.value, "assets": { ( dq_spec.data_asset_name if dq_spec.data_asset_name else f"{dq_spec.spec_id}-{dq_spec.input_id}" ): {"batch_identifiers": DQDefaults.DQ_BATCH_IDENTIFIERS.value} }, } }, } @classmethod def _get_failed_expectations( cls, results: dict, dq_spec: DQSpec, failed_expectations: dict, evaluated_expectations: dict, is_final_chunk: bool, ) -> Tuple[dict, dict]: """Get the failed expectations of a Checkpoint result. Args: results: the results of the DQ process. dq_spec: data quality specification. failed_expectations: dict of failed expectations. evaluated_expectations: dict of evaluated expectations. is_final_chunk: boolean indicating if this is the final chunk. Returns: a tuple with a dict of failed expectations and a dict of evaluated expectations. """ expectations_results = results["results"] for result in expectations_results: evaluated_expectations[result["expectation_config"]["id"]] = result[ "expectation_config" ] if not result["success"]: failed_expectations[result["expectation_config"]["id"]] = result[ "expectation_config" ] if result["exception_info"]["raised_exception"]: cls._LOGGER.error( f"""The expectation {str(result["expectation_config"])} raised the following exception: {result["exception_info"]["exception_message"]}""" ) cls._LOGGER.error( f"{len(failed_expectations)} out of {len(evaluated_expectations)} " f"Data Quality Expectation(s) have failed! 
Failed Expectations: " f"{failed_expectations}" ) percentage_failure = 1 - (results["statistics"]["success_percent"] / 100) if ( dq_spec.max_percentage_failure is not None and dq_spec.max_percentage_failure < percentage_failure and is_final_chunk ): raise DQValidationsFailedException( f"Max error threshold is being surpassed! " f"Expected: {dq_spec.max_percentage_failure} " f"Got: {percentage_failure}" ) return failed_expectations, evaluated_expectations @classmethod def _get_unexpected_rows_pk(cls, dq_spec: DQSpec) -> Optional[list]: """Get primary key for using on rows failing DQ validations. Args: dq_spec: data quality specification. Returns: the list of columns that are part of the primary key. """ if dq_spec.unexpected_rows_pk: return dq_spec.unexpected_rows_pk elif dq_spec.tbl_to_derive_pk: return TableManager( {"function": "get_tbl_pk", "table_or_view": dq_spec.tbl_to_derive_pk} ).get_tbl_pk() elif dq_spec.tag_source_data: raise ValueError( "You need to provide either the argument " "'unexpected_rows_pk' or 'tbl_to_derive_pk'." ) else: return None @classmethod def _log_or_fail( cls, results: dict, dq_spec: DQSpec, failed_expectations: dict, evaluated_expectations: dict, is_final_chunk: bool, ) -> Tuple[dict, dict]: """Log the execution of the Data Quality process. Args: results: the results of the DQ process. dq_spec: data quality specification. failed_expectations: list of failed expectations. evaluated_expectations: list of evaluated expectations. is_final_chunk: boolean indicating if this is the final chunk. Returns: a tuple with a dict of failed expectations and a dict of evaluated expectations. """ if results["success"]: cls._LOGGER.info( "The data passed all the expectations defined. Everything looks good!" 
) else: failed_expectations, evaluated_expectations = cls._get_failed_expectations( results, dq_spec, failed_expectations, evaluated_expectations, is_final_chunk, ) if dq_spec.critical_functions and is_final_chunk: critical_failure = cls._check_critical_functions_tags(failed_expectations) if critical_failure: raise DQValidationsFailedException( f"Data Quality Validations Failed, the following critical " f"expectations failed: {critical_failure}." ) if dq_spec.fail_on_error and is_final_chunk and failed_expectations: raise DQValidationsFailedException("Data Quality Validations Failed!") return failed_expectations, evaluated_expectations @classmethod def _transform_checkpoint_results( cls, data: DataFrame, source_pk: list, checkpoint_results: dict, dq_spec: DQSpec, ) -> DataFrame: """Transforms the checkpoint results and creates new entries. All the items of the dictionary are cast to a json like format. All columns are cast to json like format. After that the dictionary is converted into a dataframe. Args: data: input dataframe to run the dq process on. source_pk: list of columns that are part of the primary key. checkpoint_results: dict with results of the checkpoint run. dq_spec: data quality specification. checkpoint_run_time: A string with the time in miliseconds. Returns: Transformed results dataframe. """ results_dict = loads(dumps(checkpoint_results)) # Check the size of the results dictionary, if it is too big # we will split it into smaller chunks. results_dict_list = cls._generate_chunks(results_dict, dq_spec) index = 0 failed_expectations: dict = {} evaluated_expectations: dict = {} # The processed chunk is removed from the list of results # so the memory is freed as soon as possible. 
while index < len(results_dict_list): is_final_chunk = len(results_dict_list) == 1 data, failed_expectations, evaluated_expectations = cls._process_chunk( dq_spec, source_pk, results_dict_list[index], data, failed_expectations, evaluated_expectations, is_final_chunk, ) del results_dict_list[index] return data @classmethod def _process_chunk( cls, dq_spec: DQSpec, source_pk: list[str], ele: dict, data: DataFrame, failed_expectations: dict, evaluated_expectations: dict, is_final_chunk: bool, ) -> Tuple[DataFrame, dict, dict]: """Process a chunk of the results. Args: dq_spec: data quality specification. source_pk: list of columns that are part of the primary key. ele: dictionary with the results of the dq process. data: input dataframe to run the dq process on. failed_expectations: list of failed expectations. evaluated_expectations: list of evaluated expectations. is_final_chunk: boolean indicating if this is the final chunk. Returns: A tuple with the processed data, failed expectations and evaluated expectations. 
""" df = ExecEnv.SESSION.createDataFrame([json.dumps(ele)], schema=StringType()) schema = schema_of_json(lit(json.dumps(ele))) df = ( df.withColumn("value", from_json("value", schema)) .select("value.*") .withColumn("spec_id", lit(dq_spec.spec_id)) .withColumn("input_id", lit(dq_spec.input_id)) .withColumn("run_name", col("meta.run_id.run_name")) .withColumn("run_time", col("meta.run_id.run_time")) ) exploded_df = ( cls._explode_results(df, dq_spec) if dq_spec.result_sink_explode else df.withColumn("validation_results", to_json(col("results"))).drop( "statistics", "meta", "suite_name", "results", "id" ) ) exploded_df = exploded_df.withColumn("source_primary_key", lit(source_pk)) exploded_df = cls._cast_columns_to_string(exploded_df) cls._write_to_location(dq_spec, exploded_df) failed_expectations, evaluated_expectations = cls._log_or_fail( ele, dq_spec, failed_expectations, evaluated_expectations, is_final_chunk ) if ( dq_spec.tag_source_data and dq_spec.result_sink_explode and dq_spec.fail_on_error is not True ): data = Validator.tag_source_with_dq(source_pk, data, exploded_df) return data, failed_expectations, evaluated_expectations return data, failed_expectations, evaluated_expectations @classmethod def _cast_columns_to_string(cls, df: DataFrame) -> DataFrame: """Cast selected columns of the dataframe to string type. Args: df: The input dataframe. Returns: A new dataframe with selected columns cast to string type. """ for col_name in df.columns: if col_name not in DQDefaults.DQ_COLUMNS_TO_KEEP_TYPES.value: df = df.withColumn(col_name, df[col_name].cast(StringType())) return df @classmethod def _generate_chunks(cls, results_dict: dict, dq_spec: DQSpec) -> list: """Split the results dictionary into smaller chunks. This is needed to avoid memory issues when processing large datasets. The size of the chunks is defined by the dq_spec.result_sink_chunk_size. Args: results_dict: The results dictionary to be split. dq_spec: data quality specification. 
Returns: A list of dictionaries, where each dictionary is a chunk of the original results dictionary. """ results_dict_list = [] split = cls._check_chunk_usage(results_dict, dq_spec) if split: # Here we are splitting the results into chunks per expectation # and then we are splitting the unexpected_index_list into # chunks of size dq_spec.result_sink_chunk_size. results_dict_list = cls._split_into_chunks(results_dict, dq_spec) else: # If the results are not too big, we can process them all at once. results_dict_list = [results_dict] return results_dict_list @classmethod def _split_into_chunks(cls, results_dict: dict, dq_spec: DQSpec) -> list: """Split the results into smaller chunks. This is needed to avoid memory issues when processing large datasets. The size of the chunks is defined by the dq_spec.result_sink_chunk_size. Args: results: The results to be split. dq_spec: data quality specification. Returns: A list of dictionaries, where each dictionary is a chunk of the original results. """ results_dict_list = [] for ele in results_dict["results"]: base_result = deepcopy(results_dict) if "unexpected_index_list" in ele["result"].keys(): for key in ExecEnv.ENGINE_CONFIG.dq_result_sink_columns_to_delete: del ele["result"][key] unexpected_index_list = ele["result"]["unexpected_index_list"] unexpected_index_list_chunks = cls.split_into_chunks( unexpected_index_list, dq_spec.result_sink_chunk_size ) del ele["result"]["unexpected_index_list"] for chunk in unexpected_index_list_chunks: ele["result"]["unexpected_index_list"] = chunk base_result["results"] = [ele] results_dict_list.append(deepcopy(base_result)) else: base_result["results"] = [ele] results_dict_list.append(base_result) return results_dict_list @classmethod def _write_to_location( cls, dq_spec: DQSpec, df: DataFrame, processed_keys: bool = False, ) -> None: """Write dq results dataframe to a table or location. 
It can be written: - a raw output (having result_sink_explode set as False) - an exploded output (having result_sink_explode set as True), which is more prepared for analysis, with some columns exploded, flatten and transformed. It can also be set result_sink_extra_columns with other columns desired to have in the output table or location. - processed keys when running the dq process with the dq_type set as 'prisma'. Args: dq_spec: data quality specification. df: dataframe with dq results to write. processed_keys: boolean indicating if the dataframe contains the processed keys. """ if processed_keys: table = None location = dq_spec.processed_keys_location options = {"mergeSchema": "true"} else: table = dq_spec.result_sink_db_table location = dq_spec.result_sink_location options = {"mergeSchema": "true"} if dq_spec.result_sink_explode else {} if table or location: WriterFactory.get_writer( spec=OutputSpec( spec_id="dq_result_sink", input_id="dq_result", db_table=table, location=location, partitions=( dq_spec.result_sink_partitions if dq_spec.result_sink_partitions else [] ), write_type=WriteType.APPEND.value, data_format=dq_spec.result_sink_format, options=( options if dq_spec.result_sink_options is None else {**dq_spec.result_sink_options, **options} ), ), df=df, data=None, ).write() @staticmethod def split_into_chunks(lst: list, chunk_size: int) -> list: """Split a list into chunks of a specified size. Args: lst: The list to be split. chunk_size: Number of records in each chunk. Returns: A list of lists, where each inner list is a chunk of the original list. 
""" if chunk_size <= 0: raise ValueError("Chunk size must be a positive integer.") chunk_list = [] for i in range(0, len(lst), chunk_size): chunk_list.append(lst[i : i + chunk_size]) return chunk_list ================================================ FILE: lakehouse_engine/dq_processors/exceptions.py ================================================ """Package defining all the DQ custom exceptions.""" class DQValidationsFailedException(Exception): """Exception for when the data quality validations fail.""" pass class DQCheckpointsResultsException(Exception): """Exception for when the checkpoint results parsing fail.""" pass class DQSpecMalformedException(Exception): """Exception for when the DQSpec is malformed.""" pass class DQDuplicateRuleIdException(Exception): """Exception for when a duplicated rule id is found.""" pass ================================================ FILE: lakehouse_engine/dq_processors/validator.py ================================================ """Module containing the definition of a data quality validator.""" from typing import Any, List from great_expectations.core.batch import RuntimeBatchRequest from great_expectations.data_context import EphemeralDataContext from pyspark.sql import DataFrame from pyspark.sql.functions import ( col, collect_set, concat, explode, first, lit, struct, when, ) from lakehouse_engine.core.definitions import DQDefaults, DQFunctionSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.logging_handler import LoggingHandler class Validator(object): """Class containing the data quality validator.""" _LOGGER = LoggingHandler(__name__).get_logger() @classmethod def get_dq_validator( cls, context: EphemeralDataContext, batch_request: RuntimeBatchRequest, expectation_suite_name: str, dq_functions: List[DQFunctionSpec], critical_functions: List[DQFunctionSpec], ) -> Any: """Get a validator according to the specification. We use getattr to dynamically execute any expectation available. 
getattr(validator, function) is similar to validator.function(). With this approach, we can execute any expectation supported. Args: context: the BaseDataContext containing the configurations for the data source and store backend. batch_request: run time batch request to be able to query underlying data. expectation_suite_name: name of the expectation suite. dq_functions: a list of DQFunctionSpec to consider in the expectation suite. critical_functions: list of critical expectations in the expectation suite. Returns: The validator with the expectation suite stored. """ validator = context.get_validator( batch_request=batch_request, expectation_suite_name=expectation_suite_name ) if dq_functions: for dq_function in dq_functions: getattr(validator, dq_function.function)( **dq_function.args if dq_function.args else {} ) if critical_functions: for critical_function in critical_functions: meta_args = cls._add_critical_function_tag(critical_function.args) getattr(validator, critical_function.function)(**meta_args) return validator.save_expectation_suite(discard_failed_expectations=False) @classmethod def tag_source_with_dq( cls, source_pk: List[str], source_df: DataFrame, results_df: DataFrame ) -> DataFrame: """Tags the source dataframe with a new column having the DQ results. Args: source_pk: the primary key of the source data. source_df: the source dataframe to be tagged with DQ results. results_df: dq results dataframe. Returns: a dataframe tagged with the DQ results. 
""" run_success = results_df.select("success").first()[0] run_name = results_df.select("run_name").first()[0] raised_exceptions = ( True if results_df.filter("exception_info.raised_exception == True").count() > 0 else False ) failures_df = ( results_df.filter( "expectation_success == False and size(unexpected_index_list) > 0" ) if "unexpected_index_list" in results_df.schema.simpleString() else results_df.filter("expectation_success == False") ) if failures_df.isEmpty() is not True: source_df = cls._get_row_tagged_fail_df( failures_df, raised_exceptions, source_df, source_pk ) return cls._join_complementary_data( run_name, run_success, raised_exceptions, source_df ) @classmethod def _add_critical_function_tag(cls, args: dict) -> dict: if "meta" in args.keys(): meta = args["meta"] if isinstance(meta["notes"], str): meta["notes"] = meta["notes"] + " **Critical function**." else: meta["notes"]["content"] = ( meta["notes"]["content"] + " **Critical function**." ) args["meta"] = meta return args else: args["meta"] = { "notes": { "format": "markdown", "content": "**Critical function**.", } } return args @staticmethod def _get_row_tagged_fail_df( failures_df: DataFrame, raised_exceptions: bool, source_df: DataFrame, source_pk: List[str], ) -> DataFrame: """Get the source_df DataFrame tagged with the row level failures. Args: failures_df: dataframe having all failed expectations from the DQ execution. raised_exceptions: whether there was at least one expectation raising exceptions (True) or not (False). source_df: the source dataframe being tagged with DQ results. source_pk: the primary key of the source data. Returns: the source_df tagged with the row level failures. 
""" if "unexpected_index_list" in failures_df.schema.simpleString(): row_failures_df = ( failures_df.alias("a") .withColumn("exploded_list", explode(col("unexpected_index_list"))) .selectExpr("a.*", "exploded_list.*") .groupBy(*source_pk) .agg( struct( first(col("run_name")).alias("run_name"), first(col("success")).alias("run_success"), lit(raised_exceptions).alias("raised_exceptions"), first(col("expectation_success")).alias("run_row_success"), collect_set( struct( col("expectation_type"), col("kwargs"), ) ).alias("dq_failure_details"), ).alias("dq_validations") ) ) if all(item in row_failures_df.columns for item in source_pk): join_cond = [ col(f"a.{key}").eqNullSafe(col(f"b.{key}")) for key in source_pk ] columns = [ col_name for col_name in source_df.columns if col_name != "dq_validations" ] # Since we are creating multiple rows per run, if the dq_validations # column already exists, we need to add the new dq_validations to # the existing dq_validations. existing_validations = "a.dq_validations" existing_validations_details = "a.dq_validations.dq_failure_details" new_validations = "b.dq_validations" new_validations_details = "b.dq_validations.dq_failure_details" if "dq_validations" in source_df.columns: source_df = ( source_df.alias("a") .join(row_failures_df.alias("b"), join_cond, "left") .select( *[f"a.{col}" for col in columns], when( col(new_validations).isNotNull() & col(existing_validations_details).isNotNull(), col(new_validations).withField( "dq_failure_details", concat( col(existing_validations_details), col(new_validations_details), ), ), ) .when( col(new_validations).isNotNull() & col(new_validations_details).isNotNull(), col(new_validations), ) .otherwise(col(existing_validations)) .alias("dq_validations"), ) ) else: source_df = ( source_df.alias("a") .join(row_failures_df.alias("b"), join_cond, "left") .select("a.*", new_validations) ) return source_df @staticmethod def _join_complementary_data( run_name: str, run_success: bool, raised_exceptions: 
bool, source_df: DataFrame
    ) -> DataFrame:
        """Join the source_df DataFrame with complementary data.

        The source_df was already tagged/joined with the row level DQ failures, in case
        there were any. However, there might be cases for which we don't have any
        failure (everything succeeded) or cases for which only non-row-level failures
        happened (e.g. table level expectations or column level aggregations), and, for
        those we need to join the source_df with complementary data.

        Args:
            run_name: the name of the DQ execution in great expectations.
            run_success: whether the general execution of the DQ was succeeded (True)
                or not (False).
            raised_exceptions: whether there was at least one expectation raising
                exceptions (True) or not (False).
            source_df: the source dataframe being tagged with DQ results.

        Returns:
            the source_df tagged with complementary data.
        """
        # Single-row dataframe holding the run level information; rows using it
        # are flagged with run_row_success = True.
        complementary_data = [
            {
                "dq_validations": {
                    "run_name": run_name,
                    "run_success": run_success,
                    "raised_exceptions": raised_exceptions,
                    "run_row_success": True,
                }
            }
        ]
        complementary_df = ExecEnv.SESSION.createDataFrame(
            complementary_data, schema=DQDefaults.DQ_VALIDATIONS_SCHEMA.value
        )

        return (
            # the cross join attaches the single complementary row to every source row
            source_df.crossJoin(
                complementary_df.withColumnRenamed(
                    "dq_validations", "tmp_dq_validations"
                )
            )
            .withColumn(
                "dq_validations",
                (
                    # rows already tagged with failures keep their dq_validations
                    when(
                        col("dq_validations").isNotNull(), col("dq_validations")
                    ).otherwise(col("tmp_dq_validations"))
                    if "dq_validations" in source_df.columns
                    else col("tmp_dq_validations")
                ),
            )
            .drop("tmp_dq_validations")
        )

================================================
FILE: lakehouse_engine/engine.py
================================================
"""Contract of the lakehouse engine with all the available functions to be executed."""

from typing import List, Optional, OrderedDict

from lakehouse_engine.algorithms.data_loader import DataLoader
from lakehouse_engine.algorithms.gab import GAB
from lakehouse_engine.algorithms.reconciliator import Reconciliator
from lakehouse_engine.algorithms.sensors.heartbeat import Heartbeat
from lakehouse_engine.algorithms.sensors.sensor import Sensor, SensorStatus
from lakehouse_engine.core.definitions import (
    CollectEngineUsage,
    SAPLogchain,
    TerminatorSpec,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.file_manager import FileManagerFactory
from lakehouse_engine.core.sensor_manager import SensorUpstreamManager
from lakehouse_engine.core.table_manager import TableManager
from lakehouse_engine.terminators.notifier_factory import NotifierFactory
from lakehouse_engine.terminators.sensor_terminator import SensorTerminator
from lakehouse_engine.utils.acon_utils import (
    validate_and_resolve_acon,
    validate_manager_list,
)
from lakehouse_engine.utils.configs.config_utils import ConfigUtils
from lakehouse_engine.utils.engine_usage_stats import EngineUsageStats


def load_data(
    acon_path: Optional[str] = None,
    acon: Optional[dict] = None,
    collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
    spark_confs: dict = None,
) -> Optional[OrderedDict]:
    """Load data using the DataLoader algorithm.

    Args:
        acon_path: path of the acon (algorithm configuration) file.
        acon: acon provided directly through python code (e.g., notebooks or other
            apps).
        collect_engine_usage: Lakehouse usage statistics collection strategy.
        spark_confs: optional dictionary with the spark confs to be used when collecting
            the engine usage.

    Returns:
        The result of executing the DataLoader algorithm for the resolved acon.
    """
    try:
        acon = ConfigUtils.get_acon(acon_path, acon)
        ExecEnv.get_or_create(app_name="data_loader", config=acon.get("exec_env", None))
        acon = validate_and_resolve_acon(acon, "in_motion")
    finally:
        # The usage is stored even when resolving/validating the acon fails.
        EngineUsageStats.store_engine_usage(
            acon, load_data.__name__, collect_engine_usage, spark_confs
        )
    return DataLoader(acon).execute()


def execute_reconciliation(
    acon_path: Optional[str] = None,
    acon: Optional[dict] = None,
    collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
    spark_confs: dict = None,
) -> None:
    """Execute the Reconciliator algorithm.
Args:
        acon_path: path of the acon (algorithm configuration) file.
        acon: acon provided directly through python code (e.g., notebooks or other
            apps).
        collect_engine_usage: Lakehouse usage statistics collection strategy.
        spark_confs: optional dictionary with the spark confs to be used when collecting
            the engine usage.
    """
    try:
        acon = ConfigUtils.get_acon(acon_path, acon)
        ExecEnv.get_or_create(
            app_name="reconciliator", config=acon.get("exec_env", None)
        )
        acon = validate_and_resolve_acon(acon)
    finally:
        # The usage is stored even when resolving/validating the acon fails.
        EngineUsageStats.store_engine_usage(
            acon, execute_reconciliation.__name__, collect_engine_usage, spark_confs
        )
    Reconciliator(acon).execute()


def execute_dq_validation(
    acon_path: Optional[str] = None,
    acon: Optional[dict] = None,
    collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
    spark_confs: dict = None,
) -> None:
    """Execute the DQValidator algorithm.

    Args:
        acon_path: path of the acon (algorithm configuration) file.
        acon: acon provided directly through python code (e.g., notebooks or other
            apps).
        collect_engine_usage: Lakehouse usage statistics collection strategy.
        spark_confs: optional dictionary with the spark confs to be used when collecting
            the engine usage.
    """
    # Imported inside the function, so the DQ validator module is only loaded
    # when a DQ validation is actually requested.
    from lakehouse_engine.algorithms.dq_validator import DQValidator

    try:
        acon = ConfigUtils.get_acon(acon_path, acon)
        ExecEnv.get_or_create(
            app_name="dq_validator", config=acon.get("exec_env", None)
        )
        acon = validate_and_resolve_acon(acon, "at_rest")
    finally:
        # The usage is stored even when resolving/validating the acon fails.
        EngineUsageStats.store_engine_usage(
            acon, execute_dq_validation.__name__, collect_engine_usage, spark_confs
        )
    DQValidator(acon).execute()


def manage_table(
    acon_path: Optional[str] = None,
    acon: Optional[dict] = None,
    collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
    spark_confs: dict = None,
) -> None:
    """Manipulate tables/views using Table Manager algorithm.

    Args:
        acon_path: path of the acon (algorithm configuration) file.
        acon: acon provided directly through python code (e.g., notebooks
            or other apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
        spark_confs: optional dictionary with the spark confs to be used when collecting
            the engine usage.
    """
    acon = ConfigUtils.get_acon(acon_path, acon)
    ExecEnv.get_or_create(app_name="manage_table", config=acon.get("exec_env", None))
    EngineUsageStats.store_engine_usage(
        acon, manage_table.__name__, collect_engine_usage, spark_confs
    )
    TableManager(acon).get_function()


def execute_manager(
    acon: dict,
    collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
    spark_confs: dict = None,
) -> None:
    """Execute the Lakehouse Engine Manager.

    This function allows users to execute multiple managers in a single call by
    providing a list of acons.

    Args:
        acon: list of acons to be executed by the manager.
        collect_engine_usage: Lakehouse usage statistics collection strategy.
        spark_confs: optional dictionary with the spark confs to be used when collecting
            the engine usage.

    Raises:
        ValueError: if the `manager` entry of an acon is neither `file` nor `table`.
    """
    ExecEnv.get_or_create(app_name="lakehouse_engine_manager")
    acon_list = validate_manager_list(acon)
    # NOTE: the loop variable reuses the name of the `acon` parameter; from here
    # on `acon` refers to a single manager acon of the validated list.
    for acon in acon_list:
        EngineUsageStats.store_engine_usage(
            acon, execute_manager.__name__, collect_engine_usage, spark_confs
        )
        if acon["manager"] == "file":
            FileManagerFactory.execute_function(configs=acon)
        elif acon["manager"] == "table":
            TableManager(acon).get_function()
        else:
            raise ValueError(f"Manager {acon['manager']} not recognized.")


def manage_files(
    acon_path: Optional[str] = None,
    acon: Optional[dict] = None,
    collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
    spark_confs: dict = None,
) -> None:
    """Manipulate s3 files using File Manager algorithm.

    Args:
        acon_path: path of the acon (algorithm configuration) file.
        acon: acon provided directly through python code (e.g., notebooks
            or other apps).
        collect_engine_usage: Lakehouse usage statistics collection strategy.
        spark_confs: optional dictionary with the spark confs to be used when collecting
            the engine usage.
""" acon = ConfigUtils.get_acon(acon_path, acon) ExecEnv.get_or_create(app_name="manage_files", config=acon.get("exec_env", None)) EngineUsageStats.store_engine_usage( acon, manage_files.__name__, collect_engine_usage, spark_confs ) FileManagerFactory.execute_function(configs=acon) def execute_sensor( acon_path: Optional[str] = None, acon: Optional[dict] = None, collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value, spark_confs: dict = None, ) -> bool: """Execute a sensor based on a Sensor Algorithm Configuration. A sensor is useful to check if an upstream system has new data. Args: acon_path: path of the acon (algorithm configuration) file. acon: acon provided directly through python code (e.g., notebooks or other apps). collect_engine_usage: Lakehouse usage statistics collection strategy. spark_confs: optional dictionary with the spark confs to be used when collecting the engine usage. """ acon = ConfigUtils.get_acon(acon_path, acon) ExecEnv.get_or_create(app_name="execute_sensor", config=acon.get("exec_env", None)) EngineUsageStats.store_engine_usage( acon, execute_sensor.__name__, collect_engine_usage, spark_confs ) return Sensor(acon).execute() def execute_sensor_heartbeat( acon_path: Optional[str] = None, acon: Optional[dict] = None, collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value, spark_confs: dict = None, ) -> None: """Execute a sensor based on a Heartbeat Algorithm Configuration. The heartbeat mechanism monitors whether an upstream system has new data. The heartbeat job runs continuously within a defined data product or according to a user-defined schedule. This job operates based on the Control table, where source-related entries can be fed by users using the Heartbeat Data Feeder job. Each source (such as SAP, delta_table, Kafka, Local Manual Upload, etc.) can have tasks added in parallel within the Heartbeat Job. 
Based on source heartbeat ACON and control table entries, Heartbeat will send a final
    sensor acon to the existing sensor modules, which check if a new event is available
    for the control table record.

    The sensor then returns the NEW_EVENT_AVAILABLE status to the Heartbeat modules,
    which update the control table.

    Following this, the related Databricks jobs are triggered through the
    Databricks Job API, ensuring that all dependencies are met.

    This process allows the Heartbeat sensor to efficiently manage and centralize the
    entire workflow with minimal user intervention, and it enhances the sensor features
    by centralizing their management and tracking through the control table.

    Args:
        acon_path: path of the acon (algorithm configuration) file.
        acon: acon provided directly through python code (e.g., notebooks
            or other apps).
        collect_engine_usage: Lakehouse usage statistics collection strategy.
        spark_confs: optional dictionary with the spark confs to be used when collecting
            the engine usage.
    """
    acon = ConfigUtils.get_acon(acon_path, acon)
    ExecEnv.get_or_create(
        app_name="execute_heartbeat", config=acon.get("exec_env", None)
    )
    EngineUsageStats.store_engine_usage(
        acon, execute_sensor_heartbeat.__name__, collect_engine_usage, spark_confs
    )
    # NOTE(review): the signature is annotated `-> None`, yet the result of
    # Heartbeat.execute() is returned - confirm which of the two is intended.
    return Heartbeat(acon).execute()


def trigger_heartbeat_sensor_jobs(
    acon: dict,
) -> None:
    """Trigger the jobs via Databricks job API.

    Args:
        acon: Heartbeat ACON containing data product configs and options.
    """
    ExecEnv.get_or_create(app_name="trigger_heartbeat_sensor_jobs")
    Heartbeat(acon).heartbeat_sensor_trigger_jobs()


def execute_heartbeat_sensor_data_feed(
    heartbeat_sensor_data_feed_path: str,
    heartbeat_sensor_control_table: str,
) -> None:
    """Control table Data feeder.

    It reads the CSV file stored at `data` folder and performs UPSERT and DELETE in
    the control table.

    Args:
        heartbeat_sensor_data_feed_path: path where CSV file is stored.
        heartbeat_sensor_control_table: CONTROL table of Heartbeat sensor.
""" ExecEnv.get_or_create(app_name="execute_heartbeat_sensor_data_feed") Heartbeat.heartbeat_sensor_control_table_data_feed( heartbeat_sensor_data_feed_path, heartbeat_sensor_control_table ) def update_heartbeat_sensor_status( heartbeat_sensor_control_table: str, sensor_table: str, job_id: str, ) -> None: """UPDATE heartbeat sensor status. Update heartbeat sensor control table with COMPLETE status and job_end_timestamp for the triggered job. Update sensor control table with PROCESSED_NEW_DATA status and status_change_timestamp for the triggered job. Args: heartbeat_sensor_control_table: Heartbeat sensor control table name. sensor_table: lakehouse engine sensor table name. job_id: job_id of the running job. It refers to trigger_job_id in Control table. """ ExecEnv.get_or_create(app_name="update_heartbeat_sensor_status") Heartbeat.update_heartbeat_sensor_completion_status( heartbeat_sensor_control_table, sensor_table, job_id ) def update_sensor_status( sensor_id: str, control_db_table_name: str, status: str = SensorStatus.PROCESSED_NEW_DATA.value, assets: List[str] = None, ) -> None: """Update internal sensor status. Update the sensor status in the control table, it should be used to tell the system that the sensor has processed all new data that was previously identified, hence updating the shifted sensor status. Usually used to move from `SensorStatus.ACQUIRED_NEW_DATA` to `SensorStatus.PROCESSED_NEW_DATA`, but there might be scenarios - still to identify - where we can update the sensor status from/to different statuses. Args: sensor_id: sensor id. control_db_table_name: `db.table` to store sensor checkpoints. status: status of the sensor. assets: a list of assets that are considered as available to consume downstream after this sensor has status PROCESSED_NEW_DATA. 
""" ExecEnv.get_or_create(app_name="update_sensor_status") SensorTerminator.update_sensor_status( sensor_id=sensor_id, control_db_table_name=control_db_table_name, status=status, assets=assets, ) def generate_sensor_query( sensor_id: str, filter_exp: str = None, control_db_table_name: str = None, upstream_key: str = None, upstream_value: str = None, upstream_table_name: str = None, ) -> str: """Generates a preprocess query to be used in a sensor configuration. Args: sensor_id: sensor id. filter_exp: expression to filter incoming new data. You can use the placeholder ?default_upstream_key and ?default_upstream_value, so that it can be replaced by the respective values in the control_db_table_name for this specific sensor_id. control_db_table_name: `db.table` to retrieve the last status change timestamp. This is only relevant for the jdbc sensor. upstream_key: the key of custom sensor information to control how to identify new data from the upstream (e.g., a time column in the upstream). upstream_value: the upstream value to identify new data from the upstream (e.g., the value of a time present in the upstream). upstream_table_name: value for custom sensor to query new data from the upstream If none we will set the default value, our `sensor_new_data` view. Returns: The query string. 
""" ExecEnv.get_or_create(app_name="generate_sensor_preprocess_query") if filter_exp: return SensorUpstreamManager.generate_filter_exp_query( sensor_id=sensor_id, filter_exp=filter_exp, control_db_table_name=control_db_table_name, upstream_key=upstream_key, upstream_value=upstream_value, upstream_table_name=upstream_table_name, ) else: return SensorUpstreamManager.generate_sensor_table_preprocess_query( sensor_id=sensor_id ) def generate_sensor_sap_logchain_query( chain_id: str, dbtable: str = SAPLogchain.DBTABLE.value, status: str = SAPLogchain.GREEN_STATUS.value, engine_table_name: str = SAPLogchain.ENGINE_TABLE.value, ) -> str: """Generates a sensor query based in the SAP Logchain table. Args: chain_id: chain id to query the status on SAP. dbtable: `db.table` to retrieve the data to check if the sap chain is already finished. status: `db.table` to retrieve the last status change timestamp. engine_table_name: table name exposed with the SAP LOGCHAIN data. This table will be used in the jdbc query. Returns: The query string. """ ExecEnv.get_or_create(app_name="generate_sensor_sap_logchain_query") return SensorUpstreamManager.generate_sensor_sap_logchain_query( chain_id=chain_id, dbtable=dbtable, status=status, engine_table_name=engine_table_name, ) def send_notification(args: dict) -> None: """Send a notification using a notifier. Args: args: arguments for the notifier. """ notifier = NotifierFactory.get_notifier( spec=TerminatorSpec(function="notify", args=args) ) notifier.create_notification() notifier.send_notification() def execute_gab( acon_path: Optional[str] = None, acon: Optional[dict] = None, collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value, spark_confs: dict = None, ) -> None: """Execute the gold asset builder based on a GAB Algorithm Configuration. GaB is useful to build your gold assets with predefined functions for recurrent periods. Args: acon_path: path of the acon (algorithm configuration) file. 
acon: acon provided directly through python code (e.g., notebooks
            or other apps).
        collect_engine_usage: Lakehouse usage statistics collection strategy.
        spark_confs: optional dictionary with the spark confs to be used when collecting
            the engine usage.
    """
    acon = ConfigUtils.get_acon(acon_path, acon)
    ExecEnv.get_or_create(app_name="execute_gab", config=acon.get("exec_env", None))
    EngineUsageStats.store_engine_usage(
        acon, execute_gab.__name__, collect_engine_usage, spark_confs
    )
    GAB(acon).execute()

================================================
FILE: lakehouse_engine/io/__init__.py
================================================
"""Input and Output package responsible for the behaviour of reading and writing."""

================================================
FILE: lakehouse_engine/io/exceptions.py
================================================
"""Package defining all the io custom exceptions."""


class IncrementalFilterInputNotFoundException(Exception):
    """Exception for when the input of an incremental filter is not found.

    This may occur when tables are being loaded in an incremental way, taking the
    increment definition out of a specific table, but the table still does not exist,
    most likely because it was not loaded for the first time yet.
""" pass class WrongIOFormatException(Exception): """Exception for when a user provides a wrong I/O format.""" pass class NotSupportedException(RuntimeError): """Exception for when a user provides a not supported operation.""" pass class InputNotFoundException(Exception): """Exception for when a user does not provide a mandatory input.""" pass class EndpointNotFoundException(Exception): """Exception for when the endpoint is not found by the Graph API.""" pass class LocalPathNotFoundException(Exception): """Exception for when a local path is not found.""" pass class WriteToLocalException(Exception): """Exception for when an error occurs when trying to write to the local path.""" pass class SharePointAPIError(Exception): """Custom exception class to handle errors Sharepoint API requests.""" pass class InvalidSharepointPathException(Exception): """Raised when folder path conflicts with file name. Happens if both `folder_relative_path` and `file_name` are set, but the folder path looks like a file path (last segment has a dot). """ pass ================================================ FILE: lakehouse_engine/io/reader.py ================================================ """Defines abstract reader behaviour.""" from abc import ABC, abstractmethod from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import InputSpec from lakehouse_engine.utils.logging_handler import LoggingHandler class Reader(ABC): """Abstract Reader class.""" def __init__(self, input_spec: InputSpec): """Construct Reader instances. Args: input_spec: input specification for reading data. """ self._logger = LoggingHandler(self.__class__.__name__).get_logger() self._input_spec = input_spec @abstractmethod def read(self) -> DataFrame: """Abstract read method. Returns: A dataframe read according to the input specification. 
""" raise NotImplementedError ================================================ FILE: lakehouse_engine/io/reader_factory.py ================================================ """Module for reader factory.""" from abc import ABC from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import FILE_INPUT_FORMATS, InputFormat, InputSpec from lakehouse_engine.io.readers.dataframe_reader import DataFrameReader from lakehouse_engine.io.readers.file_reader import FileReader from lakehouse_engine.io.readers.jdbc_reader import JDBCReader from lakehouse_engine.io.readers.kafka_reader import KafkaReader from lakehouse_engine.io.readers.query_reader import QueryReader from lakehouse_engine.io.readers.sap_b4_reader import SAPB4Reader from lakehouse_engine.io.readers.sap_bw_reader import SAPBWReader from lakehouse_engine.io.readers.sharepoint_reader import SharepointReader from lakehouse_engine.io.readers.table_reader import TableReader class ReaderFactory(ABC): # noqa: B024 """Class for reader factory.""" @classmethod def get_data(cls, spec: InputSpec) -> DataFrame: """Get data according to the input specification following a factory pattern. Args: spec: input specification to get the data. Returns: A dataframe containing the data. 
""" if spec.db_table: read_df = TableReader(input_spec=spec).read() elif spec.data_format == InputFormat.JDBC.value: read_df = JDBCReader(input_spec=spec).read() elif spec.data_format in FILE_INPUT_FORMATS: read_df = FileReader(input_spec=spec).read() elif spec.data_format == InputFormat.KAFKA.value: read_df = KafkaReader(input_spec=spec).read() elif spec.data_format == InputFormat.SQL.value: read_df = QueryReader(input_spec=spec).read() elif spec.data_format == InputFormat.SAP_BW.value: read_df = SAPBWReader(input_spec=spec).read() elif spec.data_format == InputFormat.SAP_B4.value: read_df = SAPB4Reader(input_spec=spec).read() elif spec.data_format == InputFormat.DATAFRAME.value: read_df = DataFrameReader(input_spec=spec).read() elif spec.data_format == InputFormat.SFTP.value: from lakehouse_engine.io.readers.sftp_reader import SFTPReader read_df = SFTPReader(input_spec=spec).read() return SFTPReader(input_spec=spec).read() elif spec.data_format == InputFormat.SHAREPOINT.value: return SharepointReader(input_spec=spec).read() else: raise NotImplementedError( f"The requested input spec format {spec.data_format} is not supported." ) if spec.temp_view: read_df.createOrReplaceTempView(spec.temp_view) return read_df ================================================ FILE: lakehouse_engine/io/readers/__init__.py ================================================ """Readers package to define reading behaviour.""" ================================================ FILE: lakehouse_engine/io/readers/dataframe_reader.py ================================================ """Module to define behaviour to read from dataframes.""" from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import InputSpec from lakehouse_engine.io.reader import Reader class DataFrameReader(Reader): """Class to read data from a dataframe.""" def __init__(self, input_spec: InputSpec): """Construct DataFrameReader instances. Args: input_spec: input specification. 
""" super().__init__(input_spec) def read(self) -> DataFrame: """Read data from a dataframe. Returns: A dataframe containing the data from a dataframe previously computed. """ return self._input_spec.df_name ================================================ FILE: lakehouse_engine/io/readers/file_reader.py ================================================ """Module to define behaviour to read from files.""" from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import FILE_INPUT_FORMATS, InputSpec, ReadType from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.io.reader import Reader from lakehouse_engine.utils.schema_utils import SchemaUtils class FileReader(Reader): """Class to read from files.""" def __init__(self, input_spec: InputSpec): """Construct FileReader instances. Args: input_spec: input specification. """ super().__init__(input_spec) def read(self) -> DataFrame: """Read file data. Returns: A dataframe containing the data from the files. """ if ( self._input_spec.read_type == ReadType.BATCH.value and self._input_spec.data_format in FILE_INPUT_FORMATS ): df = ExecEnv.SESSION.read.load( path=self._input_spec.location, format=self._input_spec.data_format, schema=SchemaUtils.from_input_spec(self._input_spec), **self._input_spec.options if self._input_spec.options else {}, ) if self._input_spec.with_filepath: # _metadata contains hidden columns df = df.selectExpr( "*", "_metadata.file_path as lhe_extraction_filepath" ) return df elif ( self._input_spec.read_type == ReadType.STREAMING.value and self._input_spec.data_format in FILE_INPUT_FORMATS ): df = ExecEnv.SESSION.readStream.load( path=self._input_spec.location, format=self._input_spec.data_format, schema=SchemaUtils.from_input_spec(self._input_spec), **self._input_spec.options if self._input_spec.options else {}, ) if self._input_spec.with_filepath: # _metadata contains hidden columns df = df.selectExpr( "*", "_metadata.file_path as lhe_extraction_filepath" ) return df 
else: raise NotImplementedError( "The requested read type and format combination is not supported." ) ================================================ FILE: lakehouse_engine/io/readers/jdbc_reader.py ================================================ """Module to define behaviour to read from JDBC sources.""" from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import InputFormat, InputSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.io.reader import Reader from lakehouse_engine.transformers.exceptions import WrongArgumentsException from lakehouse_engine.utils.extraction.jdbc_extraction_utils import ( JDBCExtraction, JDBCExtractionUtils, ) class JDBCReader(Reader): """Class to read from JDBC source.""" def __init__(self, input_spec: InputSpec): """Construct JDBCReader instances. Args: input_spec: input specification. """ super().__init__(input_spec) def read(self) -> DataFrame: """Read data from JDBC source. Returns: A dataframe containing the data from the JDBC source. 
""" if ( self._input_spec.options is not None and self._input_spec.options.get("predicates", None) is not None ): raise WrongArgumentsException("Predicates can only be used with jdbc_args.") options = self._input_spec.options if self._input_spec.options else {} if self._input_spec.calculate_upper_bound: jdbc_util = JDBCExtractionUtils( JDBCExtraction( user=options["user"], password=options["password"], url=options["url"], dbtable=options["dbtable"], extraction_type=options.get( "extraction_type", JDBCExtraction.extraction_type ), partition_column=options["partitionColumn"], calc_upper_bound_schema=self._input_spec.calc_upper_bound_schema, default_upper_bound=options.get( "default_upper_bound", JDBCExtraction.default_upper_bound ), ) ) # type: ignore options["upperBound"] = jdbc_util.get_spark_jdbc_optimal_upper_bound() if self._input_spec.jdbc_args: return ExecEnv.SESSION.read.options(**options).jdbc( **self._input_spec.jdbc_args ) else: return ( ExecEnv.SESSION.read.format(InputFormat.JDBC.value) .options(**options) .load() ) ================================================ FILE: lakehouse_engine/io/readers/kafka_reader.py ================================================ """Module to define behaviour to read from Kafka.""" from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import InputFormat, InputSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.io.reader import Reader class KafkaReader(Reader): """Class to read from Kafka.""" def __init__(self, input_spec: InputSpec): """Construct KafkaReader instances. Args: input_spec: input specification. """ super().__init__(input_spec) def read(self) -> DataFrame: """Read Kafka data. Returns: A dataframe containing the data from Kafka. 
""" df = ExecEnv.SESSION.readStream.load( format=InputFormat.KAFKA.value, **self._input_spec.options if self._input_spec.options else {}, ) return df ================================================ FILE: lakehouse_engine/io/readers/query_reader.py ================================================ """Module to define behaviour to read from a query.""" from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import InputSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.io.reader import Reader class QueryReader(Reader): """Class to read data from a query.""" def __init__(self, input_spec: InputSpec): """Construct QueryReader instances. Args: input_spec: input specification. """ super().__init__(input_spec) def read(self) -> DataFrame: """Read data from a query. Returns: A dataframe containing the data from the query. """ return ExecEnv.SESSION.sql(self._input_spec.query) ================================================ FILE: lakehouse_engine/io/readers/sap_b4_reader.py ================================================ """Module to define behaviour to read from SAP B4 sources.""" from logging import Logger from typing import Tuple from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import InputSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.io.reader import Reader from lakehouse_engine.utils.extraction.sap_b4_extraction_utils import ( ADSOTypes, SAPB4Extraction, SAPB4ExtractionUtils, ) from lakehouse_engine.utils.logging_handler import LoggingHandler class SAPB4Reader(Reader): """Class to read from SAP B4 source.""" _LOGGER: Logger = LoggingHandler(__name__).get_logger() def __init__(self, input_spec: InputSpec): """Construct SAPB4Reader instances. Args: input_spec: input specification. """ super().__init__(input_spec) self.jdbc_utils = self._get_jdbc_utils() def read(self) -> DataFrame: """Read data from SAP B4 source. 
def _get_jdbc_utils(self) -> SAPB4ExtractionUtils:
    """Build the SAP B4 extraction utilities from the input spec options.

    The options `user`, `password`, `url`, `dbtable` and `adso_type` are read
    by key and are therefore mandatory. Every other setting falls back to the
    corresponding `SAPB4Extraction` class attribute when it is not provided.

    Note that `dict.get` evaluates its default eagerly, so when `adso_type` is
    not AQ the `changelog_table` option must be present (a `KeyError` is
    raised otherwise).

    Returns:
        A SAPB4ExtractionUtils wrapping the assembled SAPB4Extraction.
    """
    spec = self._input_spec
    opts = spec.options
    defaults = SAPB4Extraction

    extraction = SAPB4Extraction(
        user=opts["user"],
        password=opts["password"],
        url=opts["url"],
        dbtable=opts["dbtable"],
        adso_type=opts["adso_type"],
        request_status_tbl=opts.get(
            "request_status_tbl", defaults.request_status_tbl
        ),
        changelog_table=opts.get(
            "changelog_table",
            (
                opts["dbtable"]
                if opts["adso_type"] == ADSOTypes.AQ.value
                else opts["changelog_table"]
            ),
        ),
        data_target=SAPB4ExtractionUtils.get_data_target(opts),
        act_req_join_condition=opts.get(
            "act_req_join_condition", defaults.act_req_join_condition
        ),
        latest_timestamp_data_location=opts.get(
            "latest_timestamp_data_location",
            defaults.latest_timestamp_data_location,
        ),
        latest_timestamp_input_col=opts.get(
            "latest_timestamp_input_col",
            defaults.latest_timestamp_input_col,
        ),
        latest_timestamp_data_format=opts.get(
            "latest_timestamp_data_format",
            defaults.latest_timestamp_data_format,
        ),
        extraction_type=opts.get("extraction_type", defaults.extraction_type),
        driver=opts.get("driver", defaults.driver),
        num_partitions=opts.get("numPartitions", defaults.num_partitions),
        partition_column=opts.get("partitionColumn", defaults.partition_column),
        lower_bound=opts.get("lowerBound", defaults.lower_bound),
        upper_bound=opts.get("upperBound", defaults.upper_bound),
        default_upper_bound=opts.get(
            "default_upper_bound", defaults.default_upper_bound
        ),
        fetch_size=opts.get("fetchSize", defaults.fetch_size),
        compress=opts.get("compress", defaults.compress),
        custom_schema=opts.get("customSchema", defaults.custom_schema),
        extraction_timestamp=opts.get(
            "extraction_timestamp",
            defaults.extraction_timestamp,
        ),
        min_timestamp=opts.get("min_timestamp", defaults.min_timestamp),
        max_timestamp=opts.get("max_timestamp", defaults.max_timestamp),
        default_max_timestamp=opts.get(
            "default_max_timestamp", defaults.default_max_timestamp
        ),
        default_min_timestamp=opts.get(
            "default_min_timestamp", defaults.default_min_timestamp
        ),
        max_timestamp_custom_schema=opts.get(
            "max_timestamp_custom_schema",
            defaults.max_timestamp_custom_schema,
        ),
        generate_predicates=spec.generate_predicates,
        predicates=opts.get("predicates", defaults.predicates),
        predicates_add_null=spec.predicates_add_null,
        extra_cols_req_status_tbl=opts.get(
            "extra_cols_req_status_tbl", defaults.extra_cols_req_status_tbl
        ),
        calc_upper_bound_schema=spec.calc_upper_bound_schema,
        # Technical changelog columns are included by default for every ADSO
        # type except AQ.
        include_changelog_tech_cols=opts.get(
            "include_changelog_tech_cols",
            not (opts["adso_type"] == ADSOTypes.AQ.value),
        ),
    )
    return SAPB4ExtractionUtils(extraction)
def read(self) -> DataFrame:
    """Read data from the SAP BW source through Spark JDBC.

    Returns:
        A dataframe containing the data from the SAP BW source.
    """
    spark_options, jdbc_kwargs = self._get_options()
    jdbc_reader = ExecEnv.SESSION.read.options(**spark_options)
    return jdbc_reader.jdbc(**jdbc_kwargs)
def _get_options(self) -> Tuple[dict, dict]:
    """Assemble the Spark options and JDBC arguments for the read.

    When predicates are generated or explicitly provided, the partitioning
    option names are handed to the JDBC utilities as a third argument.
    Otherwise the upper bound is optionally calculated before the additional
    options are resolved.

    Returns:
        A tuple with the options args and the jdbc args to be passed to Spark.
    """
    spec = self._input_spec
    self._LOGGER.info(
        f"Initial options passed to the SAP BW Reader: {spec.options}"
    )

    options_args, jdbc_args = self.jdbc_utils.get_spark_jdbc_options()

    uses_predicates = spec.generate_predicates or spec.options.get(
        "predicates", None
    )
    if uses_predicates:
        additional_options = self.jdbc_utils.get_additional_spark_options(
            spec,
            options_args,
            ["partitionColumn", "numPartitions", "lowerBound", "upperBound"],
        )
    else:
        if spec.calculate_upper_bound:
            options_args["upperBound"] = (
                self.jdbc_utils.get_spark_jdbc_optimal_upper_bound()
            )
        additional_options = self.jdbc_utils.get_additional_spark_options(
            spec, options_args
        )
    options_args.update(additional_options)

    self._LOGGER.info(
        f"Final options to fill SAP BW Reader Options: {options_args}"
    )
    self._LOGGER.info(f"Final jdbc args to fill SAP BW Reader JDBC: {jdbc_args}")
    return options_args, jdbc_args
@classmethod
def _read_files(
    cls, filename: str, sftp_file: SFTPFile, option_args: dict, files_format: str
) -> PandasDataFrame:
    """Open and decompress files to be extracted from SFTP.

    The pandas reader is resolved from the format (`pd.read_<files_format>`).
    Files ending in `.gz` are decompressed on the fly, files ending in `.zip`
    have all their members read and concatenated, and any other file is
    handed to the pandas reader directly.

    For zip files, to avoid data type inferred issues during the iteration,
    all data will be read as string. Directory entries inside the archive are
    skipped. Archives that yield no rows (including archives without any file
    member) result in an empty dataframe and the file name is logged, so the
    caller can ignore them.

    Args:
        filename: the filename to be read.
        sftp_file: SFTPFile object representing the open file.
        option_args: options from the acon, forwarded to the pandas reader.
        files_format: a string containing the file extension.

    Returns:
        A pandas dataframe with data from the file.
    """
    reader = getattr(pd, f"read_{files_format}")

    if filename.endswith(".gz"):
        with gzip.GzipFile(fileobj=sftp_file, mode="rb") as gz_file:
            return reader(
                TextIOWrapper(gz_file),  # type: ignore
                **option_args,
            )

    if filename.endswith(".zip"):
        frames = []
        with ZipFile(sftp_file, "r") as zf:  # type: ignore
            for member in zf.infolist():
                # Directory entries carry no data and would make the pandas
                # reader fail for the whole archive.
                if member.is_dir():
                    continue
                with zf.open(member) as member_file:
                    frames.append(
                        reader(TextIOWrapper(member_file), **option_args).fillna("")
                    )

        # Concatenate only once; pd.concat raises on an empty list, so an
        # archive without file members is treated like an empty file.
        combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        if combined.empty:
            cls._logger.info(f"{filename} - Empty or malformed file.")
            return pd.DataFrame()
        return combined.astype(str)

    return reader(
        sftp_file,
        **option_args,
    )
def read(self) -> DataFrame:
    """Read from Sharepoint by delegating to a format-specific reader.

    The Sharepoint options are validated first, streaming reads are rejected,
    and the concrete reader is obtained from `SharepointReaderFactory`.

    Returns:
        Spark DataFrame.

    Raises:
        NotSupportedException: Streaming input was requested.
    """
    spec = self._input_spec
    spec.sharepoint_opts.validate_for_reader()

    if spec.read_type == "streaming":
        raise NotSupportedException(
            "Sharepoint reader doesn't support streaming input."
        )

    concrete_reader = SharepointReaderFactory.get_reader(spec)
    return concrete_reader.read()
def _load_and_archive_file(self, sp_file: SharepointFile) -> DataFrame:
    """Load a single Sharepoint CSV into Spark and archive it afterwards.

    Steps:
    - Derive the base folder from `self.file_path` (its parent when the last
      path component contains a dot, the path itself otherwise), falling back
      to the file's `_folder` attribute when no file path is set.
    - Build the success/error archive folders from the configured subfolders
      (defaults: "done"/"error").
    - Load the file via `_load_csv_to_spark` inside a staging area.
    - Always call `archive_sharepoint_file` in the `finally` clause, targeting
      the error folder unless the load succeeded.

    Args:
        sp_file: File metadata and content.

    Returns:
        Spark DataFrame.

    Raises:
        ValueError: Empty content.
        Exception: Any staging or read failure is logged and re-raised.
    """
    if self.file_path:
        # A dot in the last path component is treated as "this is a file".
        base_folder = (
            str(Path(self.file_path).parent)
            if "." in Path(self.file_path).name
            else str(Path(self.file_path))
        )
    else:
        base_folder = sp_file._folder if getattr(sp_file, "_folder", None) else None

    success_subfolder = self.opts.archive_success_subfolder or "done"
    error_subfolder = self.opts.archive_error_subfolder or "error"

    # Without a base folder there is no archive destination (None).
    success_folder = f"{base_folder}/{success_subfolder}" if base_folder else None
    error_folder = f"{base_folder}/{error_subfolder}" if base_folder else None

    archive_target = error_folder  # default to error unless full read succeeds

    try:
        # IMPORTANT: empty check inside try so finally always runs
        if not sp_file.content:
            raise ValueError(
                f"File '{getattr(sp_file, 'file_path', None) or self.file_path}' "
                "is empty or could not be downloaded."
            )

        with self.sharepoint_utils.staging_area() as tmp_dir_raw:
            tmp_dir: Path = Path(tmp_dir_raw)
            # sp_file is rebound to the object returned by the loader, so the
            # `finally` clause archives that object.
            sp_file, df = self._load_csv_to_spark(sp_file, tmp_dir)

            archive_target = success_folder  # only mark success after full read
            _LOGGER.info(
                f"Successfully read '{sp_file.file_path}' into Spark DataFrame."
            )
            df = df.cache()
            df.count()  # Force materialization
            return df

    except Exception as e:
        _LOGGER.error(f"Error processing '{sp_file.file_name}': {e}")
        raise

    finally:
        # Runs on both success and failure; archive_target selects the folder.
        self.sharepoint_utils.archive_sharepoint_file(
            sp_file=sp_file,
            to_path=archive_target,
            move_enabled=self.opts.archive_enabled,
        )
def _load_csv_to_spark(
    self, sp_file: SharepointFile, tmp_dir: Path
) -> tuple[SharepointFile, DataFrame]:
    """Stage a Sharepoint CSV locally and load it into a cached DataFrame.

    The file metadata is fetched again from its path, the file is saved to the
    staging area, the Spark CSV options are resolved from its content and the
    resulting DataFrame is cached and counted to force materialization.

    Args:
        sp_file: Sharepoint file metadata.
        tmp_dir: Local staging directory.

    Returns:
        (SharepointFile, Spark DataFrame).

    Raises:
        ValueError: The local copy could not be read by Spark.
    """
    sp_file = self.sharepoint_utils.get_file_metadata(sp_file.file_path)
    staged_path = self.sharepoint_utils.save_to_staging_area(sp_file)
    csv_options = self.resolve_spark_csv_options(sp_file.content)

    try:
        _LOGGER.info(f"Starting to read file: {sp_file.file_name}")
        started_at = time.time()

        csv_reader = ExecEnv.SESSION.read.format("csv").options(**csv_options)
        loaded_df = csv_reader.load(str(staged_path)).cache()

        _LOGGER.info(
            f"Finished reading file: {sp_file.file_name} "
            f"in {round(time.time() - started_at, 2)} seconds"
        )
        loaded_df.count()  # force materialization
    except Exception as e:
        _LOGGER.error(
            f"Failed to read local copy of Sharepoint file: {staged_path}",
            exc_info=True,
        )
        raise ValueError(
            f"Failed to read Sharepoint file: '{sp_file.file_path}'."
        ) from e
    else:
        return sp_file, loaded_df
def _validate_and_read_file(
    self,
    file: SharepointFile,
    tmp_dir: Path,
    base_schema: Optional[StructType],
) -> tuple[SharepointFile, DataFrame]:
    """Validate schema and read CSV file into a Spark DataFrame.

    When the loaded schema differs from `base_schema`, the file is archived
    to the error folder and flagged as already archived, so that a later
    error handler checking `_already_archived` does not archive it again.

    Args:
        file: Sharepoint file to read.
        tmp_dir: Temporary staging directory.
        base_schema: Schema to validate against (no validation when falsy).

    Returns:
        (validated SharepointFile, DataFrame).

    Raises:
        ValueError: Schema mismatch.
    """
    file_with_content, df = self._load_csv_to_spark(file, tmp_dir)

    if base_schema and df.schema != base_schema:
        _LOGGER.error(
            f"Schema mismatch in '{file.file_name}'. \n"
            f"Expected: {base_schema}, Found: {df.schema}"
        )
        self.sharepoint_utils.archive_sharepoint_file(
            sp_file=file_with_content,
            to_path=self.error_folder,
            move_enabled=self.opts.archive_enabled,
        )
        # The caller only holds `file`, while the archived object is
        # `file_with_content`; flag both so the duplicate-archive guard in
        # the error handler actually takes effect.
        file._already_archived = True
        file_with_content._already_archived = True
        raise ValueError(f"Schema mismatch in '{file.file_name}'")

    return file_with_content, df
def resolve_spark_csv_options(self, file_content: bytes) -> dict:
    """Build the Spark CSV read options, settling on a single delimiter.

    A user-supplied `sep` takes precedence over `delimiter`; when neither is
    present the delimiter is auto-detected. If `expected_columns` is given,
    the CSV header is compared against it (case-insensitively, ignoring
    surrounding whitespace) and a mismatch only produces a warning.

    Args:
        file_content: Raw file bytes.

    Returns:
        Dict of Spark CSV options with `sep` set and `delimiter` removed.
    """
    local_options = self._input_spec.sharepoint_opts.local_options or {}

    # "sep" wins over "delimiter"; None triggers auto-detection.
    user_delimiter = (
        local_options["sep"]
        if "sep" in local_options
        else local_options.get("delimiter")
    )
    expected_columns = local_options.get("expected_columns")

    final_delimiter = self.detect_delimiter(
        file_content=file_content,
        provided_delimiter=user_delimiter,
        expected_columns=expected_columns,
    )

    # Header validation is advisory only: any failure is logged, never raised.
    if expected_columns:
        try:
            first_line = file_content.decode("utf-8").splitlines()[0].strip()
            actual_columns = [
                name.strip() for name in first_line.split(final_delimiter)
            ]
            wanted = [str(name).strip().lower() for name in expected_columns]
            found = [name.strip().lower() for name in actual_columns]
            if found != wanted:
                _LOGGER.warning(
                    "Expected columns don't match CSV header using delimiter '%s'. "
                    "Expected: %s vs. Actual: %s. The read will proceed; "
                    "consider specifying the correct delimiter or "
                    "updating expected_columns.",
                    final_delimiter,
                    expected_columns,
                    actual_columns,
                )
        except Exception as e:
            _LOGGER.warning(
                "Failed to validate expected_columns against CSV header. "
                "The read will proceed. Reason: %s",
                e,
            )

    # Fall back to a comma if no delimiter came out of the detection, and
    # keep only "sep" because that is the option name Spark uses.
    spark_options = {**local_options, "sep": final_delimiter or ","}
    spark_options.pop("delimiter", None)
    return spark_options
class SharepointReaderFactory:
    """Select the correct Sharepoint reader based on file type, file name, folder path.

    The file format is resolved from the Sharepoint options of the input
    specification and mapped to a concrete reader class.
    """

    @staticmethod
    def get_reader(input_spec: InputSpec) -> SharepointReader:
        """Select the appropriate Sharepoint reader based on input specification.

        Resolution order:
            1. Use file extension from `file_name` if provided.
            2. If `folder_relative_path` includes a file with extension, use that.
            3. If neither applies, use `file_type` (with or without a leading
               dot).

        Args:
            input_spec: InputSpec with Sharepoint options.

        Returns:
            Reader instance for the resolved file type.

        Raises:
            ValueError: If file format is unsupported or cannot be determined.
        """
        opts = input_spec.sharepoint_opts

        # 1. If reading a specific file, use file_name
        if opts.file_name:
            ext = Path(opts.file_name).suffix.lower()
        # 2. If folder_relative_path includes extension, treat it as full path
        elif opts.folder_relative_path and "." in Path(opts.folder_relative_path).name:
            ext = Path(opts.folder_relative_path).suffix.lower()
        # 3. Otherwise, rely on file_type
        elif opts.file_type:
            ext = f".{opts.file_type.lower().lstrip('.')}"
        else:
            raise ValueError(
                "Cannot determine file format. \n"
                "Please provide `file_name`, a full file path in "
                "`folder_relative_path`, or explicitly set `file_type`."
            )

        readers = {
            ".csv": SharepointCsvReader,
            ".xlsx": SharepointExcelReader,
        }

        # Resolve the class before instantiating it, so that a KeyError raised
        # inside a reader constructor is not mistaken for an unsupported
        # format.
        reader_cls = readers.get(ext)
        if reader_cls is None:
            raise ValueError(f"Unsupported file format: {ext}")

        _LOGGER.info(f"Detected {ext} read mode.")
        return reader_cls(input_spec)
class Writer(ABC):
    """Base class holding the behaviour shared by all writers."""

    def __init__(
        self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict = None
    ):
        """Store the state every concrete writer relies on.

        Args:
            output_spec: output specification to write data.
            df: dataframe to write.
            data: list of all dfs generated on previous steps before writer.
        """
        # The logger is named after the concrete subclass being instantiated.
        self._logger = LoggingHandler(self.__class__.__name__).get_logger()
        self._output_spec = output_spec
        self._df = df
        self._data = data

    @abstractmethod
    def write(self) -> Optional[OrderedDict]:
        """Write the dataframe; every concrete writer must implement this."""
        raise NotImplementedError

    @staticmethod
    def write_transformed_micro_batch(**kwargs: Any) -> Callable:
        """Define how to write a streaming micro batch after transforming it.

        Implementations build an inner function that handles one streaming
        batch and return it. This default returns a function that only logs a
        warning.

        Args:
            kwargs: any keyword arguments.

        Returns:
            A function to be executed in the foreachBatch spark write method.
        """

        def inner(batch_df: DataFrame, batch_id: int) -> None:
            LoggingHandler(__name__).get_logger().warning(
                "Skipping transform micro batch... nothing to do."
            )

        return inner

    @classmethod
    def get_transformed_micro_batch(
        cls,
        output_spec: OutputSpec,
        batch_df: DataFrame,
        batch_id: int,
        data: OrderedDict,
    ) -> DataFrame:
        """Get the result of the transformations applied to a micro batch dataframe.

        Args:
            output_spec: output specification associated with the writer.
            batch_df: batch dataframe (given from streaming foreachBatch).
            batch_id: id of the batch (given from streaming foreachBatch).
            data: list of all dfs generated on previous steps before writer
                to be available on micro batch transforms.

        Returns:
            The transformed dataframe.
        """
        if output_spec.with_batch_id:
            result = batch_df.withColumn("lhe_batch_id", lit(batch_id))
        else:
            result = batch_df

        # Transformers are applied in the order they are declared in the spec.
        for transformer_spec in output_spec.streaming_micro_batch_transformers:
            transform_fn = TransformerFactory.get_transformer(transformer_spec, data)
            result = result.transform(transform_fn)

        return result

    @classmethod
    def get_streaming_trigger(cls, output_spec: OutputSpec) -> Dict:
        """Define which streaming trigger will be used.

        The first trigger option set on the output spec wins, checked in this
        order: available now, once, processing time, continuous.

        Args:
            output_spec: output specification.

        Returns:
            A dict containing streaming trigger.

        Raises:
            NotImplementedError: if no trigger option is set on the output spec.
        """
        trigger_attributes = (
            ("availableNow", "streaming_available_now"),
            ("once", "streaming_once"),
            ("processingTime", "streaming_processing_time"),
            ("continuous", "streaming_continuous"),
        )
        for trigger_name, attribute in trigger_attributes:
            value = getattr(output_spec, attribute)
            if value:
                return {trigger_name: value}

        raise NotImplementedError(
            "The requested output spec streaming trigger is not supported."
        )

    @staticmethod
    def run_micro_batch_dq_process(df: DataFrame, dq_spec: List[DQSpec]) -> DataFrame:
        """Run the data quality process in a streaming micro batch dataframe.

        Each data quality specification is run against the input dataframe.

        Args:
            df: the dataframe in which to run the dq process on.
            dq_spec: data quality specification.

        Returns:
            The dataframe returned for the last spec, or `df` itself when
            there are no specs.
        """
        # Imported at call time rather than at module import time.
        from lakehouse_engine.dq_processors.dq_factory import DQFactory

        result = df
        for current_spec in dq_spec:
            # NOTE(review): every spec receives the original `df`, so only the
            # output of the last spec is returned - confirm whether the specs
            # were meant to be chained on the previous result instead.
            result = DQFactory.run_dq_process(current_spec, df)

        return result
@classmethod
def _get_writer_name(cls, spec: OutputSpec) -> str:
    """Get the writer name according to the output specification.

    Args:
        spec: output specification to write data.

    Returns:
        The name of the writer that will be created to write the data.
    """
    is_merge = spec.write_type == WriteType.MERGE.value

    # A target table without a merge is handled by the table writer.
    if spec.db_table and not is_merge:
        return OutputFormat.TABLE.value

    # Merges into a table or into delta files go to the delta merge writer.
    if is_merge and (
        spec.data_format == OutputFormat.DELTAFILES.value or spec.db_table
    ):
        return OutputFormat.DELTAFILES.value

    if spec.data_format in FILE_OUTPUT_FORMATS:
        return OutputFormat.FILE.value

    # Any other format is used as the writer name directly.
    return spec.data_format
class ConsoleWriter(Writer):
    """Writer that shows dataframes on the console."""

    _logger = LoggingHandler(__name__).get_logger()

    def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
        """Construct ConsoleWriter instances.

        Args:
            output_spec: output specification
            df: dataframe to be written.
            data: list of all dfs generated on previous steps before writer.
        """
        super().__init__(output_spec, df, data)

    def write(self) -> None:
        """Show the dataframe, either directly or per streaming micro batch."""
        # The show helpers call `.get` on the options, so they must be a dict.
        if not self._output_spec.options:
            self._output_spec.options = {}

        if self._df.isStreaming:
            self._logger.info("Stream Dataframe preview:")
            self._write_to_console_in_streaming_mode(
                self._df, self._output_spec, self._data
            )
            return

        self._logger.info("Dataframe preview:")
        self._show_df(self._df, self._output_spec)

    @staticmethod
    def _show_df(df: DataFrame, output_spec: OutputSpec) -> None:
        """Show a dataframe using Spark's show function.

        The `limit`, `truncate` and `vertical` options of the output spec are
        forwarded to `show`, defaulting to 20, True and False respectively.

        Args:
            df: dataframe to be shown.
            output_spec: output specification.
        """
        show_options = output_spec.options
        df.show(
            n=show_options.get("limit", 20),
            truncate=show_options.get("truncate", True),
            vertical=show_options.get("vertical", False),
        )

    @staticmethod
    def _show_streaming_df(output_spec: OutputSpec) -> Callable:
        """Define how to show a streaming df.

        Args:
            output_spec: output specification.

        Returns:
            A function to show df in the foreachBatch spark write method.
        """

        def inner(batch_df: DataFrame, batch_id: int) -> None:
            ExecEnv.get_for_each_batch_session(batch_df)
            ConsoleWriter._logger.info(f"Showing DF for batch {batch_id}")
            ConsoleWriter._show_df(batch_df, output_spec)

        return inner

    @staticmethod
    def _write_to_console_in_streaming_mode(
        df: DataFrame, output_spec: OutputSpec, data: OrderedDict
    ) -> None:
        """Write to console in streaming mode.

        Args:
            df: dataframe to write.
            output_spec: output specification.
            data: list of all dfs generated on previous steps before writer.
        """
        stream_writer = df.writeStream.trigger(
            **Writer.get_streaming_trigger(output_spec)
        )

        # Micro batch transformers or dq processors require the transforming
        # callback; otherwise each batch is shown as is.
        has_micro_batch_steps = (
            output_spec.streaming_micro_batch_transformers
            or output_spec.streaming_micro_batch_dq_processors
        )
        if has_micro_batch_steps:
            batch_callback = ConsoleWriter._write_transformed_micro_batch(
                output_spec, data
            )
        else:
            batch_callback = ConsoleWriter._show_streaming_df(output_spec)

        streaming_query = stream_writer.foreachBatch(batch_callback).start()

        if output_spec.streaming_await_termination:
            streaming_query.awaitTermination(
                output_spec.streaming_await_termination_timeout
            )

    @staticmethod
    def _write_transformed_micro_batch(  # type: ignore
        output_spec: OutputSpec, data: OrderedDict
    ) -> Callable:
        """Define how to write a streaming micro batch after transforming it.

        Args:
            output_spec: output specification.
            data: list of all dfs generated on previous steps before writer.

        Returns:
            A function to be executed in the foreachBatch spark write method.
        """

        def inner(batch_df: DataFrame, batch_id: int) -> None:
            ExecEnv.get_for_each_batch_session(batch_df)

            micro_batch_df = Writer.get_transformed_micro_batch(
                output_spec, batch_df, batch_id, data
            )

            dq_processors = output_spec.streaming_micro_batch_dq_processors
            if dq_processors:
                micro_batch_df = Writer.run_micro_batch_dq_process(
                    micro_batch_df, dq_processors
                )

            ConsoleWriter._show_df(micro_batch_df, output_spec)

        return inner
def write(self) -> Optional[OrderedDict]:
    """Write data to dataframe.

    A batch dataframe is returned as is. A streaming dataframe is first
    consumed through `_write_to_dataframe_in_streaming_mode` and the
    dataframe it returns is used instead.

    Returns:
        An ordered dict mapping the output spec id to the resulting dataframe.

    Raises:
        NotSupportedException: if the output spec sets a processing time or
            continuous streaming trigger.
    """
    self._output_spec.options = (
        self._output_spec.options if self._output_spec.options else {}
    )
    written_dfs: OrderedDict = OrderedDict({})

    if (
        self._output_spec.streaming_processing_time
        or self._output_spec.streaming_continuous
    ):
        # Plain `{...}` interpolation: the previous `${...}` leaked a literal
        # `$` in front of the spec id in the error message.
        raise NotSupportedException(
            "DataFrame writer doesn't support "
            "processing time or continuous streaming "
            f"for step {self._output_spec.spec_id}."
        )

    if self._df.isStreaming:
        output_df = self._write_to_dataframe_in_streaming_mode(
            self._df, self._output_spec, self._data
        )
    else:
        output_df = self._df

    written_dfs[self._output_spec.spec_id] = output_df

    return written_dfs
def _write_to_dataframe_in_streaming_mode(
    self, df: DataFrame, output_spec: OutputSpec, data: OrderedDict
) -> DataFrame:
    """Write to DataFrame in streaming mode.

    Each micro batch is written into a temp view through foreachBatch, and
    that view is read back as a dataframe once the stream has been started
    (and awaited, when configured).

    Args:
        df: dataframe to write.
        output_spec: output specification.
        data: list of all dfs generated on previous steps before writer.

    Returns:
        The content of the temp view, or an empty dataframe with an empty
        schema when the view does not exist.
    """
    view_name = f"`{uuid.uuid4()}_{output_spec.spec_id}`"
    qualified_view_name = self._get_prefixed_view_name(view_name)

    self._logger.info("Drop temp view if exists")
    if self._table_exists(view_name):
        # Cleaning Temp view to not maintain state and impact
        # consecutive acon runs
        ExecEnv.SESSION.sql(f"DROP VIEW {qualified_view_name}")

    stream_writer = df.writeStream.trigger(
        **Writer.get_streaming_trigger(output_spec)
    )

    # Only the foreachBatch callback differs between the two modes.
    if (
        output_spec.streaming_micro_batch_transformers
        or output_spec.streaming_micro_batch_dq_processors
    ):
        batch_callback = self._write_transformed_micro_batch(
            output_spec, data, view_name
        )
    else:
        batch_callback = self._write_streaming_df(view_name)

    streaming_query = (
        stream_writer.options(**(output_spec.options or {}))
        .format(OutputFormat.NOOP.value)
        .foreachBatch(batch_callback)
        .start()
    )

    if output_spec.streaming_await_termination:
        streaming_query.awaitTermination(
            output_spec.streaming_await_termination_timeout
        )

    self._logger.info("Reading stream data as df if exists")
    if self._table_exists(view_name):
        return ExecEnv.SESSION.table(qualified_view_name)

    self._logger.info(
        "DataFrame writer couldn't find any data to return "
        "for streaming, check if you are using checkpoint "
        f"for step {output_spec.spec_id}."
    )
    return ExecEnv.SESSION.createDataFrame(data=[], schema=StructType([]))
@staticmethod
def _get_delta_table(output_spec: OutputSpec) -> DeltaTable:
    """Get the delta table given an output specification w/ table name or location.

    The table name takes precedence; the location is only used when no table
    name is set and the data format is delta files.

    Args:
        output_spec: output specification.

    Returns:
        DeltaTable: the delta table instance.

    Raises:
        WrongIOFormatException: if there is no table name and the data format
            is not delta files.
    """
    if output_spec.db_table:
        return DeltaTable.forName(ExecEnv.SESSION, output_spec.db_table)

    if output_spec.data_format == OutputFormat.DELTAFILES.value:
        return DeltaTable.forPath(ExecEnv.SESSION, output_spec.location)

    raise WrongIOFormatException(
        f"{output_spec.data_format} is not compatible with Delta Merge Writer."
    )
@staticmethod
def _update(
    delta_merge: DeltaMergeBuilder,
    update_predicate: Optional[str],
    update_column_set: Optional[dict],
) -> DeltaMergeBuilder:
    """Get the builder of merge data with update predicate and column set.

    Args:
        delta_merge: builder of the merge data.
        update_predicate: condition of the update.
        update_column_set: rules for setting the values of
            columns that need to be updated.

    Returns:
        DeltaMergeBuilder: builder of the merge data with update.
    """
    # The condition is only forwarded when a predicate was provided.
    condition_kwargs = {"condition": update_predicate} if update_predicate else {}

    # An explicit column set updates just those columns, otherwise all columns.
    if update_column_set:
        return delta_merge.whenMatchedUpdate(
            set=update_column_set, **condition_kwargs
        )

    return delta_merge.whenMatchedUpdateAll(**condition_kwargs)
class FileWriter(Writer):
    """Writer that persists dataframes as files."""

    def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
        """Construct FileWriter instances.

        Args:
            output_spec: output specification
            df: dataframe to be written.
            data: list of all dfs generated on previous steps before writer.
        """
        super().__init__(output_spec, df, data)

    def write(self) -> None:
        """Write data to files, in batch or streaming mode."""
        if self._df.isStreaming:
            self._write_to_files_in_streaming_mode(
                self._df, self._output_spec, self._data
            )
            return

        self._write_to_files_in_batch_mode(self._df, self._output_spec)

    @staticmethod
    def _write_to_files_in_batch_mode(df: DataFrame, output_spec: OutputSpec) -> None:
        """Write to files in batch mode.

        Args:
            df: dataframe to write.
            output_spec: output specification.
        """
        batch_writer = df.write.format(output_spec.data_format)
        batch_writer = batch_writer.partitionBy(output_spec.partitions)
        batch_writer = batch_writer.options(**(output_spec.options or {}))
        batch_writer.mode(output_spec.write_type).save(output_spec.location)

    @staticmethod
    def _write_to_files_in_streaming_mode(
        df: DataFrame, output_spec: OutputSpec, data: OrderedDict
    ) -> None:
        """Write to files in streaming mode.

        With micro batch transformers or dq processors configured, each batch
        is written through foreachBatch. Otherwise the stream is written
        directly to the output location.

        Args:
            df: dataframe to write.
            output_spec: output specification.
            data: list of all dfs generated on previous steps before writer.
        """
        stream_writer = df.writeStream.trigger(
            **Writer.get_streaming_trigger(output_spec)
        )
        writer_options = output_spec.options or {}

        has_micro_batch_steps = (
            output_spec.streaming_micro_batch_transformers
            or output_spec.streaming_micro_batch_dq_processors
        )
        if has_micro_batch_steps:
            batch_callback = FileWriter._write_transformed_micro_batch(
                output_spec, data
            )
            streaming_query = (
                stream_writer.options(**writer_options)
                .foreachBatch(batch_callback)
                .start()
            )
        else:
            streaming_query = (
                stream_writer.format(output_spec.data_format)
                .partitionBy(output_spec.partitions)
                .options(**writer_options)
                .outputMode(output_spec.write_type)
                .start(output_spec.location)
            )

        if output_spec.streaming_await_termination:
            streaming_query.awaitTermination(
                output_spec.streaming_await_termination_timeout
            )

    @staticmethod
    def _write_transformed_micro_batch(  # type: ignore
        output_spec: OutputSpec, data: OrderedDict
    ) -> Callable:
        """Define how to write a streaming micro batch after transforming it.

        Args:
            output_spec: output specification.
            data: list of all dfs generated on previous steps before writer.

        Returns:
            A function to be executed in the foreachBatch spark write method.
        """

        def inner(batch_df: DataFrame, batch_id: int) -> None:
            ExecEnv.get_for_each_batch_session(batch_df)

            micro_batch_df = Writer.get_transformed_micro_batch(
                output_spec, batch_df, batch_id, data
            )

            dq_processors = output_spec.streaming_micro_batch_dq_processors
            if dq_processors:
                micro_batch_df = Writer.run_micro_batch_dq_process(
                    micro_batch_df, dq_processors
                )

            # Each processed micro batch is written like a regular batch.
            FileWriter._write_to_files_in_batch_mode(micro_batch_df, output_spec)

        return inner
class KafkaWriter(Writer):
    """Writer that sends dataframes to a Kafka target."""

    def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
        """Construct KafkaWriter instances.

        Args:
            output_spec: output specification.
            df: dataframe to be written.
            data: list of all dfs generated on previous steps before writer.
        """
        super().__init__(output_spec, df, data)

    def write(self) -> None:
        """Write data to Kafka, in batch or streaming mode."""
        if self._df.isStreaming:
            self._write_to_kafka_in_streaming_mode(
                self._df, self._output_spec, self._data
            )
            return

        self._write_to_kafka_in_batch_mode(self._df, self._output_spec)

    @staticmethod
    def _write_to_kafka_in_batch_mode(df: DataFrame, output_spec: OutputSpec) -> None:
        """Write to Kafka in batch mode.

        Args:
            df: dataframe to write.
            output_spec: output specification.
        """
        batch_writer = df.write.format(output_spec.data_format)
        batch_writer = batch_writer.options(**(output_spec.options or {}))
        batch_writer.mode(output_spec.write_type).save()

    @staticmethod
    def _write_to_kafka_in_streaming_mode(
        df: DataFrame, output_spec: OutputSpec, data: OrderedDict
    ) -> None:
        """Write to kafka in streaming mode.

        With micro batch transformers or dq processors configured, each batch
        is written through foreachBatch. Otherwise the stream is started
        directly with the output spec data format and options.

        Args:
            df: dataframe to write.
            output_spec: output specification.
            data: list of all dfs generated on previous steps before writer.
        """
        stream_writer = df.writeStream.trigger(
            **Writer.get_streaming_trigger(output_spec)
        )
        writer_options = output_spec.options or {}

        has_micro_batch_steps = (
            output_spec.streaming_micro_batch_transformers
            or output_spec.streaming_micro_batch_dq_processors
        )
        if has_micro_batch_steps:
            batch_callback = KafkaWriter._write_transformed_micro_batch(
                output_spec, data
            )
            streaming_query = (
                stream_writer.options(**writer_options)
                .foreachBatch(batch_callback)
                .start()
            )
        else:
            streaming_query = (
                stream_writer.format(output_spec.data_format)
                .options(**writer_options)
                .start()
            )

        if output_spec.streaming_await_termination:
            streaming_query.awaitTermination(
                output_spec.streaming_await_termination_timeout
            )

    @staticmethod
    def _write_transformed_micro_batch(  # type: ignore
        output_spec: OutputSpec, data: OrderedDict
    ) -> Callable:
        """Define how to write a streaming micro batch after transforming it.

        Args:
            output_spec: output specification.
            data: list of all dfs generated on previous steps before writer.

        Returns:
            A function to be executed in the foreachBatch spark write method.
        """

        def inner(batch_df: DataFrame, batch_id: int) -> None:
            ExecEnv.get_for_each_batch_session(batch_df)

            micro_batch_df = Writer.get_transformed_micro_batch(
                output_spec, batch_df, batch_id, data
            )

            dq_processors = output_spec.streaming_micro_batch_dq_processors
            if dq_processors:
                micro_batch_df = Writer.run_micro_batch_dq_process(
                    micro_batch_df, dq_processors
                )

            # Each processed micro batch is written like a regular batch.
            KafkaWriter._write_to_kafka_in_batch_mode(micro_batch_df, output_spec)

        return inner
Args: output_spec: Output Specification containing configurations to communicate with the REST api. Within the output_spec, the user can specify several options: - rest_api_header: http headers. - rest_api_basic_auth: basic http authentication details (e.g., {"username": "x", "password": "y"}). - rest_api_url: url of the api. - rest_api_method: REST method (e.g., POST or PUT). - rest_api_sleep_seconds: sleep seconds to avoid throttling. - rest_api_is_file_payload: if the payload to be sent to the api is in the format of a file using multipart encoding upload. if this is true, then the payload will always be sent using the "files" parameter in Python's requests library. - rest_api_file_payload_name: when rest_api_is_file_payload is true, this option can be used to define the file identifier in Python's requests library. - extra_json_payload: when rest_api_file_payload_name is False, can be used to provide additional JSON variables to add to the original payload. This is useful to complement the original payload with some extra input to better configure the final payload to send to the REST api. An example can be to add a constant configuration value to add to the payload data. Returns: Function to be called inside Spark dataframe.foreach. 
""" headers = output_spec.options.get("rest_api_header", None) basic_auth_dict = output_spec.options.get("rest_api_basic_auth", None) url = output_spec.options["rest_api_url"] method = output_spec.options.get("rest_api_method", RestMethods.POST.value) sleep_seconds = output_spec.options.get("rest_api_sleep_seconds", 0) is_file_payload = output_spec.options.get("rest_api_is_file_payload", False) file_payload_name = output_spec.options.get( "rest_api_file_payload_name", "file" ) extra_json_payload = output_spec.options.get( "rest_api_extra_json_payload", None ) success_status_codes = output_spec.options.get( "rest_api_success_status_codes", RestStatusCodes.OK_STATUS_CODES.value ) def send_payload_to_rest_api(row: Row) -> Any: """Send payload to the REST API. The payload needs to be prepared as a JSON string column in a dataframe. E.g., {"a": "a value", "b": "b value"}. Args: row: a row in a dataframe. """ if "payload" not in row: raise ValueError("Input DataFrame must contain 'payload' column.") str_payload = row.payload payload = None if not is_file_payload: payload = json.loads(str_payload) else: payload = {file_payload_name: str_payload} if extra_json_payload: payload.update(extra_json_payload) RestApiWriter._logger.debug(f"Original payload: {str_payload}") RestApiWriter._logger.debug(f"Final payload: {payload}") response = execute_api_request( method=method, url=url, headers=headers, basic_auth_dict=basic_auth_dict, json=payload if not is_file_payload else None, files=payload if is_file_payload else None, sleep_seconds=sleep_seconds, ) RestApiWriter._logger.debug( f"Response: {response.status_code} - {response.text}" ) if response.status_code not in success_status_codes: raise RESTApiException( f"API response status code {response.status_code} is not in" f" {success_status_codes}. 
Got {response.text}" ) return send_payload_to_rest_api @staticmethod def _write_to_rest_api_in_batch_mode( df: DataFrame, output_spec: OutputSpec ) -> None: """Write to REST API in Spark batch mode. This function uses the dataframe.foreach function to generate a payload for each row of the dataframe and send it to the REST API endpoint. Warning! Make sure your execution environment supports RDD api operations, as there are environments where RDD operation may not be supported. As, df.foreach() is a shorthand for df.rdd.foreach(), this can bring issues in such environments. Args: df: dataframe to write. output_spec: output specification. """ df.foreach(RestApiWriter._get_func_to_send_payload_to_rest_api(output_spec)) @staticmethod def _write_to_rest_api_in_streaming_mode( df: DataFrame, output_spec: OutputSpec, data: OrderedDict ) -> None: """Write to REST API in streaming mode. Args: df: dataframe to write. output_spec: output specification. data: list of all dfs generated on previous steps before writer. """ df_writer = df.writeStream.trigger(**Writer.get_streaming_trigger(output_spec)) stream_df = ( df_writer.options(**output_spec.options if output_spec.options else {}) .foreachBatch( RestApiWriter._write_transformed_micro_batch(output_spec, data) ) .start() ) if output_spec.streaming_await_termination: stream_df.awaitTermination(output_spec.streaming_await_termination_timeout) @staticmethod def _write_transformed_micro_batch( # type: ignore output_spec: OutputSpec, data: OrderedDict ) -> Callable: """Define how to write a streaming micro batch after transforming it. Args: output_spec: output specification. data: list of all dfs generated on previous steps before writer. Returns: A function to be executed in the foreachBatch spark write method. 
""" def inner(batch_df: DataFrame, batch_id: int) -> None: ExecEnv.get_for_each_batch_session(batch_df) transformed_df = Writer.get_transformed_micro_batch( output_spec, batch_df, batch_id, data ) if output_spec.streaming_micro_batch_dq_processors: transformed_df = Writer.run_micro_batch_dq_process( transformed_df, output_spec.streaming_micro_batch_dq_processors ) RestApiWriter._write_to_rest_api_in_batch_mode(transformed_df, output_spec) return inner ================================================ FILE: lakehouse_engine/io/writers/sharepoint_writer.py ================================================ """Module to define the behaviour to write to Sharepoint.""" import os from typing import OrderedDict from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import OutputSpec from lakehouse_engine.io.exceptions import ( EndpointNotFoundException, NotSupportedException, WriteToLocalException, ) from lakehouse_engine.io.writer import Writer from lakehouse_engine.utils.logging_handler import LoggingHandler from lakehouse_engine.utils.sharepoint_utils import SharepointUtils class SharepointWriter(Writer): """Class to write data to Sharepoint. This writer is designed specifically for uploading a single file to Sharepoint. It first writes the data locally before uploading it to the specified Sharepoint location. Since it handles only a single file at a time, any logic for writing multiple files must be implemented on the notebook-side. """ def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict): """Construct FileWriter instances. Args: output_spec: output specification df: dataframe to be written. data: list of all dfs generated on previous steps before writer. 
""" super().__init__(output_spec, df, data) self.sharepoint_utils = self._get_sharepoint_utils() self._logger = LoggingHandler(__name__).get_logger() def write(self) -> None: """Upload data to Sharepoint.""" if self._df.isStreaming: raise NotSupportedException("Sharepoint writer doesn't support streaming!") self._output_spec.sharepoint_opts.validate_for_writer() if not self.sharepoint_utils.check_if_endpoint_exists( folder_root_path=self._output_spec.sharepoint_opts.folder_relative_path ): raise EndpointNotFoundException("The provided endpoint does not exist!") self._write_to_sharepoint_in_batch_mode(self._df) def _get_sharepoint_utils(self) -> SharepointUtils: sharepoint_utils = SharepointUtils( client_id=self._output_spec.sharepoint_opts.client_id, tenant_id=self._output_spec.sharepoint_opts.tenant_id, local_path=self._output_spec.sharepoint_opts.local_path, api_version=self._output_spec.sharepoint_opts.api_version, site_name=self._output_spec.sharepoint_opts.site_name, drive_name=self._output_spec.sharepoint_opts.drive_name, file_name=self._output_spec.sharepoint_opts.file_name, folder_relative_path=self._output_spec.sharepoint_opts.folder_relative_path, chunk_size=self._output_spec.sharepoint_opts.chunk_size, local_options=self._output_spec.sharepoint_opts.local_options, secret=self._output_spec.sharepoint_opts.secret, conflict_behaviour=self._output_spec.sharepoint_opts.conflict_behaviour, ) return sharepoint_utils def _write_to_sharepoint_in_batch_mode(self, df: DataFrame) -> None: """Write to Sharepoint in batch mode. This method first writes the provided DataFrame to a local file using the SharePointUtils `write_to_local_path` method. If the local file is successfully written, it then uploads the file to Sharepoint using the `write_to_sharepoint` method, logging the process and outcome. Args: df: The DataFrame to write to a local file and subsequently upload to Sharepoint. 
""" local_path = self._output_spec.sharepoint_opts.local_path file_name = self._output_spec.sharepoint_opts.file_name self._logger.info(f"Starting to write the data to the local path: {local_path}") try: self.sharepoint_utils.write_to_local_path(df) except IOError as err: self.sharepoint_utils.delete_local_path() self._logger.info(f"Deleted the local folder: {local_path}") raise WriteToLocalException( f"The data was not written on the local path: {local_path}" ) from err self._logger.info(f"The data was written to the local path: {local_path}") file_size = os.path.getsize(local_path) self._logger.info( f"Uploading the {file_name} ({file_size} bytes) to Sharepoint." ) self.sharepoint_utils.write_to_sharepoint() self._logger.info(f"The {file_name} was uploaded to Sharepoint with success!") self.sharepoint_utils.delete_local_path() self._logger.info(f"Deleted the local folder: {local_path}") ================================================ FILE: lakehouse_engine/io/writers/table_writer.py ================================================ """Module that defines the behaviour to write to tables.""" from typing import Any, Callable, OrderedDict from pyspark.sql import DataFrame from lakehouse_engine.core.definitions import OutputFormat, OutputSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.io.writer import Writer class TableWriter(Writer): """Class to write to a table.""" def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict): """Construct TableWriter instances. Args: output_spec: output specification. df: dataframe to be written. data: list of all dfs generated on previous steps before writer. """ super().__init__(output_spec, df, data) def write(self) -> None: """Write data to a table. After the write operation we repair the table (e.g., update partitions). However, there's a caveat to this, which is the fact that this repair operation is not reachable if we are running long-running streaming mode. 
Therefore, we recommend not using the TableWriter with formats other than delta lake for those scenarios (as delta lake does not need msck repair). So, you can: 1) use delta lake format for the table; 2) use the FileWriter and run the repair with a certain frequency in a separate task of your pipeline. """ if not self._df.isStreaming: self._write_to_table_in_batch_mode(self._df, self._output_spec) else: df_writer = self._df.writeStream.trigger( **Writer.get_streaming_trigger(self._output_spec) ) if ( self._output_spec.streaming_micro_batch_transformers or self._output_spec.streaming_micro_batch_dq_processors ): stream_df = ( df_writer.options( **self._output_spec.options if self._output_spec.options else {} ) .foreachBatch( self._write_transformed_micro_batch( self._output_spec, self._data ) ) .start() ) if self._output_spec.streaming_await_termination: stream_df.awaitTermination( self._output_spec.streaming_await_termination_timeout ) else: self._write_to_table_in_streaming_mode(df_writer, self._output_spec) if ( self._output_spec.data_format != OutputFormat.DELTAFILES.value and self._output_spec.partitions ): ExecEnv.SESSION.sql(f"MSCK REPAIR TABLE {self._output_spec.db_table}") @staticmethod def _write_to_table_in_batch_mode(df: DataFrame, output_spec: OutputSpec) -> None: """Write to a metastore table in batch mode. Args: df: dataframe to write. output_spec: output specification. 
""" df_writer = df.write.format(output_spec.data_format) if output_spec.partitions: df_writer = df_writer.partitionBy(output_spec.partitions) if output_spec.location: df_writer = df_writer.options( path=output_spec.location, **output_spec.options if output_spec.options else {}, ) else: df_writer = df_writer.options( **output_spec.options if output_spec.options else {} ) df_writer.mode(output_spec.write_type).saveAsTable(output_spec.db_table) @staticmethod def _write_to_table_in_streaming_mode( df_writer: Any, output_spec: OutputSpec ) -> None: """Write to a metastore table in streaming mode. Args: df_writer: dataframe writer. output_spec: output specification. """ df_writer = df_writer.outputMode(output_spec.write_type).format( output_spec.data_format ) if output_spec.partitions: df_writer = df_writer.partitionBy(output_spec.partitions) if output_spec.location: df_writer = df_writer.options( path=output_spec.location, **output_spec.options if output_spec.options else {}, ) else: df_writer = df_writer.options( **output_spec.options if output_spec.options else {} ) if output_spec.streaming_await_termination: df_writer.toTable(output_spec.db_table).awaitTermination( output_spec.streaming_await_termination_timeout ) else: df_writer.toTable(output_spec.db_table) @staticmethod def _write_transformed_micro_batch( # type: ignore output_spec: OutputSpec, data: OrderedDict ) -> Callable: """Define how to write a streaming micro batch after transforming it. Args: output_spec: output specification. data: list of all dfs generated on previous steps before writer. Returns: A function to be executed in the foreachBatch spark write method. 
""" def inner(batch_df: DataFrame, batch_id: int) -> None: ExecEnv.get_for_each_batch_session(batch_df) transformed_df = Writer.get_transformed_micro_batch( output_spec, batch_df, batch_id, data ) if output_spec.streaming_micro_batch_dq_processors: transformed_df = Writer.run_micro_batch_dq_process( transformed_df, output_spec.streaming_micro_batch_dq_processors ) TableWriter._write_to_table_in_batch_mode(transformed_df, output_spec) return inner ================================================ FILE: lakehouse_engine/terminators/__init__.py ================================================ """Package to define algorithm terminators (e.g., vacuum, optimize, compute stats).""" ================================================ FILE: lakehouse_engine/terminators/cdf_processor.py ================================================ """Defines change data feed processor behaviour.""" from datetime import datetime, timedelta from typing import OrderedDict from delta.tables import DeltaTable from pyspark.sql import DataFrame from pyspark.sql.functions import col, date_format from lakehouse_engine.core.definitions import ( InputSpec, OutputFormat, OutputSpec, ReadType, TerminatorSpec, WriteType, ) from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.io.reader_factory import ReaderFactory from lakehouse_engine.io.writer_factory import WriterFactory from lakehouse_engine.utils.logging_handler import LoggingHandler class CDFProcessor(object): """Change data feed processor class.""" _logger = LoggingHandler(__name__).get_logger() @classmethod def expose_cdf(cls, spec: TerminatorSpec) -> None: """Expose CDF to external location. Args: spec: terminator specification. 
""" cls._logger.info("Reading CDF from input table...") df_cdf = ReaderFactory.get_data(cls._get_table_cdf_input_specs(spec)) new_df_cdf = df_cdf.withColumn( "_commit_timestamp", date_format(col("_commit_timestamp"), "yyyyMMddHHmmss"), ) cls._logger.info("Writing CDF to external table...") cls._write_cdf_to_external( spec, new_df_cdf.repartition( spec.args.get( "materialized_cdf_num_partitions", col("_commit_timestamp") ) ), ) # used to delete old data on CDF table (don't remove parquet). if spec.args.get("clean_cdf", True): cls._logger.info("Cleaning CDF table...") cls.delete_old_data(spec) # used to delete old parquet files. if spec.args.get("vacuum_cdf", False): cls._logger.info("Vacuuming CDF table...") cls.vacuum_cdf_data(spec) @staticmethod def _write_cdf_to_external( spec: TerminatorSpec, df: DataFrame, data: OrderedDict = None ) -> None: """Write cdf results dataframe. Args: spec: terminator specification. df: dataframe with cdf results to write. data: list of all dfs generated on previous steps before writer. """ WriterFactory.get_writer( spec=OutputSpec( spec_id="materialized_cdf", input_id="input_table", location=spec.args["materialized_cdf_location"], write_type=WriteType.APPEND.value, data_format=spec.args.get("data_format", OutputFormat.DELTAFILES.value), options=spec.args["materialized_cdf_options"], partitions=["_commit_timestamp"], ), df=df, data=data, ).write() @staticmethod def _get_table_cdf_input_specs(spec: TerminatorSpec) -> InputSpec: """Get the input specifications from a terminator spec. Args: spec: terminator specifications. Returns: List of input specifications. 
""" options = { "readChangeFeed": "true", **spec.args.get("db_table_options", {}), } input_specs = InputSpec( spec_id="input_table", db_table=spec.args["db_table"], read_type=ReadType.STREAMING.value, data_format=OutputFormat.DELTAFILES.value, options=options, ) return input_specs @classmethod def delete_old_data(cls, spec: TerminatorSpec) -> None: """Delete old data from cdf delta table. Args: spec: terminator specifications. """ today_datetime = datetime.today() limit_date = today_datetime + timedelta( days=spec.args.get("days_to_keep", 30) * -1 ) limit_timestamp = limit_date.strftime("%Y%m%d%H%M%S") cdf_delta_table = DeltaTable.forPath( ExecEnv.SESSION, spec.args["materialized_cdf_location"] ) cdf_delta_table.delete(col("_commit_timestamp") < limit_timestamp) @classmethod def vacuum_cdf_data(cls, spec: TerminatorSpec) -> None: """Vacuum old data from cdf delta table. Args: spec: terminator specifications. """ cdf_delta_table = DeltaTable.forPath( ExecEnv.SESSION, spec.args["materialized_cdf_location"] ) cdf_delta_table.vacuum(spec.args.get("vacuum_hours", 168)) ================================================ FILE: lakehouse_engine/terminators/dataset_optimizer.py ================================================ """Module with dataset optimizer terminator.""" from typing import List, Optional from pyspark.sql.utils import AnalysisException, ParseException from lakehouse_engine.core.table_manager import TableManager from lakehouse_engine.transformers.exceptions import WrongArgumentsException from lakehouse_engine.utils.logging_handler import LoggingHandler class DatasetOptimizer(object): """Class with dataset optimizer terminator.""" _logger = LoggingHandler(__name__).get_logger() @classmethod def optimize_dataset( cls, db_table: Optional[str] = None, location: Optional[str] = None, compute_table_stats: bool = True, vacuum: bool = True, vacuum_hours: int = 720, optimize: bool = True, optimize_where: Optional[str] = None, optimize_zorder_col_list: 
Optional[List[str]] = None, debug: bool = False, ) -> None: """Optimize a dataset based on a set of pre-conceived optimizations. Most of the time the dataset is a table, but it can be a file-based one only. Args: db_table: `database_name.table_name`. location: dataset/table filesystem location. compute_table_stats: to compute table statistics or not. vacuum: (delta lake tables only) whether to vacuum the delta lake table or not. vacuum_hours: (delta lake tables only) number of hours to consider in vacuum operation. optimize: (delta lake tables only) whether to optimize the table or not. Custom optimize parameters can be supplied through ExecEnv (Spark) configs optimize_where: expression to use in the optimize function. optimize_zorder_col_list: (delta lake tables only) list of columns to consider in the zorder optimization process. Custom optimize parameters can be supplied through ExecEnv (Spark) configs. debug: flag indicating if we are just debugging this for local tests and therefore pass through all the exceptions to perform some assertions in local tests. """ if optimize: if debug: try: cls._optimize( db_table, location, optimize_where, optimize_zorder_col_list ) except ParseException: pass else: cls._optimize( db_table, location, optimize_where, optimize_zorder_col_list ) if vacuum: cls._vacuum(db_table, location, vacuum_hours) if compute_table_stats: if debug: try: cls._compute_table_stats(db_table) except AnalysisException: pass else: cls._compute_table_stats(db_table) @classmethod def _compute_table_stats(cls, db_table: str) -> None: """Compute table statistics. Args: db_table: `.
` string. """ if not db_table: raise WrongArgumentsException("A table needs to be provided.") config = {"function": "compute_table_statistics", "table_or_view": db_table} cls._logger.info(f"Computing table statistics for {db_table}...") TableManager(config).compute_table_statistics() @classmethod def _vacuum(cls, db_table: str, location: str, hours: int) -> None: """Vacuum a delta table. Args: db_table: `.
` string. Takes precedence over location. location: location of the delta table. hours: number of hours to consider in vacuum operation. """ if not db_table and not location: raise WrongArgumentsException("A table or location need to be provided.") table_or_location = db_table if db_table else f"delta.`{location}`" config = { "function": "compute_table_statistics", "table_or_view": table_or_location, "vacuum_hours": hours, } cls._logger.info(f"Vacuuming table {table_or_location}...") TableManager(config).vacuum() @classmethod def _optimize( cls, db_table: str, location: str, where: str, zorder_cols: List[str] ) -> None: """Optimize a delta table. Args: db_table: `.
` string. Takes precedence over location. location: location of the delta table. where: expression to use in the optimize function. zorder_cols: list of columns to consider in the zorder optimization process. """ if not db_table and not location: raise WrongArgumentsException("A table or location needs to be provided.") table_or_location = db_table if db_table else f"delta.`{location}`" config = { "function": "compute_table_statistics", "table_or_view": table_or_location, "optimize_where": where, "optimize_zorder_col_list": ",".join(zorder_cols if zorder_cols else []), } cls._logger.info(f"Optimizing table {table_or_location}...") TableManager(config).optimize() ================================================ FILE: lakehouse_engine/terminators/notifier.py ================================================ """Module with notification terminator.""" from abc import ABC, abstractmethod from jinja2 import Template from lakehouse_engine.core.definitions import ( NotificationRuntimeParameters, TerminatorSpec, ) from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.terminators.notifiers.notification_templates import ( NotificationsTemplates, ) from lakehouse_engine.utils.databricks_utils import DatabricksUtils from lakehouse_engine.utils.logging_handler import LoggingHandler class Notifier(ABC): """Abstract Notification class.""" _logger = LoggingHandler(__name__).get_logger() def __init__(self, notification_spec: TerminatorSpec): """Construct Notification instances. Args: notification_spec: notification specification. """ self.type = notification_spec.args.get("type") self.notification = notification_spec.args @abstractmethod def create_notification(self) -> None: """Abstract create notification method.""" raise NotImplementedError @abstractmethod def send_notification(self) -> None: """Abstract send notification method.""" raise NotImplementedError def _render_notification_field(self, template_field: str) -> str: """Render the notification given args. 
Args: template_field: Message with templates to be replaced. Returns: Rendered field """ args = {} field_template = Template(template_field) if ( NotificationRuntimeParameters.DATABRICKS_JOB_NAME.value in template_field or NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID.value in template_field or NotificationRuntimeParameters.JOB_EXCEPTION.value in template_field ): workspace_id, job_name = DatabricksUtils.get_databricks_job_information( ExecEnv.SESSION ) args["databricks_job_name"] = job_name args["databricks_workspace_id"] = workspace_id args["exception"] = self.notification.get("exception") return field_template.render(args) @staticmethod def check_if_notification_is_failure_notification( spec: TerminatorSpec, ) -> bool: """Check if given notification is a failure notification. Args: spec: spec to validate if it is a failure notification. Returns: A boolean telling if the notification is a failure notification """ notification = spec.args is_notification_failure_notification: bool = False if "template" in notification.keys(): template: dict = NotificationsTemplates.EMAIL_NOTIFICATIONS_TEMPLATES.get( notification["template"], {} ) if template: is_notification_failure_notification = notification.get( "on_failure", True ) else: raise ValueError(f"""Template {notification["template"]} not found.""") else: is_notification_failure_notification = notification.get("on_failure", True) return is_notification_failure_notification ================================================ FILE: lakehouse_engine/terminators/notifier_factory.py ================================================ """Module for notifier factory.""" from lakehouse_engine.core.definitions import NotifierType, TerminatorSpec from lakehouse_engine.terminators.notifier import Notifier from lakehouse_engine.terminators.notifiers.email_notifier import EmailNotifier from lakehouse_engine.terminators.notifiers.exceptions import NotifierNotFoundException class NotifierFactory(object): """Class for notification 
factory.""" NOTIFIER_TYPES = {NotifierType.EMAIL.value: EmailNotifier} @classmethod def get_notifier(cls, spec: TerminatorSpec) -> Notifier: """Get a notifier according to the terminator specs using a factory. Args: spec: terminator specification. Returns: Notifier: notifier that will handle notifications. """ notifier_name = spec.args.get("type") notifier = cls.NOTIFIER_TYPES.get(notifier_name) if notifier: return notifier(notification_spec=spec) else: raise NotifierNotFoundException( f"The requested notification format {notifier_name} is not supported." ) @staticmethod def generate_failure_notification(spec: list, exception: Exception) -> None: """Check if it is necessary to send a failure notification and generate it. Args: spec: List of termination specs exception: Exception that caused the failure. """ notification_specs = [] for terminator in spec: if terminator.function == "notify": notification_specs.append(terminator) for notification in notification_specs: failure_notification_spec = notification.args generate_failure_notification = failure_notification_spec.get( "generate_failure_notification", False ) if generate_failure_notification or ( Notifier.check_if_notification_is_failure_notification(notification) ): failure_notification_spec["exception"] = str(exception) if generate_failure_notification: failure_notification_spec["template"] = ( f"""failure_notification_{failure_notification_spec["type"]}""" ) failure_spec = TerminatorSpec( function="notification", args=failure_notification_spec ) notifier = NotifierFactory.get_notifier(failure_spec) notifier.create_notification() notifier.send_notification() ================================================ FILE: lakehouse_engine/terminators/notifiers/__init__.py ================================================ """Notifications module.""" ================================================ FILE: lakehouse_engine/terminators/notifiers/email_notifier.py ================================================ """Module 
with email notifier.""" import asyncio import smtplib from email.mime.application import MIMEApplication from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText from posixpath import basename from typing import Any from lakehouse_engine.core.definitions import TerminatorSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.terminators.notifier import Notifier from lakehouse_engine.terminators.notifiers.exceptions import ( NotifierConfigException, NotifierTemplateNotFoundException, ) from lakehouse_engine.terminators.notifiers.notification_templates import ( NotificationsTemplates, ) from lakehouse_engine.utils.logging_handler import LoggingHandler class EmailNotifier(Notifier): """Base Notification class.""" _logger = LoggingHandler(__name__).get_logger() def __init__(self, notification_spec: TerminatorSpec): """Construct Email Notification instance. Args: notification_spec: notification specification. """ super().__init__(notification_spec) def create_notification(self) -> None: """Creates the notification to be sent.""" if "template" in self.notification.keys(): template: dict = NotificationsTemplates.EMAIL_NOTIFICATIONS_TEMPLATES.get( self.notification["template"], {} ) if template: self.notification["message"] = self._render_notification_field( template["message"] ) self.notification["subject"] = self._render_notification_field( template["subject"] ) self.notification["mimetype"] = template["mimetype"] else: raise NotifierTemplateNotFoundException( f"""Template {self.notification["template"]} does not exist""" ) elif "message" in self.notification.keys(): self.notification["message"] = self._render_notification_field( self.notification["message"] ) self.notification["subject"] = self._render_notification_field( self.notification["subject"] ) else: raise NotifierConfigException("Malformed Notification Definition") def send_notification(self) -> None: """Sends the notification by using a series of methods.""" 
self._validate_email_notification() server = self.notification["server"] notification_office_email_servers = ["smtp.office365.com"] if ( ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers is not None and server in ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers ): raise NotifierConfigException( f"Trying to use disallowed smtp server: '{server}'.\n" f"Disallowed smtp servers: " f"{str(ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers)}" ) elif server in notification_office_email_servers: self._authenticate_and_send_office365() else: self._authenticate_and_send_simple_smtp() def _authenticate_and_send_office365(self) -> None: """Authenticates and sends an email notification using Graph API.""" from azure.identity.aio import ClientSecretCredential from msgraph import GraphServiceClient self._logger.info("Attempting authentication using Graph API.") request_body = self._create_graph_api_email_body() self._logger.info(f"Sending notification email with body: {request_body}") credential = ClientSecretCredential( tenant_id=self.notification["tenant_id"], client_id=self.notification["user"], client_secret=self.notification["password"], ) client = GraphServiceClient(credentials=credential) import nest_asyncio nest_asyncio.apply() asyncio.get_event_loop().run_until_complete( client.users.by_user_id(self.notification["from"]).send_mail.post( body=request_body ) ) self._logger.info("Notification email sent successfully.") def _authenticate_and_send_simple_smtp(self) -> None: """Authenticates and sends an email notification using simple authentication.""" with smtplib.SMTP( self.notification["server"], self.notification["port"] ) as smtp: try: smtp.starttls() smtp.login( self.notification.get("user", ""), self.notification.get("password", ""), ) except smtplib.SMTPException as e: self._logger.exception( f"Exception while authenticating to smtp: {str(e)}" ) self._logger.exception( "Attempting to send the notification without authentication" ) mesg = MIMEMultipart() 
mesg["From"] = self.notification["from"] to = self.notification.get("to", []) cc = self.notification.get("cc", []) bcc = self.notification.get("bcc", []) mesg["To"] = ", ".join(to) mesg["CC"] = ", ".join(cc) mesg["BCC"] = ", ".join(bcc) mesg["Subject"] = self.notification["subject"] mesg["Importance"] = self._get_importance( self.notification.get("importance", "normal") ) match self.notification.get("mimetype", "plain"): case "html" | "text/html": mimetype = "html" case "text" | "text/plain" | "plain" | "text/text": mimetype = "text" case _: self._logger.warning( f"""Unknown mimetype '{self.notification["mimetype"]}' """ f"provided. Defaulting to 'plain'." ) mimetype = "text" body = MIMEText(self.notification["message"], mimetype) mesg.attach(body) for f in self.notification.get("attachments", []): with open(f, "rb") as fil: part = MIMEApplication(fil.read(), Name=basename(f)) part["Content-Disposition"] = 'attachment; filename="%s"' % basename(f) mesg.attach(part) try: smtp.sendmail( self.notification["from"], to + cc + bcc, mesg.as_string() ) self._logger.info("Email sent successfully.") except smtplib.SMTPException as e: self._logger.exception(f"Exception while sending email: {str(e)}") def _validate_email_notification(self) -> None: """Validates the email notification.""" if not self.notification.get("from"): raise NotifierConfigException( "Email notification must contain 'from' field." ) if not self.notification.get("server"): raise NotifierConfigException( "Email notification must contain 'server' field." ) if not self.notification.get("port"): raise NotifierConfigException( "Email notification must contain 'port' field." ) if ( not self.notification.get("to") and not self.notification.get("cc") and not self.notification.get("bcc") ): raise NotifierConfigException( "No recipients provided. Please provide at least one recipient." ) def _get_importance(self, importance: str) -> Any: """Get the importance of the email notification. 
def _create_graph_api_email_body(self) -> Any:
    """Build the send-mail request for the Graph API from the notification.

    The subject, message, mimetype, attachments, recipients (to, cc and bcc)
    and importance are read from the notification dict. Attachment files are
    read in binary mode and embedded in the request.

    Returns:
        Email body for the Graph API.
    """
    from msgraph.generated.models.body_type import BodyType
    from msgraph.generated.models.file_attachment import FileAttachment
    from msgraph.generated.models.item_body import ItemBody
    from msgraph.generated.models.message import Message
    from msgraph.generated.users.item.send_mail.send_mail_post_request_body import (
        SendMailPostRequestBody,
    )

    # Resolve the body content type; unknown mimetypes fall back to text.
    requested_mimetype = self.notification.get("mimetype", "plain")
    if requested_mimetype in ("html", "text/html"):
        content_type = BodyType.Html
    elif requested_mimetype in ("text", "text/plain", "plain", "text/text"):
        content_type = BodyType.Text
    else:
        self._logger.warning(
            f"""Unknown mimetype '{self.notification["mimetype"]}' """
            f"provided. Defaulting to 'text'."
        )
        content_type = BodyType.Text

    email_body = ItemBody()
    email_body.content = self.notification["message"]
    email_body.content_type = content_type

    email = Message()
    email.subject = self.notification["subject"]
    email.body = email_body

    # Embed every attachment, named after the last segment of its path.
    file_attachments = []
    for path in self.notification.get("attachments", []):
        with open(path, "rb") as handle:
            payload = handle.read()

        file_attachment = FileAttachment()
        file_attachment.name = path.split("/")[-1]
        file_attachment.size = len(payload)
        file_attachment.content_bytes = payload
        file_attachments.append(file_attachment)
    email.attachments = file_attachments  # type: ignore

    email.to_recipients = self._set_graph_api_recipients("to")
    email.cc_recipients = self._set_graph_api_recipients("cc")
    email.bcc_recipients = self._set_graph_api_recipients("bcc")
    email.importance = self._get_importance(
        self.notification.get("importance", "normal")
    )

    send_mail_request = SendMailPostRequestBody()
    send_mail_request.message = email
    send_mail_request.save_to_sent_items = False

    return send_mail_request
class NotifierTemplateNotFoundException(Exception):
    """Exception for when the notifier template is not found."""
class SensorTerminator(object):
    """Terminator used to update the status of a sensor."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def update_sensor_status(
        cls,
        sensor_id: str,
        control_db_table_name: str,
        status: str = SensorStatus.PROCESSED_NEW_DATA.value,
        assets: List[str] = None,
    ) -> None:
        """Update internal sensor status.

        Writes the given status for the sensor into the control table. It is
        meant to tell the system that the sensor has processed all the new
        data that was previously identified. Usually used to move from
        `SensorStatus.ACQUIRED_NEW_DATA` to `SensorStatus.PROCESSED_NEW_DATA`,
        but any value of `SensorStatus` is accepted.

        Args:
            sensor_id: sensor id.
            control_db_table_name: `db.table` to store sensor checkpoints.
            status: status of the sensor.
            assets: a list of assets that are considered as available to
                consume downstream after this sensor has status
                PROCESSED_NEW_DATA.

        Raises:
            NotImplementedError: if the status is not a `SensorStatus` value.
        """
        accepted_statuses = [member.value for member in SensorStatus]
        if status not in accepted_statuses:
            raise NotImplementedError(f"Status {status} not accepted in sensor.")

        ExecEnv.get_or_create(app_name="update_sensor_status")

        # Only the identification fields are needed to update the status.
        spec = SensorSpec(
            sensor_id=sensor_id,
            control_db_table_name=control_db_table_name,
            assets=assets,
            input_spec=None,
            preprocess_query=None,
            checkpoint_location=None,
        )
        SensorControlTableManager.update_sensor_status(
            sensor_spec=spec,
            status=status,
        )
class Aggregators(object):
    """Class containing all aggregation functions."""

    _logger = LoggingHandler(__name__).get_logger()

    @staticmethod
    def get_max_value(input_col: str, output_col: str = "latest") -> Callable:
        """Compute the maximum value of one column of a dataframe.

        The resulting dataframe only contains the aggregated column.

        Args:
            input_col: name of the input column.
            output_col: name of the output column (defaults to "latest").

        Returns:
            A function to be executed in the .transform() spark function.

        {{get_example(method_name='get_max_value')}}
        """

        def _max_of_column(df: DataFrame) -> DataFrame:
            only_input_col = df.select(col(input_col))
            max_expression = max(input_col).alias(output_col)
            return only_input_col.agg(max_expression)

        return _max_of_column
@classmethod
def with_literals(
    cls,
    literals: Dict[str, Any],
) -> Callable:
    """Add one constant column per entry of the provided map.

    Args:
        literals: map of column names and literal values (constants).

    Returns:
        A function to be executed in the .transform() spark function.

    {{get_example(method_name='with_literals')}}
    """

    def _add_literal_columns(df: DataFrame) -> DataFrame:
        result = df
        for column_name, constant in literals.items():
            literal_column = lit(constant)
            result = result.withColumn(column_name, literal_column)
        return result

    return _add_literal_columns
@classmethod
def cast(cls, cols: Dict[str, str]) -> Callable:
    """Cast the given columns into the requested spark types.

    Args:
        cols: dict with columns and respective target types.
            Target types need to have the exact name of spark types:
            https://spark.apache.org/docs/latest/sql-ref-datatypes.html

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='cast')}}
    """

    def _apply_casts(df: DataFrame) -> DataFrame:
        result = df
        for column_name, type_name in cols.items():
            source_column = col(column_name)
            # The type name is resolved to the class with that name in
            # pyspark.sql.types and instantiated without arguments.
            target_type = getattr(spark_types, type_name)()
            result = result.withColumn(column_name, source_column.cast(target_type))
        return result

    return _apply_casts
@classmethod
def explode_columns(
    cls,
    explode_arrays: bool = False,
    array_cols_to_explode: List[str] = None,
    explode_maps: bool = False,
    map_cols_to_explode: List[str] = None,
) -> Callable:
    """Explode columns with types like ArrayType and MapType.

    Arrays are exploded first and maps afterwards. The flatten_schema
    transformation can be applied after this one, for example to expand an
    exploded map or a StructType that was inside an array.

    We recommend you to always specify the columns to explode instead of
    exploding all the columns.

    Args:
        explode_arrays: whether you want to explode array columns (True)
            or not (False). Default: False.
        array_cols_to_explode: array columns which you want to explode.
            If you don't specify it will get all array columns and explode them.
            Passing a list enables the array explosion even when
            explode_arrays is False. Default: None.
        explode_maps: whether you want to explode map columns (True)
            or not (False). Default: False.
        map_cols_to_explode: map columns which you want to explode.
            If you don't specify it will get all map columns and explode them.
            Passing a list enables the map explosion even when
            explode_maps is False. Default: None.

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='explode_columns')}}
    """

    def _explode(df: DataFrame) -> DataFrame:
        arrays_requested = explode_arrays or array_cols_to_explode is not None
        maps_requested = explode_maps or map_cols_to_explode is not None

        exploded_df = df
        if arrays_requested:
            exploded_df = cls._explode_arrays(exploded_df, array_cols_to_explode)
        if maps_requested:
            exploded_df = cls._explode_maps(exploded_df, map_cols_to_explode)
        return exploded_df

    return _explode
@classmethod
def rename(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:
    """Rename specific columns into the designated name.

    Args:
        cols: dict with columns and respective target names.
        escape_col_names: whether to escape column names (e.g. `/BIC/COL1`) or not.
            If True it creates a column with the new name and drop the old one.
            If False, uses the native withColumnRenamed Spark function.
            Default: True.

    Returns:
        Function to be called in .transform() spark function.

    {{get_example(method_name='rename')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        renamed_df = df
        for old_name, new_name in cols.items():
            if old_name == new_name:
                # Nothing to rename. In escape mode, creating the column and
                # then dropping the "old" one would remove the column itself.
                continue

            if escape_col_names:
                renamed_df = renamed_df.withColumn(new_name, col(old_name)).drop(
                    old_name
                )
            else:
                # Chain on the accumulated dataframe (not on the input one),
                # otherwise only the last rename of the dict would be kept.
                renamed_df = renamed_df.withColumnRenamed(old_name, new_name)
        return renamed_df

    return inner
@classmethod
def from_avro_with_registry(
    cls,
    schema_registry: str,
    value_schema: str,
    value_col: str = "value",
    key_schema: str = None,
    key_col: str = "key",
    expand_key: bool = False,
    expand_value: bool = True,
    options: dict = None,
) -> Callable:
    """Decode avro columns using the schemas of a schema registry.

    The value column is always decoded. The key column is only decoded when
    a key schema is provided, otherwise it is selected as it is.

    Args:
        schema_registry: the url to the schema registry.
        value_schema: the name of the value schema entry in the schema registry.
        value_col: the name of the value column.
        key_schema: the name of the key schema entry in the schema
            registry. Default: None.
        key_col: the name of the key column.
        expand_key: whether you want to expand the content inside the key
            column or not. Default: false.
        expand_value: whether you want to expand the content inside the value
            column or not. Default: true.
        options: extra options (e.g., mode: "PERMISSIVE").

    Returns:
        Function to be called in .transform() spark function.

    {{get_example(method_name='from_avro_with_registry')}}
    """

    def _decode(df: DataFrame) -> DataFrame:
        # Columns that are neither the key nor the value are kept untouched.
        other_cols = [
            name for name in df.columns if name not in [key_col, value_col]
        ]

        if key_schema:
            key_selection = from_avro(
                data=col(key_col),
                subject=key_schema,
                schemaRegistryAddress=schema_registry,  # type: ignore
                options=options if options else None,
            ).alias(key_col)
        else:
            key_selection = key_col

        value_selection = from_avro(
            data=col(value_col),
            subject=value_schema,
            schemaRegistryAddress=schema_registry,  # type: ignore
            options=options if options else None,
        ).alias(value_col)

        decoded_df = df.select(  # type: ignore
            *other_cols,
            key_selection,
            value_selection,
        )

        key_output = f"{key_col}.*" if expand_key else key_col
        value_output = f"{value_col}.*" if expand_value else value_col
        return decoded_df.select(*other_cols, key_output, value_output)

    return _decode
@classmethod
def to_json(
    cls, in_cols: List[str], out_col: str, json_options: Optional[dict] = None
) -> Callable:
    """Convert dataframe columns into a json value.

    The input columns are packed into a struct, which is then serialized
    into the output column.

    Args:
        in_cols: list with the name(s) of the input column(s).
            Example values:
            ["*"] - all columns; ["my_col"] - one column named "my_col";
            ["my_col1", "my_col2"] - two columns.
        out_col: name of the output column.
        json_options: options to parse the json value.

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='to_json')}}
    """

    def _with_json_column(df: DataFrame) -> DataFrame:
        packed_cols = struct(*in_cols)
        json_value = to_json(packed_cols, json_options if json_options else None)
        return df.withColumn(out_col, json_value)

    return _with_json_column
@classmethod
def condense_record_mode_cdc(
    cls,
    business_key: List[str],
    record_mode_col: str,
    valid_record_modes: List[str],
    ranking_key_desc: Optional[List[str]] = None,
    ranking_key_asc: Optional[List[str]] = None,
) -> Callable:
    """Condense Change Data Capture (CDC) based on record_mode strategy.

    This CDC data is particularly seen in some CDC enabled systems. Other systems
    may have different CDC strategies.

    Args:
        business_key: The business key (logical primary key) of the data.
        ranking_key_desc: In this type of CDC condensation the data needs to be
            in descending order in a certain way, using columns specified in this
            parameter.
        ranking_key_asc: In this type of CDC condensation the data needs to be
            in ascending order in a certain way, using columns specified in
            this parameter.
        record_mode_col: Name of the record mode input_col.
        valid_record_modes: Depending on the context, not all record modes may be
            considered for condensation. Use this parameter to skip those.

    Returns:
        A function to be executed in the .transform() spark function.

    Raises:
        WrongArgumentsException: if neither ranking_key_desc nor ranking_key_asc
            is provided.

    {{get_example(method_name='condense_record_mode_cdc')}}
    """
    if not ranking_key_desc and not ranking_key_asc:
        # Each literal ends with a space so that the implicitly concatenated
        # message does not glue words together.
        raise WrongArgumentsException(
            "The condense_record_mode_cdc transformer requires data to be either "
            "in descending or ascending order, but no arguments for ordering "
            "were provided."
        )

    def inner(df: DataFrame) -> DataFrame:
        if df.isStreaming:
            raise UnsupportedStreamingTransformerException(
                "Transformer condense_record_mode_cdc is not supported in "
                "streaming mode."
            )

        # Descending keys take precedence over the ascending ones.
        ordering = [
            col(c).desc() for c in (ranking_key_desc if ranking_key_desc else [])
        ] + [col(c).asc() for c in (ranking_key_asc if ranking_key_asc else [])]
        partition_window = Window.partitionBy(
            [col(c) for c in business_key]
        ).orderBy(
            ordering  # type: ignore
        )

        # The ranking is computed over all the records of a business key and
        # the record mode filter is applied afterwards, so a key whose first
        # ranked record has a record mode that is not valid produces no row.
        return (
            df.withColumn("ranking", row_number().over(partition_window))
            .filter(
                col(record_mode_col).isNull()
                | col(record_mode_col).isin(valid_record_modes)
            )
            .filter(col("ranking") == 1)
            .drop("ranking")
        )

    return inner
class CustomTransformers(object):
    """Class representing a CustomTransformers."""

    @staticmethod
    def custom_transformation(custom_transformer: Callable) -> Callable:
        """Execute a custom transformation provided by the user.

        This transformer can be very useful whenever the user cannot use our
        provided transformers, or they want to write complex logic in the
        transform step of the algorithm.

        .. warning:: Attention!
            Please bear in mind that the custom_transformer function provided
            as argument needs to receive a DataFrame and return a DataFrame,
            because it is how Spark's .transform method is able to chain the
            transformations.

        Example:
        ```python
        def my_custom_logic(df: DataFrame) -> DataFrame:
        ```

        Args:
            custom_transformer: custom transformer function. A python function
                with all required pyspark logic provided by the user.

        Returns:
            Callable: the same function provided as parameter, in order to be
                called later in the TransformerFactory.

        {{get_example(method_name='custom_transformation')}}
        """
        # The user function already has the shape expected by .transform().
        return custom_transformer

    @staticmethod
    def sql_transformation(sql: str) -> Callable:
        """Execute a SQL transformation provided by the user.

        This transformer can be very useful whenever the user wants to perform
        SQL-based transformations that are not natively supported by the
        lakehouse engine transformers.

        Args:
            sql: the SQL query to be executed. This can read from any table or
                view from the catalog, or any dataframe registered as a temp
                view.

        Returns:
            Callable: A function to be called in .transform() spark function.

        {{get_example(method_name='sql_transformation')}}
        """

        def _run_query(df: DataFrame) -> DataFrame:
            # The incoming dataframe is only used to reach its spark session.
            session = df.sparkSession
            return session.sql(sql)

        return _run_query
@classmethod
def column_dropper(cls, cols: List[str]) -> Callable:
    """Drop the given columns from the dataframe.

    Args:
        cols: list of column names to drop.

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='column_dropper')}}
    """

    def _drop_columns(df: DataFrame) -> DataFrame:
        remaining_df = df
        for column_name in cols:
            remaining_df = remaining_df.drop(column_name)
        return remaining_df

    return _drop_columns
Args: cols: list of names of the string columns to convert. source_format: dates source format (e.g., YYYY-MM-dd). [Check here]( https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html). Returns: A function to be executed in the .transform() spark function. {{get_example(method_name='convert_to_date')}} """ def inner(df: DataFrame) -> DataFrame: converted_df = df for c in cols: converted_df = converted_df.withColumn( c, to_date(col(c), source_format) ) return converted_df return inner @staticmethod def convert_to_timestamp( cols: List[str], source_format: Optional[str] = None ) -> Callable: """Convert multiple string columns with a source format into timestamps. Args: cols: list of names of the string columns to convert. source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS). [Check here]( https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html). Returns: A function to be executed in the .transform() spark function. {{get_example(method_name='convert_to_timestamp')}} """ def inner(df: DataFrame) -> DataFrame: converted_df = df for c in cols: converted_df = converted_df.withColumn( c, to_timestamp(col(c), source_format) ) return converted_df return inner @staticmethod def format_date(cols: List[str], target_format: Optional[str] = None) -> Callable: """Convert multiple date/timestamp columns into strings with the target format. Args: cols: list of names of the string columns to convert. target_format: strings target format (e.g., YYYY-MM-dd). [Check here]( https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html). Returns: A function to be executed in the .transform() spark function. 
{{get_example(method_name='format_date')}} """ def inner(df: DataFrame) -> DataFrame: converted_df = df for c in cols: converted_df = converted_df.withColumn( c, date_format(col(c), target_format) ) return converted_df return inner @staticmethod def get_date_hierarchy(cols: List[str], formats: Optional[dict] = None) -> Callable: """Create day/month/week/quarter/year hierarchy for the provided date columns. Uses Spark's extract function. Args: cols: list of names of the date columns to create the hierarchy. formats: dict with the correspondence between the hierarchy and the format to apply. [Check here]( https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html). Example: { "year": "year", "month": "month", "day": "day", "week": "week", "quarter": "quarter" } Returns: A function to be executed in the .transform() spark function. {{get_example(method_name='get_date_hierarchy')}} """ if not formats: formats = { "year": "year", "month": "month", "day": "day", "week": "week", "quarter": "quarter", } def inner(df: DataFrame) -> DataFrame: transformer_df = df for c in cols: transformer_df = transformer_df.selectExpr( "*", f"extract({formats['day']} from {c}) as {c}_day", f"extract({formats['month']} from {c}) as {c}_month", f"extract({formats['week']} from {c}) as {c}_week", f"extract({formats['quarter']} from {c}) as {c}_quarter", f"extract({formats['year']} from {c}) as {c}_year", ) return transformer_df return inner ================================================ FILE: lakehouse_engine/transformers/exceptions.py ================================================ """Module for all the transformers exceptions.""" class WrongArgumentsException(Exception): """Exception for when a user provides wrong arguments to a transformer.""" pass class UnsupportedStreamingTransformerException(Exception): """Exception for when a user requests a transformer not supported in streaming.""" pass ================================================ FILE: 
@classmethod
def incremental_filter(
    cls,
    input_col: str,
    increment_value: Optional[Any] = None,
    increment_df: Optional[DataFrame] = None,
    increment_col: str = "latest",
    greater_or_equal: bool = False,
) -> Callable:
    """Incrementally Filter a certain dataframe given an increment logic.

    This logic can either be an increment value or an increment dataframe from
    which to get the latest value from. By default, the operator for the
    filtering process is strictly greater than, which is appropriate when you
    trust the source will never output more data with the increment value after
    you have loaded the data. You can change greater_or_equal to true to use
    greater or equal, to cover cases where late arriving data, sharing the
    increment value of a previous load, was not covered by that load.

    Args:
        input_col: input column name
        increment_value: value to which to filter the data, considering the
            provided input_col.
        increment_df: a dataframe to get the increment value from.
            You either specify this or the increment_value (this takes precedence).
            This is a good approach to get the latest value from a given dataframe
            that was read and apply that value as filter here. In this way you can
            perform incremental loads based on the last value of a given dataframe
            (e.g., table or file based). Can be used together with the
            get_max_value transformer to accomplish these incremental based loads.
            See our append load feature tests to see how to provide an acon for
            incremental loads, taking advantage of the scenario explained here.
        increment_col: name of the column from which to get the increment
            value from (when using increment_df approach). Only the first row of
            the increment_df is read, reason why it is a good idea to use it
            together with the get_max_value transformer. Defaults to "latest"
            because that's the default output column name provided by the
            get_max_value transformer.
        greater_or_equal: if filtering should be done by also including the
            increment value or not (useful for scenarios where you are performing
            increment loads but still want to include data considering the
            increment value, and not only values greater than that increment...
            examples may include scenarios where you already loaded data including
            those values, but the source produced more data containing those
            values). Defaults to false.

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='incremental_filter')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        # Explicit None check: the intent is "was a dataframe provided", which
        # should not depend on how a DataFrame object evaluates as a boolean.
        if increment_df is not None:
            # The increment is read from the first row of the increment_df
            # (collected to the driver when the transformation is applied).
            threshold = increment_df.collect()[0][increment_col]
        else:
            threshold = increment_value

        if greater_or_equal:
            return df.filter(col(input_col) >= threshold)  # type: ignore
        return df.filter(col(input_col) > threshold)  # type: ignore

    return inner
Args: exp: column filter expressions. Returns: A function to be called in .transform() spark function. {{get_example(method_name='column_filter_exp')}} """ def inner(df: DataFrame) -> DataFrame: return df.selectExpr(*exp) # type: ignore return inner @staticmethod def drop_duplicate_rows( cols: List[str] = None, watermarker: dict = None ) -> Callable: """Drop duplicate rows using spark function dropDuplicates(). This transformer can be used with or without arguments. The provided argument needs to be a list of columns. For example: [“Name”,”VAT”] will drop duplicate records within "Name" and "VAT" columns. If the transformer is used without providing any columns list or providing an empty list, such as [] the result will be the same as using the distinct() pyspark function. If the watermark dict is present it will ensure that the drop operation will apply to rows within the watermark timeline window. Args: cols: column names. watermarker: properties to apply watermarker to the transformer. Returns: A function to be called in .transform() spark function. 
class Joiners(object):
    """Class containing join transformers."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def join(
        cls,
        join_with: DataFrame,
        join_condition: str,
        left_df_alias: str = "a",
        right_df_alias: str = "b",
        join_type: str = "inner",
        broadcast_join: bool = True,
        select_cols: Optional[List[str]] = None,
        watermarker: Optional[dict] = None,
    ) -> Callable:
        """Join two dataframes based on specified type and columns.

        Both dataframes are registered as temporary views and joined through a
        Spark SQL query, so join_condition and select_cols can reference the
        dataframes through their aliases (e.g., "a.id = b.id").

        Some stream to stream joins are only possible if you apply Watermark, so
        this method also provides a parameter to enable watermarking specification.

        Args:
            left_df_alias: alias of the first dataframe.
            join_with: right dataframe.
            right_df_alias: alias of the second dataframe.
            join_condition: condition to join dataframes.
            join_type: type of join. Defaults to inner.
                Available values: inner, cross, outer, full, full outer,
                left, left outer, right, right outer, semi,
                left semi, anti, and left anti.
            broadcast_join: whether to perform a broadcast join or not. The
                broadcast hint is applied to the right dataframe.
            select_cols: list of columns to select at the end. If not provided
                (or empty), all the columns of both dataframes are selected.
            watermarker: properties to apply watermarking. Dict keyed by the
                dataframe alias (left_df_alias and/or right_df_alias), in which
                each value is a dict with the keys "col" and "watermarking_time".

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='join')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            # The goal here is to avoid problems on
            # simultaneously running process,
            # so an id is added as a prefix for the alias.
            app_id = str(uuid.uuid4())
            left = f"`{app_id}_{left_df_alias}`"
            right = f"`{app_id}_{right_df_alias}`"
            df_join_with = join_with
            if watermarker:
                left_df_watermarking = watermarker.get(left_df_alias, None)
                right_df_watermarking = watermarker.get(right_df_alias, None)
                if left_df_watermarking:
                    df = Watermarker.with_watermark(
                        left_df_watermarking["col"],
                        left_df_watermarking["watermarking_time"],
                    )(df)
                if right_df_watermarking:
                    df_join_with = Watermarker.with_watermark(
                        right_df_watermarking["col"],
                        right_df_watermarking["watermarking_time"],
                    )(df_join_with)

            l_prefix = SparkUtils.create_temp_view(df, left, return_prefix=True)
            r_prefix = SparkUtils.create_temp_view(
                df_join_with, right, return_prefix=True
            )

            broadcast_hint = (
                f"/*+ BROADCAST({right_df_alias}) */" if broadcast_join else ""
            )
            # select_cols is optional (defaults to None), so fall back to
            # selecting everything instead of failing on ", ".join(None).
            projection = ", ".join(select_cols) if select_cols else "*"

            query = f"""
                SELECT {broadcast_hint}
                {projection}
                FROM {l_prefix}{left} AS {left_df_alias}
                {join_type.upper()}
                JOIN {r_prefix}{right} AS {right_df_alias}
                ON {join_condition}
                """  # nosec: B608

            cls._logger.info(f"Execution query: {query}")

            return ExecEnv.SESSION.sql(query)

        return inner
class Optimizers(object):
    """Class containing all the functions that can provide optimizations."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def cache(cls) -> Callable:
        """Caches the current dataframe.

        The default storage level used is MEMORY_AND_DISK.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='cache')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            return df.cache()

        return inner

    @classmethod
    def persist(cls, storage_level: str = None) -> Callable:
        """Caches the current dataframe with a specific StorageLevel.

        Args:
            storage_level: the type of StorageLevel, as default MEMORY_AND_DISK_DESER.
                The default is also used when the provided name does not match
                any StorageLevel.
                [More options here](
                https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.StorageLevel.html).

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='persist')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            default_level = StorageLevel.MEMORY_AND_DISK_DESER
            # getattr() requires a string attribute name, so the documented
            # default (storage_level=None) must not reach it.
            level = (
                getattr(StorageLevel, storage_level, default_level)
                if storage_level
                else default_level
            )

            return df.persist(level)

        return inner

    @classmethod
    def unpersist(cls, blocking: bool = False) -> Callable:
        """Removes the dataframe from the disk and memory.

        Args:
            blocking: whether to block until all the data blocks are
                removed from disk/memory or run asynchronously.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='unpersist')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            return df.unpersist(blocking)

        return inner
class Repartitioners(object):
    """Class containing repartitioners transformers."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def coalesce(cls, num_partitions: int) -> Callable:
        """Coalesce a dataframe into n partitions.

        Args:
            num_partitions: num of partitions to coalesce.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='coalesce')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            return df.coalesce(num_partitions)

        return inner

    @classmethod
    def repartition(
        cls, num_partitions: Optional[int] = None, cols: Optional[List[str]] = None
    ) -> Callable:
        """Repartition a dataframe into n partitions.

        If only num_partitions is provided, repartitioning happens based on the
        provided number. If only cols is provided, it happens based on the values
        of the provided cols (columns). If both are provided, the data is
        partitioned by the cols into num_partitions partitions.

        Args:
            num_partitions: num of partitions to repartition.
            cols: list of columns to use for repartitioning.

        Returns:
            A function to be called in .transform() spark function.

        Raises:
            WrongArgumentsException: when the returned function is applied and
                neither num_partitions nor cols were specified.

        {{get_example(method_name='repartition')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            if cols and num_partitions:
                return df.repartition(num_partitions, *cols)
            if cols:
                # Do not forward num_partitions=None, as DataFrame.repartition
                # only accepts an int or a column as its first argument.
                return df.repartition(*cols)
            if num_partitions:
                return df.repartition(num_partitions)

            raise WrongArgumentsException(
                "num_partitions or cols should be specified"
            )

        return inner
@staticmethod
def get_transformer(spec: TransformerSpec, data: OrderedDict = None) -> Callable:
    """Get a transformer following the factory pattern.

    Transformers that work with more than one dataframe (incremental_filter,
    join, union and union_by_name) receive in their arguments the ids of the
    other dataframes, which are replaced here by the dataframes found in data.
    That replacement is done on a copy of the arguments, so the spec itself is
    not modified.

    Args:
        spec: transformer specification (individual transformation... not to be
            confused with list of all transformations).
        data: ordered dict of dataframes to be transformed. Needed when a
            transformer requires more than one dataframe as input.

    Returns:
        Transformer function to be executed in .transform() spark function.

    Raises:
        NotImplementedError: if the requested function is not one of the
            available transformers.

    {{get_example(method_name='get_transformer')}}
    """
    function_name = spec.function

    if function_name == "incremental_filter":
        # incremental_filter optionally expects a DataFrame as input, so find it.
        transformer_args = TransformerFactory._get_spec_args_copy(spec.args)
        if "increment_df" in transformer_args:
            transformer_args["increment_df"] = data[transformer_args["increment_df"]]
    elif function_name == "join":
        # get the dataframe given the input_id in the input specs of the acon.
        transformer_args = TransformerFactory._get_spec_args_copy(spec.args)
        transformer_args["join_with"] = data[transformer_args["join_with"]]
    elif function_name in ("union", "union_by_name"):
        # get the list of dataframes given the input_id in the input specs
        # of the acon.
        transformer_args = TransformerFactory._get_spec_args_copy(spec.args)
        transformer_args["union_with"] = [
            data[union_with_spec_id] for union_with_spec_id in spec.args["union_with"]
        ]
    elif function_name in TransformerFactory.AVAILABLE_TRANSFORMERS:
        # No dataframe needs to be resolved, so the arguments are used as they are.
        transformer_args = spec.args
    else:
        raise NotImplementedError(
            f"The requested transformer {spec.function} is not implemented."
        )

    transformer_builder = TransformerFactory.AVAILABLE_TRANSFORMERS[function_name]
    return transformer_builder(**transformer_args)  # type: ignore
class Unions(object):
    """Class containing union transformers."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def union(
        cls,
        union_with: List[DataFrame],
        deduplication: bool = True,
    ) -> Callable:
        """Union dataframes, resolving columns by position (not by name).

        The dataframe being transformed comes first, followed by the dataframes
        in union_with, in the order they were provided.

        Args:
            union_with: list of dataframes to union.
            deduplication: whether to perform deduplication of elements or not.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='union')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            def _union_by_position(left: DataFrame, right: DataFrame) -> DataFrame:
                return left.union(right)

            all_dfs = [df] + union_with
            united_df = reduce(_union_by_position, all_dfs)

            if deduplication:
                return united_df.distinct()
            return united_df

        return inner

    @classmethod
    def union_by_name(
        cls,
        union_with: List[DataFrame],
        deduplication: bool = True,
        allow_missing_columns: bool = True,
    ) -> Callable:
        """Union dataframes, resolving columns by name (not by position).

        The dataframe being transformed comes first, followed by the dataframes
        in union_with, in the order they were provided.

        Args:
            union_with: list of dataframes to union.
            deduplication: whether to perform deduplication of elements or not.
            allow_missing_columns: allow the union of DataFrames with different
                schemas.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='union_by_name')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            def _union_by_name(left: DataFrame, right: DataFrame) -> DataFrame:
                return left.unionByName(
                    right, allowMissingColumns=allow_missing_columns
                )

            all_dfs = [df] + union_with
            united_df = reduce(_union_by_name, all_dfs)

            if deduplication:
                return united_df.distinct()
            return united_df

        return inner
def validate_manager_list(acon: dict) -> list:
    """Validate every operation listed in a manager acon.

    All the operations are validated before failing, so that the raised error
    reports the problems found in the whole list at once.

    Args:
        acon: Acon to be validated. Its operations are read from the
            "operations" key.

    Returns:
        The list of operations found in the acon.

    Raises:
        RuntimeError: if the acon has no operations or if any of the operations
            is not valid.
    """
    operations: list[dict] = acon.get("operations", [])
    if not operations:
        raise RuntimeError("No operations found in the acon.")

    found_errors: list[str] = []
    for single_operation in operations:
        # validate_managers appends to the list instead of raising.
        validate_managers(single_operation, found_errors)

    if not found_errors:
        return operations

    # The leading empty item makes each error start on its own line.
    formatted_errors = "\n".join(["", *found_errors])
    raise RuntimeError(f"Errors found during validation:{formatted_errors}")
def validate_managers(acon: dict, error_list: list = None) -> None:
    """Function to validate the managers in the acon.

    Nothing is validated when the acon has no "manager" key. Otherwise, the
    manager type, the requested function and the parameters of that function
    (mandatory ones and types) are checked.

    Args:
        acon: Acon to be validated.
        error_list: List to collect errors. When provided, the errors found are
            appended to it and nothing is raised, so that the caller can
            aggregate the errors of several acons.

    Raises:
        RuntimeError: if errors are found and no error_list was provided.
    """
    manager_type = acon.get("manager")
    temp_error_list = []

    if not manager_type:
        return

    function_name = acon.get("function")
    if not function_name:
        error = "Missing 'function' parameter for manager"
        temp_error_list.append(error)

    # Stays None for an unsupported manager type, in which case there are no
    # known operations against which the function could be validated.
    operations_dict = None
    if manager_type == "file":
        operations_dict = FILE_MANAGER_OPERATIONS
    elif manager_type == "table":
        operations_dict = TABLE_MANAGER_OPERATIONS
    else:
        error = f"Manager type not supported: {manager_type}"
        temp_error_list.append(error)

    if operations_dict is not None:
        if function_name not in operations_dict:
            error = (
                f"Function '{function_name}' not supported for {manager_type} manager"
            )
            temp_error_list.append(error)
        else:
            expected_params = operations_dict[function_name]

            missing_mandatory = validate_mandatory_parameters(acon, expected_params)
            if missing_mandatory:
                error = (
                    f"Missing mandatory parameters for {manager_type} "
                    f"manager function {function_name}: {missing_mandatory}"
                )
                temp_error_list.append(error)

            type_errors = validate_parameter_types(acon, expected_params)
            if type_errors:
                error = (
                    f"Type validation errors for {manager_type} "
                    f"manager function {function_name}: {type_errors}"
                )
                temp_error_list.append(error)

    if error_list is not None:
        error_list.extend(temp_error_list)
    elif temp_error_list:
        error_list_str = "\n".join(temp_error_list)
        raise RuntimeError(error_list_str)
def resolve_dq_functions(acon: dict, execution_point: str) -> dict:
    """Resolve the prisma dq specs of the acon.

    Either the "dq_spec" (single spec) or the "dq_specs" (list of specs) key of
    the acon is processed, with "dq_spec" taking precedence when it is filled.
    Specs having the prisma dq_type are replaced by the result of
    PrismaUtils.build_prisma_dq_spec, while the other specs are kept as they are.
    The provided acon is updated in place.

    Args:
        acon: Acon to resolve the dq functions.
        execution_point: Execution point of the dq_functions.

    Returns:
        Acon after resolving the dq functions.
    """
    single_dq_spec = acon.get("dq_spec")
    if single_dq_spec:
        if single_dq_spec.get("dq_type") == DQType.PRISMA.value:
            acon["dq_spec"] = PrismaUtils.build_prisma_dq_spec(
                spec=single_dq_spec, execution_point=execution_point
            )
        return acon

    dq_spec_list = acon.get("dq_specs")
    if dq_spec_list:
        acon["dq_specs"] = [
            (
                PrismaUtils.build_prisma_dq_spec(
                    spec=dq_spec, execution_point=execution_point
                )
                if dq_spec.get("dq_type") == DQType.PRISMA.value
                else dq_spec
            )
            for dq_spec in dq_spec_list
        ]

    return acon
@classmethod
def get_engine_version(cls) -> str:
    """Get the version of the installed Lakehouse Engine package.

    Returns:
        The version reported by the package metadata for "lakehouse-engine",
        or an empty string (after logging an info message) when that package
        cannot be found.
    """
    try:
        installed_version = version("lakehouse-engine")
    except PackageNotFoundError:
        cls._LOGGER.info("Could not identify Lakehouse Engine version.")
        return ""

    return str(installed_version)
""" if isinstance(dict_to_replace, list): return [cls.remove_sensitive_info(k) for k in dict_to_replace] elif isinstance(dict_to_replace, dict): return { k: "******" if k in cls.SENSITIVE_INFO else cls.remove_sensitive_info(v) for k, v in dict_to_replace.items() } else: return dict_to_replace ================================================ FILE: lakehouse_engine/utils/databricks_utils.py ================================================ """Utilities for databricks operations.""" import ast import json import os import re from typing import Any, Tuple from pyspark.sql import SparkSession from lakehouse_engine.core.definitions import EngineStats from lakehouse_engine.utils.logging_handler import LoggingHandler class DatabricksUtils(object): """Databricks utilities class.""" _LOGGER = LoggingHandler(__name__).get_logger() @staticmethod def is_serverless_workload() -> bool: """Check if the current databricks workload is serverless. Returns: True if the current databricks workload is serverless, False otherwise. """ if os.getenv("IS_SERVERLESS", "false").lower() == "true": return True else: return False @staticmethod def get_db_utils(spark: SparkSession) -> Any: """Get db utils on databricks. Args: spark: spark session. Returns: Dbutils from databricks. """ try: from pyspark.dbutils import DBUtils if "dbutils" not in locals(): dbutils = DBUtils(spark) else: dbutils = locals().get("dbutils") except ImportError: import IPython dbutils = IPython.get_ipython().user_ns["dbutils"] return dbutils @staticmethod def get_databricks_job_information(spark: SparkSession) -> Tuple[str, str]: """Get notebook context from running acon. Args: spark: spark session. Returns: Dict containing databricks notebook context. 
""" dbutils = DatabricksUtils.get_db_utils(spark) notebook_context = json.loads( ( dbutils.notebook.entry_point.getDbutils() .notebook() .getContext() .safeToJson() ) ) return notebook_context["attributes"].get("orgId"), notebook_context[ "attributes" ].get("jobName") @staticmethod def _get_dp_name(job_name: str) -> str: """Extract the dp_name from a Databricks job name. The job name is expected to have a suffix separated by '-', and the dp_name is the part before the last '-'. Only '_' is used in the rest of the job name. E.g. 'sadp-template-my_awesome_job' Args: job_name: The Databricks job name string. Returns: The extracted dp_name. """ return job_name.rsplit("-", 1)[0] if job_name and "-" in job_name else job_name @staticmethod def get_spark_conf_values(usage_stats: dict, spark_confs: dict) -> None: """Get information from spark session configurations. Args: usage_stats: usage_stats dictionary file. spark_confs: optional dictionary with the spark tags to be used when collecting the engine usage. 
""" from lakehouse_engine.core.exec_env import ExecEnv spark_confs = ( EngineStats.DEF_SPARK_CONFS if spark_confs is None else EngineStats.DEF_SPARK_CONFS | spark_confs ) for spark_conf_key, spark_conf_value in spark_confs.items(): # whenever the spark_conf_value has #, it means it is an array, so we need # to split it and adequately process it if "#" in spark_conf_value: array_key = spark_conf_value.split("#") array_values = ast.literal_eval( ExecEnv.SESSION.conf.get(array_key[0], "[]") ) final_value = [ key_val["value"] for key_val in array_values if key_val["key"] == array_key[1] ] usage_stats[spark_conf_key] = ( final_value[0] if len(final_value) > 0 else "" ) else: usage_stats[spark_conf_key] = ExecEnv.SESSION.conf.get( spark_conf_value, "" ) run_id_extracted = re.search("run-([1-9]\\w+)", usage_stats.get("run_id", "")) usage_stats["run_id"] = run_id_extracted.group(1) if run_id_extracted else "" @classmethod def get_usage_context_for_serverless(cls, usage_stats: dict) -> None: """Get information from the execution environment for serverless scenarios. Since in serverless environments we might not have access to all the spark confs we want to collect, we will try to get that information from the execution environment when possible. Args: usage_stats: usage_stats dictionary file. 
""" try: from dbruntime.databricks_repl_context import get_context from lakehouse_engine.core.exec_env import ExecEnv context = get_context() for key, attr in EngineStats.DEF_DATABRICKS_CONTEXT_KEYS.items(): if key == "dp_name": usage_stats[key] = DatabricksUtils._get_dp_name( getattr(context, attr, None) ) elif key == "environment": usage_stats[key] = ExecEnv.get_environment() else: usage_stats[key] = getattr(context, attr, None) except Exception as ex: cls._LOGGER.error(f"Error getting Serverless Usage Context: {ex}") ================================================ FILE: lakehouse_engine/utils/dq_utils.py ================================================ """Module containing utils for DQ processing.""" from json import loads from pyspark.sql.functions import col, from_json, schema_of_json, struct from lakehouse_engine.core.definitions import DQSpec, DQTableBaseParameters, DQType from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.dq_processors.exceptions import DQSpecMalformedException from lakehouse_engine.utils.logging_handler import LoggingHandler _LOGGER = LoggingHandler(__name__).get_logger() class DQUtils: """Utils related to the data quality process.""" @staticmethod def import_dq_rules_from_table( spec: dict, execution_point: str, base_expectation_arguments: list, extra_meta_arguments: list, ) -> dict: """Import dq rules from a table. Args: spec: data quality specification. execution_point: if the execution is in_motion or at_rest. base_expectation_arguments: base arguments for dq functions. extra_meta_arguments: extra meta arguments for dq functions. Returns: The dictionary containing the dq spec with dq functions defined. """ dq_db_table = spec["dq_db_table"] dq_functions = [] if spec.get("dq_table_table_filter"): dq_table_table_filter = spec["dq_table_table_filter"] else: raise DQSpecMalformedException( "When importing rules from a table " "dq_table_table_filter must be defined." 
@staticmethod
def validate_dq_functions(
    spec: dict, execution_point: str = "", extra_meta_arguments: list = None
) -> None:
    """Function to validate the dq functions defined in the dq_spec.

    Every dq function must carry an ``args.meta`` mapping holding all the
    fields listed in extra_meta_arguments. When execution_point is given, the
    ``execution_point`` stored in that meta must be equal to it. Nothing is
    validated when extra_meta_arguments is empty or None.

    Args:
        spec: data quality specification.
        execution_point: if the execution is in_motion or at_rest.
        extra_meta_arguments: extra meta arguments for dq functions.

    Raises:
        DQSpecMalformedException: If the dq spec is malformed.
    """
    dq_functions = spec["dq_functions"]

    if not extra_meta_arguments:
        _LOGGER.info(
            "No extra meta parameters defined. "
            "Skipping validation of imported dq rule."
        )
        return

    for dq_function in dq_functions:
        # A missing (or None) "args" entry is reported as a malformed spec
        # instead of failing with an AttributeError on None.
        meta = (dq_function.get("args") or {}).get("meta")
        if not meta:
            raise DQSpecMalformedException(
                "The dq function must have a meta field containing all "
                f"the fields defined: {extra_meta_arguments}."
            )

        given_keys = meta.keys()
        missing_keys = sorted(set(extra_meta_arguments) - set(given_keys))
        if missing_keys:
            raise DQSpecMalformedException(
                "The dq function meta field must contain all the "
                f"fields defined: {extra_meta_arguments}.\n"
                f"Found fields: {list(given_keys)}.\n"
                f"Diff: {list(missing_keys)}"
            )

        # .get(): a meta without execution_point counts as a mismatch rather
        # than raising a KeyError.
        if execution_point and meta.get("execution_point") != execution_point:
            raise DQSpecMalformedException(
                "The dq function execution point must be the same as "
                "the execution point of the dq spec."
            )
@staticmethod
def validate_rule_id_duplication(
    specs: list[DQSpec],
) -> dict[str, str]:
    """Verify uniqueness of the dq_rule_id.

    Only specs of the PRISMA dq type are inspected. When the spec points to a
    dq_db_table, that table is queried for dq_rule_id values occurring more
    than once; otherwise the dq_rule_id of each inline dq function is
    compared.

    Args:
        specs: a list of DQSpec to be validated

    Returns:
        A dictionary keyed by "dq_spec_id: <spec_id>". For table based specs
        the value holds the duplicated rows; for inline dq functions it holds
        all the rule ids of the spec in which a duplicate was found.
    """
    error_dict = {}

    for spec in specs:
        if spec.dq_type != DQType.PRISMA.value:
            continue

        error_key = f"dq_spec_id: {spec.spec_id}"
        dq_db_table = spec.dq_db_table

        if dq_db_table:
            dq_rule_id_query = f"""
                SELECT dq_rule_id, COUNT(*) AS count
                FROM {dq_db_table}
                GROUP BY dq_rule_id
                HAVING COUNT(*) > 1;
                """  # nosec: B608

            duplicate_rule_id_table = ExecEnv.SESSION.sql(dq_rule_id_query)
            if duplicate_rule_id_table.isEmpty():
                continue

            duplicated_rows = duplicate_rule_id_table.collect()
            error_dict[error_key] = "; ".join(str(row) for row in duplicated_rows)
        elif spec.dq_functions:
            rule_ids = [
                dq_function.args["meta"]["dq_rule_id"]
                for dq_function in spec.dq_functions
            ]
            if len(set(rule_ids)) != len(rule_ids):
                error_dict[error_key] = "; ".join(
                    str(rule_id) for rule_id in rule_ids
                )

    return error_dict
These statistics include the acon and other relevant information, such as the lakehouse engine version and the functions/algorithms being used. Args: acon: acon dictionary file. func_name: function name that called this log acon. collect_engine_usage: Lakehouse usage statistics collection strategy. spark_confs: optional dictionary with the spark confs to be used when collecting the engine usage. """ if not cls._should_collect_usage(collect_engine_usage): return try: start_timestamp = datetime.now() timestamp_str = start_timestamp.strftime("%Y%m%d%H%M%S") usage_stats = cls._prepare_usage_stats(acon, spark_confs) engine_usage_path = cls._select_usage_path( usage_stats, collect_engine_usage ) if engine_usage_path is None: return cls._add_metadata_to_stats(usage_stats, func_name, start_timestamp) log_file_name = f"eng_usage_{func_name}_{timestamp_str}.json" usage_stats_str = json.dumps(usage_stats, default=str) url = urlparse( f"{engine_usage_path}/{usage_stats['dp_name']}/" f"{start_timestamp.year}/{start_timestamp.month}/" f"{log_file_name}", allow_fragments=False, ) try: FileStorageFunctions.write_payload( engine_usage_path, url, usage_stats_str ) cls._LOGGER.info("Storing Lakehouse Engine usage statistics") except FileNotFoundError as e: cls._LOGGER.error(f"Could not write engine stats into file: {e}.") except Exception as e: cls._LOGGER.error( "Failed while collecting the lakehouse engine stats: " f"Unexpected {e=}, {type(e)=}." 
@classmethod
def _should_collect_usage(cls, collect_engine_usage: str) -> bool:
    """Decide whether the engine usage statistics should be collected.

    Args:
        collect_engine_usage: usage statistics collection strategy requested
            by the caller.

    Returns:
        True when the requested strategy is ENABLED or PROD_ONLY, or when the
        engine configuration has the collection set to ENABLED.
    """
    requested_strategies = (
        CollectEngineUsage.ENABLED.value,
        CollectEngineUsage.PROD_ONLY.value,
    )
    if collect_engine_usage in requested_strategies:
        return True

    # Equality on purpose. The enum value is compared with plain strings
    # above, so `x in ENABLED.value` was a substring test: an empty or partial
    # configured value counted as enabled and a None value raised TypeError.
    configured_strategy = ExecEnv.ENGINE_CONFIG.collect_engine_usage
    return configured_strategy == CollectEngineUsage.ENABLED.value
def _get_example_unexpected_index_list(expectation_configuration: Any) -> list:
    """Retrieve the unexpected index list defined in the example used on the test.

    GE only exposes either the complete output of a test or the complete
    configuration used on it, so the matching example is looked up manually
    on the expectation itself: the last test whose input result_format equals
    the configured result_format is the one used.

    Args:
        expectation_configuration: Expectation configuration.

    Returns:
        List of unexpected indexes defined in the example used, or an empty
        list when no test matches or the match defines none.
    """
    selected_test: dict = {"out": {"unexpected_index_list": []}}

    every_test = (
        test  # type: ignore
        for example in expectation_configuration.examples
        for test in example["tests"]
    )
    for candidate in every_test:
        configured_format = expectation_configuration.result_format
        wanted_format = (
            configured_format if "result_format" in configured_format else []
        )
        if candidate["in"]["result_format"] == wanted_format:
            selected_test = candidate

    return selected_test["out"].get("unexpected_index_list", [])
from dataclasses import field  # required by the extraction_timestamp default


@dataclass
class JDBCExtraction(object):
    """Configurations available for an Extraction from a JDBC source.

    These configurations cover:

    - user: username to connect to JDBC source.
    - password: password to connect to JDBC source (always use secrets,
        don't use text passwords in your code).
    - url: url to connect to JDBC source.
    - dbtable: `database.table` to extract data from.
    - calc_upper_bound_schema: custom schema used for the upper bound calculation.
    - changelog_table: table of type changelog from which to extract data,
        when the extraction type is delta.
    - partition_column: column used to split the extraction.
    - latest_timestamp_data_location: data location (e.g., s3) containing the data
        to get the latest timestamp already loaded into bronze.
    - latest_timestamp_data_format: the format of the dataset in
        latest_timestamp_data_location. Default: delta.
    - extraction_type: type of extraction (delta or init). Default: "delta".
    - driver: JDBC driver name. Default: "com.sap.db.jdbc.Driver".
    - num_partitions: number of Spark partitions to split the extraction.
    - lower_bound: lower bound to decide the partition stride.
    - upper_bound: upper bound to decide the partition stride. If
        calculate_upper_bound is True, then upperBound will be
        derived by our upper bound optimizer, using the partition column.
    - default_upper_bound: the value to use as default upper bound in case
        the result of the upper bound calculation is None. Default: "1".
    - fetch_size: how many rows to fetch per round trip. Default: "100000".
    - compress: enable network compression. Default: True.
    - custom_schema: specify custom_schema for particular columns of the
        returned dataframe in the init/delta extraction of the source table.
    - min_timestamp: min timestamp to consider to filter the changelog data.
        Default: None and automatically derived from the location provided.
        In case this one is provided it has precedence and the calculation
        is not done.
    - max_timestamp: max timestamp to consider to filter the changelog data.
        Default: None and automatically derived from the table having information
        about the extraction requests, their timestamps and their status.
        In case this one is provided it has precedence and the calculation
        is not done.
    - generate_predicates: whether to generate predicates automatically or not.
        Default: False.
    - predicates: list containing all values to partition (if generate_predicates
        is used, the manual values provided are ignored). Default: None.
    - predicates_add_null: whether to consider null on predicates list.
        Default: True.
    - extraction_timestamp: the timestamp of the extraction. Default: current
        UTC time, taken when the instance is created, following the format
        "%Y%m%d%H%M%S".
    - max_timestamp_custom_schema: custom schema used on the max_timestamp
        derivation from the table holding the extraction requests information.
    """

    user: str
    password: str
    url: str
    dbtable: str
    calc_upper_bound_schema: Optional[str] = None
    changelog_table: Optional[str] = None
    partition_column: Optional[str] = None
    latest_timestamp_data_location: Optional[str] = None
    latest_timestamp_data_format: str = InputFormat.DELTAFILES.value
    extraction_type: str = JDBCExtractionType.DELTA.value
    driver: str = "com.sap.db.jdbc.Driver"
    num_partitions: Optional[int] = None
    lower_bound: Optional[int | float | str] = None
    upper_bound: Optional[int | float | str] = None
    default_upper_bound: str = "1"
    fetch_size: str = "100000"
    compress: bool = True
    custom_schema: Optional[str] = None
    min_timestamp: Optional[str] = None
    max_timestamp: Optional[str] = None
    generate_predicates: bool = False
    predicates: Optional[List] = None
    predicates_add_null: bool = True
    # A plain `= datetime.now(...)` default is evaluated only once, when the
    # class is defined, so every instance would share the import-time value.
    # The factory computes the timestamp for each new instance instead.
    extraction_timestamp: str = field(
        default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
    )
    max_timestamp_custom_schema: Optional[str] = None
def get_predicates(self, predicates_query: str) -> List:
    """Get the predicates list, based on a predicates query.

    The query is read through the JDBC reader and the first column of every
    returned row becomes one equality predicate on the partition column. An
    extra "IS NULL" predicate is appended when predicates_add_null is set.

    Args:
        predicates_query: query to use as the basis to get the distinct values for
            a specified column, based on which predicates are generated.

    Returns:
        List containing the predicates to use to split the extraction from
        JDBC sources.
    """
    extraction = self._JDBC_EXTRACTION
    connection_properties = {
        "user": extraction.user,
        "password": extraction.password,
        "driver": extraction.driver,
    }
    jdbc_args = {
        "url": extraction.url,
        "table": predicates_query,
        "properties": connection_properties,
    }

    from lakehouse_engine.io.reader_factory import ReaderFactory

    predicates_spec = InputSpec(
        spec_id="get_predicates",
        data_format=InputFormat.JDBC.value,
        read_type=ReadType.BATCH.value,
        jdbc_args=jdbc_args,
    )
    predicates_df = ReaderFactory.get_data(predicates_spec)

    predicates_list = []
    for row in predicates_df.collect():
        predicates_list.append(f"{extraction.partition_column}='{row[0]}'")

    if extraction.predicates_add_null:
        predicates_list.append(f"{extraction.partition_column} IS NULL")

    self._LOGGER.info(
        f"The following predicate list was generated: {predicates_list}"
    )
    return predicates_list
""" options_args: Dict[str, Any] = { "fetchSize": self._JDBC_EXTRACTION.fetch_size, "compress": self._JDBC_EXTRACTION.compress, } jdbc_args = { "url": self._JDBC_EXTRACTION.url, "properties": { "user": self._JDBC_EXTRACTION.user, "password": self._JDBC_EXTRACTION.password, "driver": self._JDBC_EXTRACTION.driver, }, } if self._JDBC_EXTRACTION.extraction_type == JDBCExtractionType.DELTA.value: jdbc_args["table"], predicates_query = self._get_delta_query() else: jdbc_args["table"], predicates_query = self._get_init_query() if self._JDBC_EXTRACTION.custom_schema: options_args["customSchema"] = self._JDBC_EXTRACTION.custom_schema if self._JDBC_EXTRACTION.generate_predicates: jdbc_args["predicates"] = self.get_predicates(predicates_query) else: if self._JDBC_EXTRACTION.predicates: jdbc_args["predicates"] = self._JDBC_EXTRACTION.predicates else: options_args = self._get_extraction_partition_opts( options_args, ) return options_args, jdbc_args def get_spark_jdbc_optimal_upper_bound(self) -> Any: """Get an optimal upperBound to properly split a Spark JDBC extraction. Returns: Either an int, date or timestamp to serve as upperBound Spark JDBC option. 
""" options = {} if self._JDBC_EXTRACTION.calc_upper_bound_schema: options["customSchema"] = self._JDBC_EXTRACTION.calc_upper_bound_schema table = ( self._JDBC_EXTRACTION.dbtable if self._JDBC_EXTRACTION.extraction_type == JDBCExtractionType.INIT.value else self._JDBC_EXTRACTION.changelog_table ) jdbc_args = { "url": self._JDBC_EXTRACTION.url, "table": f"(SELECT COALESCE(MAX({self._JDBC_EXTRACTION.partition_column}), " f"{self._JDBC_EXTRACTION.default_upper_bound}) " f"upper_bound FROM {table})", # nosec: B608 "properties": { "user": self._JDBC_EXTRACTION.user, "password": self._JDBC_EXTRACTION.password, "driver": self._JDBC_EXTRACTION.driver, }, } from lakehouse_engine.io.reader_factory import ReaderFactory upper_bound_df = ReaderFactory.get_data( InputSpec( spec_id="get_optimal_upper_bound", data_format=InputFormat.JDBC.value, read_type=ReadType.BATCH.value, jdbc_args=jdbc_args, options=options, ) ) upper_bound = upper_bound_df.first()[0] if upper_bound is not None: self._LOGGER.info( f"Upper Bound '{upper_bound}' derived from " f"'{self._JDBC_EXTRACTION.dbtable}' using the column " f"'{self._JDBC_EXTRACTION.partition_column}'" ) return upper_bound else: raise AttributeError( f"Not able to calculate upper bound from " f"'{self._JDBC_EXTRACTION.dbtable}' using " f"the column '{self._JDBC_EXTRACTION.partition_column}'" ) def _get_extraction_partition_opts( self, options_args: dict, ) -> dict: """Get an options dict with custom extraction partition options. Args: options_args: spark jdbc reader options. 
""" if self._JDBC_EXTRACTION.num_partitions: options_args["numPartitions"] = self._JDBC_EXTRACTION.num_partitions if self._JDBC_EXTRACTION.upper_bound: options_args["upperBound"] = self._JDBC_EXTRACTION.upper_bound if self._JDBC_EXTRACTION.lower_bound: options_args["lowerBound"] = self._JDBC_EXTRACTION.lower_bound if self._JDBC_EXTRACTION.partition_column: options_args["partitionColumn"] = self._JDBC_EXTRACTION.partition_column return options_args def _get_max_timestamp(self, max_timestamp_query: str) -> str: """Get the max timestamp, based on the provided query. Args: max_timestamp_query: the query used to derive the max timestamp. Returns: A string having the max timestamp. """ jdbc_args = { "url": self._JDBC_EXTRACTION.url, "table": max_timestamp_query, "properties": { "user": self._JDBC_EXTRACTION.user, "password": self._JDBC_EXTRACTION.password, "driver": self._JDBC_EXTRACTION.driver, }, } from lakehouse_engine.io.reader_factory import ReaderFactory max_timestamp_df = ReaderFactory.get_data( InputSpec( spec_id="get_max_timestamp", data_format=InputFormat.JDBC.value, read_type=ReadType.BATCH.value, jdbc_args=jdbc_args, options={ "customSchema": self._JDBC_EXTRACTION.max_timestamp_custom_schema }, ) ) max_timestamp = max_timestamp_df.first()[0] self._LOGGER.info( f"Max timestamp {max_timestamp} derived from query: {max_timestamp_query}" ) return str(max_timestamp) @abstractmethod def _get_delta_query(self) -> Tuple[str, str]: """Get a query to extract delta (partially) from a source.""" pass @abstractmethod def _get_init_query(self) -> Tuple[str, str]: """Get a query to extract init (fully) from a source.""" pass ================================================ FILE: lakehouse_engine/utils/extraction/sap_b4_extraction_utils.py ================================================ """Utilities module for SAP B4 extraction processes.""" import re from dataclasses import dataclass from enum import Enum from logging import Logger from typing import Any, Optional, Tuple 
class ADSOTypes(Enum):
    """Enumerate the ADSO types handled by the SAP B4 extractions.

    AQ and CL are the two ADSO type identifiers. SUPPORTED_TYPES is also an
    enum member; its value is the list of the supported identifiers.
    """

    # ADSO type identifier "AQ".
    AQ = "AQ"
    # ADSO type identifier "CL".
    CL = "CL"
    # Inside the class body AQ and CL are still plain strings, so the value
    # of this member is the list ["AQ", "CL"].
    SUPPORTED_TYPES = [AQ, CL]
- max_timestamp_custom_schema: the custom schema to apply on the calculation of the max timestamp to consider for the delta extractions. Default: timestamp DECIMAL(23,0). - default_max_timestamp: the timestamp to use as default, when it is not possible to derive one. - default_min_timestamp: the timestamp to use as default, when it is not possible to derive one. - custom_schema: specify custom_schema for particular columns of the returned dataframe in the init/delta extraction of the source table. """ latest_timestamp_input_col: str = "REQTSN" request_status_tbl: str = "SAPHANADB.RSPMREQUEST" request_col_name: str = "REQUEST_TSN" data_target: Optional[str] = None act_req_join_condition: Optional[str] = None include_changelog_tech_cols: Optional[bool] = None extra_cols_req_status_tbl: Optional[str] = None request_status_tbl_filter: Optional[str] = None adso_type: Optional[str] = None max_timestamp_custom_schema: str = "timestamp DECIMAL(23,0)" default_max_timestamp: str = "1970000000000000000000" default_min_timestamp: str = "1970000000000000000000" custom_schema: str = "REQTSN DECIMAL(23,0)" class SAPB4ExtractionUtils(JDBCExtractionUtils): """Utils for managing data extraction from SAP B4.""" def __init__(self, sap_b4_extraction: SAPB4Extraction): """Construct SAPB4ExtractionUtils. Args: sap_b4_extraction: SAP B4 Extraction configurations. """ self._LOGGER: Logger = LoggingHandler(__name__).get_logger() self._B4_EXTRACTION = sap_b4_extraction self._B4_EXTRACTION.request_status_tbl_filter = ( self._get_req_status_tbl_filter() ) self._MAX_TIMESTAMP_QUERY = f""" --# nosec (SELECT COALESCE(MAX({self._B4_EXTRACTION.request_col_name}), {self._B4_EXTRACTION.default_max_timestamp}) as timestamp FROM {self._B4_EXTRACTION.request_status_tbl} WHERE {self._B4_EXTRACTION.request_status_tbl_filter}) """ # nosec: B608 super().__init__(sap_b4_extraction) @staticmethod def get_data_target(input_spec_opt: dict) -> str: """Get the data_target from the data_target option or derive it. 
By definition data_target is the same for the table and changelog table and is the same string ignoring everything before / and the first and last character after /. E.g. for a dbtable /BIC/abtable12, the data_target would be btable1. Args: input_spec_opt: options from the input_spec. Returns: A string with the data_target. """ exclude_chars = """["'\\\\]""" data_target: str = input_spec_opt.get( "data_target", re.sub(exclude_chars, "", input_spec_opt["dbtable"]).split("/")[-1][1:-1], ) return data_target def _get_init_query(self) -> Tuple[str, str]: """Get a query to do an init load based on a ADSO on a SAP B4 system. Returns: A query to submit to SAP B4 for the initial data extraction. The query is enclosed in parentheses so that Spark treats it as a table and supports it in the dbtable option. """ extraction_query = self._get_init_extraction_query() predicates_query = f""" (SELECT DISTINCT({self._B4_EXTRACTION.partition_column}) FROM {self._B4_EXTRACTION.dbtable} t) """ # nosec: B608 return extraction_query, predicates_query def _get_init_extraction_query(self) -> str: """Get the init extraction query based on current timestamp. Returns: A query to submit to SAP B4 for the initial data extraction. """ changelog_tech_cols = ( f"""{self._B4_EXTRACTION.extraction_timestamp}000000000 AS reqtsn, '0' AS datapakid, 0 AS record,""" if self._B4_EXTRACTION.include_changelog_tech_cols else "" ) extraction_query = f""" (SELECT t.*, {changelog_tech_cols} CAST({self._B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp FROM {self._B4_EXTRACTION.dbtable} t )""" # nosec: B608 return extraction_query def _get_delta_query(self) -> Tuple[str, str]: """Get a delta query for an SAP B4 ADSO. An SAP B4 ADSO requires a join with a special type of table often called requests status table (RSPMREQUEST), in which B4 tracks down the timestamps, status and metrics associated with the several data loads that were performed into B4. 
Depending on the type of ADSO (AQ or CL) the join condition and also the ADSO/table to consider to extract from will be different. For AQ types, there is only the active table, from which we extract both inits and deltas and this is also the table used to join with RSPMREQUEST to derive the next portion of the data to extract. For the CL types, we have an active table/adso from which we extract the init and one changelog table from which we extract the delta portions of data. Depending, if it is an init or delta one table or the other is also used to join with RSPMREQUEST. The logic on this function basically ensures that we are reading from the source table considering the data that has arrived between the maximum timestamp that is available in our target destination and the max timestamp of the extractions performed and registered in the RSPMREQUEST table, which follow the filtering criteria. Returns: A query to submit to SAP B4 for the delta data extraction. The query is enclosed in parentheses so that Spark treats it as a table and supports it in the dbtable option. 
""" if not self._B4_EXTRACTION.min_timestamp: from lakehouse_engine.io.reader_factory import ReaderFactory latest_timestamp_data_df = ReaderFactory.get_data( InputSpec( spec_id="data_with_latest_timestamp", data_format=self._B4_EXTRACTION.latest_timestamp_data_format, read_type=ReadType.BATCH.value, location=self._B4_EXTRACTION.latest_timestamp_data_location, ) ) min_timestamp = latest_timestamp_data_df.transform( Aggregators.get_max_value( self._B4_EXTRACTION.latest_timestamp_input_col ) ).first()[0] else: min_timestamp = self._B4_EXTRACTION.min_timestamp min_timestamp = ( min_timestamp if min_timestamp else self._B4_EXTRACTION.default_min_timestamp ) max_timestamp = ( self._B4_EXTRACTION.max_timestamp if self._B4_EXTRACTION.max_timestamp else self._get_max_timestamp(self._MAX_TIMESTAMP_QUERY) ) if self._B4_EXTRACTION.act_req_join_condition: join_condition = f"{self._B4_EXTRACTION.act_req_join_condition}" else: join_condition = f"tbl.reqtsn = req.{self._B4_EXTRACTION.request_col_name}" base_query = f""" --# nosec FROM {self._B4_EXTRACTION.changelog_table} AS tbl JOIN {self._B4_EXTRACTION.request_status_tbl} AS req ON {join_condition} WHERE {self._B4_EXTRACTION.request_status_tbl_filter} AND req.{self._B4_EXTRACTION.request_col_name} > {min_timestamp} AND req.{self._B4_EXTRACTION.request_col_name} <= {max_timestamp}) """ main_cols = f""" (SELECT tbl.*, CAST({self._B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp """ # We join the main columns considered for the extraction with # extra_cols_act_request that people might want to use, filtering to only # add the comma and join the strings, in case extra_cols_act_request is # not None or empty. 
extraction_query_cols = ",".join( filter(None, [main_cols, self._B4_EXTRACTION.extra_cols_req_status_tbl]) ) extraction_query = extraction_query_cols + base_query predicates_query = f""" (SELECT DISTINCT({self._B4_EXTRACTION.partition_column}) {base_query} """ return extraction_query, predicates_query def _get_req_status_tbl_filter(self) -> Any: if self._B4_EXTRACTION.request_status_tbl_filter: return self._B4_EXTRACTION.request_status_tbl_filter else: if self._B4_EXTRACTION.adso_type == ADSOTypes.AQ.value: return f""" STORAGE = 'AQ' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U') AND REQUEST_STATUS IN ('GG', 'GR') AND UPPER(DATATARGET) = UPPER('{self._B4_EXTRACTION.data_target}') """ elif self._B4_EXTRACTION.adso_type == ADSOTypes.CL.value: return f""" STORAGE = 'AT' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U') AND REQUEST_STATUS IN ('GG') AND UPPER(DATATARGET) = UPPER('{self._B4_EXTRACTION.data_target}') """ else: raise NotImplementedError( f"The requested ADSO Type is not fully implemented and/or tested." f"Supported ADSO Types: {ADSOTypes.SUPPORTED_TYPES}" ) ================================================ FILE: lakehouse_engine/utils/extraction/sap_bw_extraction_utils.py ================================================ """Utilities module for SAP BW extraction processes.""" from dataclasses import dataclass from logging import Logger from typing import Optional, Tuple from lakehouse_engine.core.definitions import InputFormat, InputSpec, ReadType from lakehouse_engine.transformers.aggregators import Aggregators from lakehouse_engine.utils.extraction.jdbc_extraction_utils import ( JDBCExtraction, JDBCExtractionType, JDBCExtractionUtils, ) from lakehouse_engine.utils.logging_handler import LoggingHandler @dataclass class SAPBWExtraction(JDBCExtraction): """Configurations available for an Extraction from SAP BW. It inherits from SAPBWExtraction configurations, so it can use and/or overwrite those configurations. 
These configurations cover: - latest_timestamp_input_col: the column containing the actrequest timestamp in the dataset in latest_timestamp_data_location. Default: "actrequest_timestamp". - act_request_table: the name of the SAP BW activation requests table. Composed of database.table. Default: SAPPHA.RSODSACTREQ. - request_col_name: name of the column having the request to join with the activation request table. Default: actrequest. - act_req_join_condition: the join condition into activation table can be changed using this property. Default: 'changelog_tbl.request = act_req.request_col_name'. - odsobject: name of BW Object, used for joining with the activation request table to get the max actrequest_timestamp to consider while filtering the changelog table. - include_changelog_tech_cols: whether to include the technical columns (usually coming from the changelog) table or not. Default: True. - extra_cols_act_request: list of columns to be added from act request table. It needs to contain the prefix "act_req.". E.g. "act_req.col1 as column_one, act_req.col2 as column_two". - get_timestamp_from_act_request: whether to get init timestamp from act request table or assume current/given timestamp. - sap_bw_schema: sap bw schema. Default: SAPPHA. - max_timestamp_custom_schema: the custom schema to apply on the calculation of the max timestamp to consider for the delta extractions. Default: timestamp DECIMAL(23,0). - default_max_timestamp: the timestamp to use as default, when it is not possible to derive one. - default_min_timestamp: the timestamp to use as default, when it is not possible to derive one. - ods_prefix: the prefix to use when looking for the changelog table in SAP BW. Default: "8". - logsys: the BW source & receiver system ID to use to get the tsprefix (prefix for transfer structures) which is used while deriving the changelog table. Default: None & generated based on the schema. 
def get_changelog_table(self) -> str:
    """Get the changelog table, given an odsobject.

    When an odsobject is configured, no changelog table was explicitly
    provided and the extraction is not an init, the changelog table name is
    looked up in SAP BW through JDBC (RSTSODS joined with RSBASIDOC).
    Otherwise the configured changelog table is used, falling back to
    "<dbtable>_cl".

    Returns:
        String to use as changelog_table.

    Raises:
        ValueError: if the lookup finds no changelog table or more than one.
    """
    extraction = self._BW_EXTRACTION

    must_derive = (
        extraction.odsobject is not None
        and extraction.changelog_table is None
        and extraction.extraction_type != JDBCExtractionType.INIT.value
    )

    if must_derive:
        logsys_cond = self.get_logsys_cond()
        prefix = extraction.ods_prefix
        odsobject = extraction.odsobject

        if extraction.sap_bw_schema:
            system_table = f"{extraction.sap_bw_schema}.RSTSODS"
            pref_table = f"{extraction.sap_bw_schema}.RSBASIDOC"
        else:
            system_table = "RSTSODS"
            pref_table = "RSBASIDOC"

        query = f"""
            (SELECT ODSNAME_TECH
            FROM {system_table} o
            JOIN {pref_table} p
            ON {logsys_cond}
            AND o.ODSNAME = '{prefix}{odsobject}_' || p.tsprefix
            AND USERAPP = 'CHANGELOG' AND VERSION = '000')
        """  # nosec: B608
        self._LOGGER.info(
            f"Deriving changelog_table using the following query: {query}"
        )

        jdbc_args = {
            "url": extraction.url,
            "table": query,
            "properties": {
                "user": extraction.user,
                "password": extraction.password,
                "driver": extraction.driver,
            },
        }
        from lakehouse_engine.io.reader_factory import ReaderFactory

        changelog_df = ReaderFactory.get_data(
            InputSpec(
                spec_id="changelog_table",
                data_format=InputFormat.JDBC.value,
                read_type=ReadType.BATCH.value,
                jdbc_args=jdbc_args,
            )
        )

        # Collect the candidate names once, so the validation and the final
        # value come from the same result instead of separate Spark actions.
        candidates = [row[0] for row in changelog_df.collect()]

        if len(candidates) > 1:
            # The candidate names go in the message; DataFrame.show() only
            # prints to stdout and returns None.
            raise ValueError(
                f"More than one changelog table found for {odsobject}. "
                f"Aborting. Candidates: {candidates}"
            )
        if not candidates:
            raise ValueError(f"No changelog table found for {odsobject}. Aborting.")

        changelog_table = (
            f'{extraction.sap_bw_schema}."{candidates[0]}"'
            if extraction.sap_bw_schema
            else str(candidates[0])
        )
    else:
        changelog_table = (
            extraction.changelog_table
            if extraction.changelog_table
            else f"{extraction.dbtable}_cl"
        )
    self._LOGGER.info(f"The changelog table derived is: '{changelog_table}'")

    return changelog_table
The option " "'get_timestamp_from_act_request' is only " "available/useful for DSOs of type Write Optimised." ) else: extraction_query = self._get_init_extraction_query() predicates_query = f""" (SELECT DISTINCT({self._BW_EXTRACTION.partition_column}) FROM {self._BW_EXTRACTION.dbtable} t) """ # nosec: B608 return extraction_query, predicates_query def _get_init_extraction_query(self) -> str: """Get extraction query based on given/current timestamp. Returns: A query to submit to SAP BW for the initial data extraction. """ changelog_tech_cols = ( f"""'0' AS request, CAST({self._BW_EXTRACTION.extraction_timestamp} AS DECIMAL(15, 0)) AS actrequest_timestamp, '0' AS datapakid, 0 AS partno, 0 AS record,""" if self._BW_EXTRACTION.include_changelog_tech_cols else f"CAST({self._BW_EXTRACTION.extraction_timestamp} " f"AS DECIMAL(15, 0))" f" AS actrequest_timestamp," ) extraction_query = f""" (SELECT t.*, {changelog_tech_cols} CAST({self._BW_EXTRACTION.extraction_timestamp} AS DECIMAL(15, 0)) AS extraction_start_timestamp FROM {self._BW_EXTRACTION.dbtable} t )""" # nosec: B608 return extraction_query def _get_init_extraction_query_act_req_timestamp(self) -> str: """Get extraction query assuming the init timestamp from act_request table. Returns: A query to submit to SAP BW for the initial data extraction from write optimised DSOs, receiving the actrequest_timestamp from the activation requests table. """ extraction_query = f""" (SELECT t.*, act_req.timestamp as actrequest_timestamp, CAST({self._BW_EXTRACTION.extraction_timestamp} AS DECIMAL(15, 0)) AS extraction_start_timestamp FROM {self._BW_EXTRACTION.dbtable} t JOIN {self._BW_EXTRACTION.act_request_table} AS act_req ON t.request = act_req.{self._BW_EXTRACTION.request_col_name} WHERE act_req.odsobject = '{self._BW_EXTRACTION.odsobject}' AND operation = 'A' AND status = '0' )""" # nosec: B608 return extraction_query def _get_delta_query(self) -> Tuple[str, str]: """Get a delta query for an SAP BW DSO. 
An SAP BW DSO requires a join with a special type of table often called activation requests table, in which BW tracks down the timestamps associated with the several data loads that were performed into BW. Because the changelog table only contains the active request id, and that cannot be sorted by the downstream consumers to figure out the latest change, we need to join the changelog table with this special table to get the activation requests timestamps to then use them to figure out the latest changes in the delta load logic afterwards. Additionally, we also need to know which was the latest timestamp already loaded into the lakehouse bronze layer. The latest timestamp should always be available in the bronze dataset itself or in a dataset that tracks down all the actrequest timestamps that were already loaded. So we get the max value out of the respective actrequest timestamp column in that dataset. Returns: A query to submit to SAP BW for the delta data extraction. The query is enclosed in parentheses so that Spark treats it as a table and supports it in the dbtable option. 
""" if not self._BW_EXTRACTION.min_timestamp: from lakehouse_engine.io.reader_factory import ReaderFactory latest_timestamp_data_df = ReaderFactory.get_data( InputSpec( spec_id="data_with_latest_timestamp", data_format=self._BW_EXTRACTION.latest_timestamp_data_format, read_type=ReadType.BATCH.value, location=self._BW_EXTRACTION.latest_timestamp_data_location, ) ) min_timestamp = latest_timestamp_data_df.transform( Aggregators.get_max_value( self._BW_EXTRACTION.latest_timestamp_input_col ) ).first()[0] else: min_timestamp = self._BW_EXTRACTION.min_timestamp max_timestamp = ( self._BW_EXTRACTION.max_timestamp if self._BW_EXTRACTION.max_timestamp else self._get_max_timestamp(self._MAX_TIMESTAMP_QUERY) ) if self._BW_EXTRACTION.act_req_join_condition: join_condition = f"{self._BW_EXTRACTION.act_req_join_condition}" else: join_condition = ( f"changelog_tbl.request = " f"act_req.{self._BW_EXTRACTION.request_col_name}" ) base_query = f""" --# nosec FROM {self._BW_EXTRACTION.changelog_table} AS changelog_tbl JOIN {self._BW_EXTRACTION.act_request_table} AS act_req ON {join_condition} WHERE act_req.odsobject = '{self._BW_EXTRACTION.odsobject}' AND act_req.timestamp > {min_timestamp} AND act_req.timestamp <= {max_timestamp} AND operation = 'A' AND status = '0') """ main_cols = f""" (SELECT changelog_tbl.*, act_req.TIMESTAMP AS actrequest_timestamp, CAST({self._BW_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp """ # We join the main columns considered for the extraction with # extra_cols_act_request that people might want to use, filtering to only # add the comma and join the strings, in case extra_cols_act_request is # not None or empty. 
extraction_query_cols = ",".join( filter(None, [main_cols, self._BW_EXTRACTION.extra_cols_act_request]) ) extraction_query = extraction_query_cols + base_query predicates_query = f""" (SELECT DISTINCT({self._BW_EXTRACTION.partition_column}) {base_query} """ return extraction_query, predicates_query ================================================ FILE: lakehouse_engine/utils/extraction/sftp_extraction_utils.py ================================================ """Utilities module for SFTP extraction processes.""" import stat from base64 import decodebytes from datetime import datetime from enum import Enum from logging import Logger from stat import S_ISREG from typing import Any, List, Set, Tuple import paramiko as p from paramiko import Ed25519Key, PKey, RSAKey, Transport from paramiko.sftp_client import SFTPAttributes, SFTPClient # type: ignore from lakehouse_engine.transformers.exceptions import WrongArgumentsException from lakehouse_engine.utils.logging_handler import LoggingHandler class SFTPInputFormat(Enum): """Formats of algorithm input.""" CSV = "csv" FWF = "fwf" JSON = "json" XML = "xml" class SFTPExtractionFilter(Enum): """Standardize the types of filters we can have from a SFTP source.""" file_name_contains = "file_name_contains" LATEST_FILE = "latest_file" EARLIEST_FILE = "earliest_file" GREATER_THAN = "date_time_gt" LOWER_THAN = "date_time_lt" class SFTPExtractionUtils(object): """Utils for managing data extraction from particularly relevant SFTP sources.""" _logger: Logger = LoggingHandler(__name__).get_logger() @classmethod def get_files_list( cls, sftp: SFTPClient, remote_path: str, options_args: dict ) -> Set[str]: """Get a list of files to be extracted from SFTP. The arguments (options_args) to list files are: - date_time_gt(str): Filter the files greater than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS". 
@classmethod
def get_sftp_client(
    cls,
    options_args: dict,
) -> Tuple[SFTPClient, Transport]:
    """Get the SFTP client.

    The SFTP client is used to open an SFTP session across an open
    SSH Transport and perform remote file operations.

    Args:
        options_args: dictionary containing SFTP connection parameters.
            Options handled by the engine to verify the server identity:

            - "pkey": the expected host key of the server.
            - "key_type": the type of the host key. Mandatory when "pkey"
                is provided.
            - "add_auto_policy": when no "pkey" is provided, it must be
                set to True to allow the connection; the system host keys
                are loaded and unknown hosts only trigger a warning.

            The Paramiko arguments expected to connect are:

            - "hostname": the server to connect to.
            - "port": the server port to connect to. Default: 22.
            - "username": the username to authenticate as.
            - "password": used for password authentication.
            - "passphrase" – optional - used for decrypting private keys.
            - "key_filename" – optional - the filename, or list of
                filenames, of optional private key(s) and/or certs to try
                for authentication.
            - "timeout" – an optional timeout (in seconds) for the TCP
                connect.
            - "allow_agent" – optional - set to False to disable
                connecting to the SSH agent.
            - "look_for_keys" – optional - set to False to disable
                searching for discoverable private key files in ~/.ssh/.
            - "compress" – optional - set to True to turn on compression.
            - "sock" - optional - an open socket or socket-like object
                to use for communication to the target host.
            - "gss_auth" – optional - True if you want to use GSS-API
                authentication.
            - "gss_kex" – optional - Perform GSS-API Key Exchange and
                user authentication.
            - "gss_deleg_creds" – optional - Delegate GSS-API client
                credentials or not.
            - "gss_host" – optional - The targets name in the kerberos
                database.
            - "gss_trust_dns" – optional - Indicates whether or
                not the DNS is trusted to securely canonicalize the name of
                the host being connected to.
            - "banner_timeout" – an optional timeout (in seconds)
                to wait for the SSH banner to be presented.
            - "auth_timeout" – an optional timeout (in seconds)
                to wait for an authentication response.
            - "disabled_algorithms" – an optional dict passed directly to
                Transport and its keyword argument of the same name.
            - "transport_factory" – an optional callable which is handed a
                subset of the constructor arguments (primarily those
                related to the socket, GSS functionality, and algorithm
                selection) and generates a Transport instance to be used
                by this client.

    Returns:
        sftp -> a new SFTPClient session object.
        transport -> the Transport for this connection.

    Raises:
        WrongArgumentsException: if neither "pkey" nor "add_auto_policy" is
            provided, or if "pkey" is provided without "key_type".
        ConnectionError: logged and re-raised when the connection fails.
    """
    ssh_client = p.SSHClient()

    host_key = options_args.get("pkey")
    key_type = options_args.get("key_type")
    auto_policy = options_args.get("add_auto_policy")
    hostname = options_args.get("hostname")
    port = options_args.get("port", 22)

    try:
        if not host_key and not auto_policy:
            raise WrongArgumentsException(
                "Get SFTP Client: No host key (pkey) was provided and the "
                + "add_auto_policy property is false."
            )

        if host_key and not key_type:
            raise WrongArgumentsException(
                "Get SFTP Client: The key_type must be provided when "
                + "the host key (pkey) is provided."
            )

        if host_key and key_type:
            key = cls._get_host_keys(host_key, key_type)

            # Paramiko looks the host key up under the bare hostname for the
            # default SSH port and under "[hostname]:port" for any other
            # port, so the entry has to follow the same naming.
            host_entry = hostname if port == 22 else f"[{hostname}]:{port}"

            ssh_client.get_host_keys().add(
                hostname=host_entry,
                # Register the key under its own algorithm name instead of
                # always "ssh-rsa", so non-RSA host keys can be matched.
                keytype=key.get_name(),
                key=key,
            )
        elif auto_policy:
            ssh_client.load_system_host_keys()
            ssh_client.set_missing_host_key_policy(p.WarningPolicy())  # nosec: B507
        else:
            # Defensive default: unknown hosts are rejected.
            ssh_client.load_system_host_keys()
            ssh_client.set_missing_host_key_policy(p.RejectPolicy())

        ssh_client.connect(
            hostname=hostname,
            port=port,
            username=options_args.get("username", None),
            password=options_args.get("password", None),
            key_filename=options_args.get("key_filename", None),
            timeout=options_args.get("timeout", None),
            allow_agent=options_args.get("allow_agent", True),
            look_for_keys=options_args.get("look_for_keys", True),
            compress=options_args.get("compress", False),
            sock=options_args.get("sock", None),
            gss_auth=options_args.get("gss_auth", False),
            gss_kex=options_args.get("gss_kex", False),
            gss_deleg_creds=options_args.get("gss_deleg_creds", False),
            gss_host=options_args.get("gss_host", False),
            banner_timeout=options_args.get("banner_timeout", None),
            auth_timeout=options_args.get("auth_timeout", None),
            gss_trust_dns=options_args.get("gss_trust_dns", None),
            passphrase=options_args.get("passphrase", None),
            disabled_algorithms=options_args.get("disabled_algorithms", None),
            transport_factory=options_args.get("transport_factory", None),
        )

        sftp = ssh_client.open_sftp()
        transport = ssh_client.get_transport()
    except ConnectionError as e:
        cls._logger.error(e)
        raise
    return sftp, transport
@classmethod
def _get_earliest_latest_file(
    cls,
    sftp: SFTPClient,
    options_args: dict,
    list_filter_files: Set[str],
    folder_path: List,
) -> Set[str]:
    """Get the earliest or latest file of each directory.

    Only items that are part of list_filter_files and that are regular or
    compressed files are candidates, regardless of whether the earliest or
    the latest file is requested.

    Args:
        sftp: the SFTP client object.
        options_args: options from the acon ("earliest_file"/"latest_file").
        list_filter_files: set of file names to filter from.
        folder_path: the location of files.

    Returns:
        A set containing the earliest/latest file name of each folder. Folders
        without any candidate do not contribute an entry.
    """
    pick_earliest = options_args.get("earliest_file")
    pick_latest = options_args.get("latest_file")
    list_earl_lat_files: Set[str] = set()

    for folder in folder_path:
        file_date = 0
        file_name = ""
        all_items, _ = cls._get_folder_items(f"{folder}", sftp, options_args)
        for item in all_items:
            # These two guards must apply to BOTH modes. Previously the
            # unparenthesised "a and b and earliest or latest" expression let
            # the latest_file branch skip them entirely.
            if folder + item.filename not in list_filter_files:
                continue
            if not (S_ISREG(item.st_mode) or cls._is_compressed(item.filename)):
                continue

            is_earlier = pick_earliest and (
                file_date == 0 or item.st_mtime < file_date
            )
            is_later = pick_latest and (file_date == 0 or item.st_mtime > file_date)
            if is_earlier or is_later:
                file_date = item.st_mtime
                file_name = folder + item.filename

        # Do not register the "" placeholder when nothing matched in a folder.
        if file_name:
            list_earl_lat_files.add(file_name)

    return list_earl_lat_files
""" key: RSAKey | Ed25519Key = None if pkey and key_type.lower() == "rsa": b_pkey = bytes(pkey, "UTF-8") key = p.RSAKey(data=decodebytes(b_pkey)) elif pkey and key_type.lower() == "ed25519": b_pkey = bytes(pkey, "UTF-8") key = p.Ed25519Key(data=decodebytes(b_pkey)) return key @classmethod def _is_compressed(cls, filename: str) -> Any: """Validate if it is a compressed file. Args: filename: name of the file to be validated. Returns: A boolean with the result. """ return filename.endswith((".gz", ".zip")) @classmethod def _validate_date(cls, date_text: str) -> datetime: """Validate the input date format. Args: date_text: a string with the date or datetime value. The expected formats are: YYYY-MM-DD and YYYY-MM-DD HH:MM:SS Returns: The datetime validated and formatted. """ for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S"): try: if date_text is not None: return datetime.strptime(date_text, fmt) except ValueError: pass raise ValueError( "Incorrect data format, should be YYYY-MM-DD or YYYY-MM-DD HH:MM:SS." ) ================================================ FILE: lakehouse_engine/utils/file_utils.py ================================================ """Utilities for file name based operations.""" import re from os import listdir from typing import List def get_file_names_without_file_type( path: str, file_type: str, exclude_regex: str ) -> list: """Function to retrieve list of file names in a folder. This function filters by file type and removes the extension of the file name it returns. Args: path: path to the folder to list files file_type: type of the file to include in list exclude_regex: regex of file names to exclude Returns: A list of file names without file type. """ file_list: List[str] = [] for file in listdir(path): if not re.search(exclude_regex, file) and file.endswith(file_type): file_list.append(file.split(".")[0]) return file_list def get_directory_path(path: str) -> str: """Add '/' to the end of the path of a directory. 
Args: path: directory to be processed Returns: Directory path stripped and with '/' at the end. """ path = path.strip() return path if path[-1] == "/" else path + "/" ================================================ FILE: lakehouse_engine/utils/gab_utils.py ================================================ """Module to define GAB Utility classes.""" import ast import calendar import json from datetime import datetime from typing import Optional import pendulum from pyspark.sql import DataFrame from pyspark.sql.functions import col, lit, struct, to_json from lakehouse_engine.core.definitions import GABCadence, GABDefaults from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.logging_handler import LoggingHandler class GABUtils(object): """Class containing utility functions for GAB.""" _LOGGER = LoggingHandler(__name__).get_logger() def logger( self, run_start_time: datetime, run_end_time: datetime, start: str, end: str, query_id: str, query_label: str, cadence: str, stage_file_path: str, query: str, status: str, error_message: Exception | str, target_database: str, ) -> None: """Store the execution of each stage in the log events table. Args: run_start_time: execution start time. run_end_time: execution end time. start: use case start date. end: use case end date. query_id: gab configuration table use case identifier. query_label: gab configuration table use case name. cadence: cadence to process. stage_file_path: stage file path. query: query to execute. status: status of the query execution. error_message: error message if present. target_database: target database to write. 
""" ins = """ INSERT INTO {database}.gab_log_events VALUES ( '{run_start_time}', '{run_end_time}', '{start}', '{end}', {query_id}, '{query_label}', '{cadence}', '{stage_file_path}', '{query}', '{status}', '{error_message}' )""".format( # nosec: B608 database=target_database, run_start_time=run_start_time, run_end_time=run_end_time, start=start, end=end, query_id=query_id, query_label=query_label, cadence=cadence, stage_file_path=stage_file_path, query=self._escape_quote(query), status=status, error_message=( self._escape_quote(str(error_message)) if status == "Failed" else error_message ), ) ExecEnv.SESSION.sql(ins) @classmethod def _escape_quote(cls, to_escape: str) -> str: """Escape quote on string. Args: to_escape: string to escape. """ return to_escape.replace("'", r"\'").replace('"', r"\"") @classmethod def get_json_column_as_dict( cls, lookup_query_builder: DataFrame, query_id: str, query_column: str ) -> dict: # type: ignore """Get JSON column as dictionary. Args: lookup_query_builder: gab configuration data. query_id: gab configuration table use case identifier. query_column: column to get as json. """ column_df = lookup_query_builder.filter( col("query_id") == lit(query_id) ).select(col(query_column)) column_df_json = column_df.select( to_json(struct([column_df[x] for x in column_df.columns])) ).collect()[0][0] json_column = json.loads(column_df_json) for mapping in json_column.values(): column_as_json = ast.literal_eval(mapping) return column_as_json # type: ignore @classmethod def extract_columns_from_mapping( cls, columns: dict, is_dimension: bool, extract_column_without_alias: bool = False, table_alias: Optional[str] = None, is_extracted_value_as_name: bool = True, ) -> tuple[list[str], list[str]] | list[str]: """Extract and transform columns to SQL select statement. Args: columns: data to extract the columns. is_dimension: flag identifying if is a dimension or a metric. 
extract_column_without_alias: flag to inform if it's to extract columns without aliases. table_alias: name or alias from the source table. is_extracted_value_as_name: identify if the extracted value is the column name. """ column_with_alias = ( "".join([table_alias, ".", "{} as {}"]) if table_alias else "{} as {}" ) column_without_alias = ( "".join([table_alias, ".", "{}"]) if table_alias else "{}" ) extracted_columns_with_alias = [] extracted_columns_without_alias = [] for column_name, column_value in columns.items(): if extract_column_without_alias: extracted_column_without_alias = column_without_alias.format( cls._get_column_format_without_alias( is_dimension, column_name, column_value, is_extracted_value_as_name, ) ) extracted_columns_without_alias.append(extracted_column_without_alias) extracted_column_with_alias = column_with_alias.format( *cls._extract_column_with_alias( is_dimension, column_name, column_value, is_extracted_value_as_name, ) ) extracted_columns_with_alias.append(extracted_column_with_alias) return ( (extracted_columns_with_alias, extracted_columns_without_alias) if extract_column_without_alias else extracted_columns_with_alias ) @classmethod def _extract_column_with_alias( cls, is_dimension: bool, column_name: str, column_value: str | dict, is_extracted_value_as_name: bool = True, ) -> tuple[str, str]: """Extract column name with alias. Args: is_dimension: flag indicating if the column is a dimension. column_name: name of the column. column_value: value of the column. is_extracted_value_as_name: flag indicating if the name of the column is the extracted value. 
""" extracted_value = ( column_value if is_dimension else (column_value["metric_name"]) # type: ignore ) return ( (extracted_value, column_name) # type: ignore if is_extracted_value_as_name else (column_name, extracted_value) ) @classmethod def _get_column_format_without_alias( cls, is_dimension: bool, column_name: str, column_value: str | dict, is_extracted_value_as_name: bool = True, ) -> str: """Extract column name without alias. Args: is_dimension: flag indicating if the column is a dimension. column_name: name of the column. column_value: value of the column. is_extracted_value_as_name: flag indicating if the name of the column is the extracted value. """ extracted_value: str = ( column_value if is_dimension else (column_value["metric_name"]) # type: ignore ) return extracted_value if is_extracted_value_as_name else column_name @classmethod def get_cadence_configuration_at_end_date(cls, end_date: datetime) -> dict: """A dictionary that corresponds to the conclusion of a cadence. Any end date inputted by the user we check this end date is actually end of a cadence (YEAR, QUARTER, MONTH, WEEK). If the user input is 2024-03-31 this is a month end and a quarter end that means any use cases configured as month or quarter need to be calculated. Args: end_date: base end date. 
""" init_end_date_dict = {} expected_end_cadence_date = pendulum.datetime( int(end_date.strftime("%Y")), int(end_date.strftime("%m")), int(end_date.strftime("%d")), ).replace(tzinfo=None) # Validating YEAR cadence if end_date == expected_end_cadence_date.last_of("year"): init_end_date_dict["YEAR"] = "N" # Validating QUARTER cadence if end_date == expected_end_cadence_date.last_of("quarter"): init_end_date_dict["QUARTER"] = "N" # Validating MONTH cadence if end_date == datetime( int(end_date.strftime("%Y")), int(end_date.strftime("%m")), calendar.monthrange( int(end_date.strftime("%Y")), int(end_date.strftime("%m")) )[1], ): init_end_date_dict["MONTH"] = "N" # Validating WEEK cadence if end_date == expected_end_cadence_date.end_of("week").replace( hour=0, minute=0, second=0, microsecond=0 ): init_end_date_dict["WEEK"] = "N" init_end_date_dict["DAY"] = "N" return init_end_date_dict def get_reconciliation_cadences( self, cadence: str, selected_reconciliation_window: dict, cadence_configuration_at_end_date: dict, rerun_flag: str, ) -> dict: """Get reconciliation cadences based on the use case configuration. Args: cadence: cadence to process. selected_reconciliation_window: configured use case reconciliation window. cadence_configuration_at_end_date: cadences to execute at the end date. rerun_flag: flag indicating if it's a rerun or a normal run. """ configured_cadences = self._get_configured_cadences_by_snapshot( cadence, selected_reconciliation_window, cadence_configuration_at_end_date ) return self._get_cadences_to_execute( configured_cadences, cadence, cadence_configuration_at_end_date, rerun_flag ) @classmethod def _get_cadences_to_execute( cls, configured_cadences: dict, cadence: str, cadence_configuration_at_end_date: dict, rerun_flag: str, ) -> dict: """Get cadences to execute. Args: cadence: cadence to process. configured_cadences: configured use case reconciliation window. cadence_configuration_at_end_date: cadences to execute at the end date. 
rerun_flag: flag indicating if it's a rerun or a normal run. """ cadences_to_execute = {} cad_order = GABCadence.get_ordered_cadences() for snapshot_cadence, snapshot_flag in configured_cadences.items(): if ( (cad_order[cadence] > cad_order[snapshot_cadence]) and (rerun_flag == "Y") ) or snapshot_cadence in cadence_configuration_at_end_date: cadences_to_execute[snapshot_cadence] = snapshot_flag elif snapshot_cadence not in cadence_configuration_at_end_date: continue return cls._sort_cadences_to_execute(cadences_to_execute, cad_order) @classmethod def _sort_cadences_to_execute( cls, cadences_to_execute: dict, cad_order: dict ) -> dict: """Sort the cadences to execute. Args: cadences_to_execute: cadences to execute. cad_order: all cadences with order. """ # ordering it because when grouping cadences with snapshot and without snapshot # can impact the cadence ordering. sorted_cadences_to_execute: dict = dict( sorted( cadences_to_execute.items(), key=lambda item: cad_order.get(item[0]), # type: ignore ) ) # ordering cadences to execute it from bigger (YEAR) to smaller (DAY) cadences_to_execute_items = [] for cadence_name, cadence_value in sorted_cadences_to_execute.items(): cadences_to_execute_items.append((cadence_name, cadence_value)) cadences_sorted_by_bigger_cadence_to_execute: dict = dict( reversed(cadences_to_execute_items) ) return cadences_sorted_by_bigger_cadence_to_execute @classmethod def _get_configured_cadences_by_snapshot( cls, cadence: str, selected_reconciliation_window: dict, cadence_configuration_at_end_date: dict, ) -> dict: """Get configured cadences to execute. Args: cadence: selected cadence. selected_reconciliation_window: configured use case reconciliation window. cadence_configuration_at_end_date: cadences to execute at the end date. Returns: Each cadence with the corresponding information if it's to execute with snapshot or not. 
""" cadences_by_snapshot = {} ( no_snapshot_cadences, snapshot_cadences, ) = cls._generate_reconciliation_by_snapshot( cadence, selected_reconciliation_window ) for snapshot_cadence, snapshot_flag in no_snapshot_cadences.items(): if snapshot_cadence in cadence_configuration_at_end_date: cadences_by_snapshot[snapshot_cadence] = snapshot_flag cls._LOGGER.info(f"{snapshot_cadence} is present in {cadence} cadence") break cadences_by_snapshot.update(snapshot_cadences) if (not cadences_by_snapshot) and ( cadence in cadence_configuration_at_end_date ): cadences_by_snapshot[cadence] = "N" return cadences_by_snapshot @classmethod def _generate_reconciliation_by_snapshot( cls, cadence: str, selected_reconciliation_window: dict ) -> tuple[dict, dict]: """Generate reconciliation by snapshot. Args: cadence: cadence to process. selected_reconciliation_window: configured use case reconciliation window. """ cadence_snapshot_configuration = {cadence: "N"} for cadence in GABCadence.get_cadences(): cls._add_cadence_snapshot_to_cadence_snapshot_config( cadence, selected_reconciliation_window, cadence_snapshot_configuration ) cadence_snapshot_configuration = dict( sorted( cadence_snapshot_configuration.items(), key=( lambda item: GABCadence.get_ordered_cadences().get( # type: ignore item[0] ) ), ) ) cadence_snapshot_configuration = dict( reversed(list(cadence_snapshot_configuration.items())) ) cadences_without_snapshot = { key: value for key, value in cadence_snapshot_configuration.items() if value == "N" } cadences_with_snapshot = { key: value for key, value in cadence_snapshot_configuration.items() if value == "Y" } return cadences_with_snapshot, cadences_without_snapshot @classmethod def _add_cadence_snapshot_to_cadence_snapshot_config( cls, cadence: str, selected_reconciliation_window: dict, cadence_snapshot_configuration: dict, ) -> None: """Add the selected reconciliation to cadence snapshot configuration. Args: cadence: selected cadence. 
selected_reconciliation_window: configured use case reconciliation window. cadence_snapshot_configuration: cadence snapshot configuration dictionary who will be updated with the new value. """ if cadence in selected_reconciliation_window: cadence_snapshot_configuration[cadence] = selected_reconciliation_window[ cadence ]["snapshot"] @classmethod def format_datetime_to_default(cls, date_to_format: datetime) -> str: """Format datetime to GAB default format. Args: date_to_format: date to format. """ return datetime.date(date_to_format).strftime(GABDefaults.DATE_FORMAT.value) class GABPartitionUtils(object): """Class to extract a partition based in a date period.""" _LOGGER = LoggingHandler(__name__).get_logger() @classmethod def get_years(cls, start_date: str, end_date: str) -> list[str]: """Return a list of distinct years from the input parameters. Args: start_date: start of the period. end_date: end of the period. """ year = [] if start_date > end_date: raise ValueError( "Input Error: Invalid start_date and end_date. " "Start_date is greater than end_date" ) for i in range(int(start_date[0:4]), int(end_date[0:4]) + 1): year.append(str(i)) return year @classmethod def get_partition_condition(cls, start_date: str, end_date: str) -> str: """Return year,month and day partition statement from the input parameters. Args: start_date: start of the period. end_date: end of the period. """ years = cls.get_years(start_date, end_date) if len(years) > 1: partition_condition = cls._get_multiple_years_partition( start_date, end_date, years ) else: partition_condition = cls._get_single_year_partition(start_date, end_date) return partition_condition @classmethod def _get_multiple_years_partition( cls, start_date: str, end_date: str, years: list[str] ) -> str: """Return partition when executing multiple years (>1). Args: start_date: start of the period. end_date: end of the period. years: list of years. 
""" start_date_month = cls._extract_date_part_from_date("MONTH", start_date) start_date_day = cls._extract_date_part_from_date("DAY", start_date) end_date_month = cls._extract_date_part_from_date("MONTH", end_date) end_date_day = cls._extract_date_part_from_date("DAY", end_date) year_statement = "(year = {0} and (".format(years[0]) + "{})" if start_date_month != "12": start_date_partition = year_statement.format( "(month = {0} and day between {1} and 31)".format( start_date_month, start_date_day ) + " or (month between {0} and 12)".format(int(start_date_month) + 1) ) else: start_date_partition = year_statement.format( "month = {0} and day between {1} and 31".format( start_date_month, start_date_day ) ) period_years_partition = "" if len(years) == 3: period_years_partition = ") or (year = {0}".format(years[1]) elif len(years) > 3: period_years_partition = ") or (year between {0} and {1})".format( years[1], years[-2] ) if end_date_month != "01": end_date_partition = ( ") or (year = {0} and ((month between 01 and {1})".format( years[-1], int(end_date_month) - 1 ) + " or (month = {0} and day between 1 and {1})))".format( end_date_month, end_date_day ) ) else: end_date_partition = ( ") or (year = {0} and month = 1 and day between 01 and {1})".format( years[-1], end_date_day ) ) partition_condition = ( start_date_partition + period_years_partition + end_date_partition ) return partition_condition @classmethod def _get_single_year_partition(cls, start_date: str, end_date: str) -> str: """Return partition when executing a single year. Args: start_date: start of the period. end_date: end of the period. 
""" start_date_year = cls._extract_date_part_from_date("YEAR", start_date) start_date_month = cls._extract_date_part_from_date("MONTH", start_date) start_date_day = cls._extract_date_part_from_date("DAY", start_date) end_date_year = cls._extract_date_part_from_date("YEAR", end_date) end_date_month = cls._extract_date_part_from_date("MONTH", end_date) end_date_day = cls._extract_date_part_from_date("DAY", end_date) if start_date_month != end_date_month: months = [] for i in range(int(start_date_month), int(end_date_month) + 1): months.append(i) start_date_partition = ( "year = {0} and ((month={1} and day between {2} and 31)".format( start_date_year, months[0], start_date_day ) ) period_years_partition = "" if len(months) == 2: period_years_partition = start_date_partition elif len(months) == 3: period_years_partition = ( start_date_partition + " or (month = {0})".format(months[1]) ) elif len(months) > 3: period_years_partition = ( start_date_partition + " or (month between {0} and {1})".format(months[1], months[-2]) ) partition_condition = ( period_years_partition + " or (month = {0} and day between 1 and {1}))".format( end_date_month, end_date_day ) ) else: partition_condition = ( "year = {0} and month = {1} and day between {2} and {3}".format( end_date_year, end_date_month, start_date_day, end_date_day ) ) return partition_condition @classmethod def _extract_date_part_from_date(cls, part: str, date: str) -> str: """Extract date part from string date. Args: part: date part (possible values: DAY, MONTH, YEAR) date: string date. 
""" if "DAY" == part.upper(): return date[8:10] elif "MONTH" == part.upper(): return date[5:7] else: return date[0:4] ================================================ FILE: lakehouse_engine/utils/logging_handler.py ================================================ """Module to configure project logging.""" import logging import re FORMATTER = logging.Formatter("%(asctime)s — %(name)s — %(levelname)s — %(message)s") SENSITIVE_KEYS_REG = [ { # Enclosed in ''. # Stops replacing when it finds comma and space, space or end of line. "regex": r"'(kafka\.ssl\.keystore\.password|kafka\.ssl\.truststore\.password" r"|password|secret|credential|credentials|pass|key)'[ ]*:" r"[ ]*'.*?(, | |}|$)", "replace": "'masked_cred': '******', ", }, { # Enclosed in "". # Stops replacing when it finds comma and space, space or end of line. "regex": r'"(kafka\.ssl\.keystore\.password|kafka\.ssl\.truststore\.password' r'|password|secret|credential|credentials|pass|key)"[ ]*:' r'[ ]*".*?(, | |}|$)', "replace": '"masked_cred": "******", ', }, { # Not enclosed in '' or "". # Stops replacing when it finds comma and space, space or end of line. "regex": r"(kafka\.ssl\.keystore\.password|kafka\.ssl\.truststore\.password" r"|password|secret|credential|credentials|pass|key)[ ]*:" r"[ ]*.*?(, | |}|$)", "replace": "masked_cred: ******, ", }, ] class FilterSensitiveData(logging.Filter): """Logging filter to hide sensitive data from being shown in the logs.""" def filter(self, record: logging.LogRecord) -> bool: # noqa: A003 """Hide sensitive information from being shown in the logs. Based on the configured regex and replace strings, the content of the log records is replaced and then all the records are allowed to be logged (return True). Args: record: the LogRecord event being logged. Returns: The transformed record to be logged. 
""" for key_reg in SENSITIVE_KEYS_REG: record.msg = re.sub(key_reg["regex"], key_reg["replace"], str(record.msg)) return True class LoggingHandler(object): """Handle the logging of the lakehouse engine project.""" def __init__(self, class_name: str): """Construct a LoggingHandler instance. Args: class_name: name of the class to be indicated in the logs. """ self._logger: logging.Logger = logging.getLogger(class_name) self._logger.setLevel(logging.DEBUG) self._logger.addFilter(FilterSensitiveData()) lsh = logging.StreamHandler() lsh.setLevel(logging.DEBUG) lsh.setFormatter(FORMATTER) if not self._logger.hasHandlers(): # avoid keep adding handlers and therefore duplicate messages self._logger.addHandler(lsh) def get_logger(self) -> logging.Logger: """Get the _logger instance variable. Returns: logging.Logger: the logger object. """ return self._logger ================================================ FILE: lakehouse_engine/utils/rest_api.py ================================================ """Module to handle REST API operations.""" import time from enum import Enum import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from lakehouse_engine.utils.logging_handler import LoggingHandler LOG = LoggingHandler(__name__).get_logger() DEFAULT_CONTENT_TYPE = "application/json" class RestMethods(Enum): """Methods for REST API calls.""" POST = "POST" PUT = "PUT" ALLOWED_METHODS = ["POST", "PUT"] class RestStatusCodes(Enum): """REST Status Code.""" RETRY_STATUS_CODES = [429, 500, 502, 503, 504] OK_STATUS_CODES = [200] class RESTApiException(requests.RequestException): """Class representing any possible REST API Exception.""" def __init__(self, message: str) -> None: """Construct RESTApiException instances. Args: message: message to display on exception event. """ super().__init__(message) def get_basic_auth(username: str, password: str) -> requests.auth.HTTPBasicAuth: """Get the basic authentication object to authenticate REST requests. 
def execute_api_request(
    method: str,
    url: str,
    headers: dict = None,
    basic_auth_dict: dict = None,
    json: dict = None,
    files: dict = None,
    sleep_seconds: float = 0.2,
) -> requests.Response:
    """Execute a REST API request.

    Args:
        method: REST method (e.g., POST or PUT).
        url: url of the api.
        headers: request headers.
        basic_auth_dict: basic http authentication details
            (e.g., {"username": "x", "password": "y"}).
        json: json payload to send in the request.
        files: files payload to send in the request.
        sleep_seconds: for how many seconds to sleep to avoid error 429.

    Returns:
        response from the HTTP request.
    """
    basic_auth: requests.auth.HTTPBasicAuth = None
    if basic_auth_dict:
        basic_auth = get_basic_auth(
            basic_auth_dict["username"], basic_auth_dict["password"]
        )

    session = get_configured_session(sleep_seconds=sleep_seconds)
    try:
        return session.request(
            method=method,
            url=url,
            headers=headers,
            auth=basic_auth,
            json=json,
            files=files,
        )
    finally:
        # The session is used for this single request only, so release its
        # pooled connections. The response body is already loaded because
        # the request is not streamed.
        session.close()
Returns: Spark schema in a dict. """ return FileStorageFunctions.read_json(file_path, disable_dbfs_retry) @staticmethod def from_dict(struct_type: dict) -> StructType: """Get a spark schema from a dict. Args: struct_type: dict containing a spark schema structure. [Check here]( https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/StructType.html). Returns: Spark schema struct type. """ return StructType.fromJson(struct_type) @staticmethod def from_table_schema(table: str) -> StructType: """Get a spark schema from a table. Args: table: table name from which to inherit the schema. Returns: Spark schema struct type. """ return ExecEnv.SESSION.read.table(table).schema @classmethod def from_input_spec(cls, input_spec: InputSpec) -> Optional[StructType]: """Get a spark schema from an input specification. This covers scenarios where the schema is provided as part of the input specification of the algorithm. Schema can come from the table specified in the input specification (enforce_schema_from_table) or by the dict with the spark schema provided there also. Args: input_spec: input specification. Returns: spark schema struct type. """ if input_spec.enforce_schema_from_table: cls._logger.info( f"Reading schema from table: {input_spec.enforce_schema_from_table}" ) return SchemaUtils.from_table_schema(input_spec.enforce_schema_from_table) elif input_spec.schema_path: cls._logger.info(f"Reading schema from file: {input_spec.schema_path}") return SchemaUtils.from_file( input_spec.schema_path, input_spec.disable_dbfs_retry ) elif input_spec.schema: cls._logger.info( f"Reading schema from configuration file: {input_spec.schema}" ) return SchemaUtils.from_dict(input_spec.schema) else: cls._logger.info("No schema was provided... 
@staticmethod
def schema_flattener(
    schema: StructType,
    prefix: str = None,
    level: int = 1,
    max_level: int = None,
    shorten_names: bool = False,
    alias: bool = True,
    num_chars: int = 7,
    ignore_cols: List = None,
) -> List:
    """Recursively flatten the struct fields of a dataframe schema.

    Struct fields are expanded depth-first; every other field (and every struct
    that is ignored or lies beyond `max_level`) is emitted as a single column
    addressed by its dotted path.

    Args:
        schema: schema to be flattened.
        prefix: dotted path of the parent struct. Only relevant for being used
            in the internal recursive logic.
        level: depth of the schema currently being flattened. Only relevant for
            being used in the internal recursive logic.
        max_level: level until which you want to flatten the schema.
            Default: None (no limit).
        shorten_names: whether to shorten the names of the prefixes of the
            fields being flattened or not. Default: False.
        alias: whether to define alias for the columns being flattened or not.
            Default: True.
        num_chars: number of characters to consider when shortening the names
            of the fields. Default: 7.
        ignore_cols: dotted names of the columns which you don't want to
            flatten. Default: None.

    Returns:
        A list of spark columns, one per flattened field, aliased as
        `<prefix alias><field name>` when `alias` is set and the field is
        nested.
    """
    skipped = ignore_cols or []
    # The alias prefix depends only on the arguments of this call, so it is
    # resolved once instead of once per leaf field.
    alias_prefix = (
        SchemaUtils._get_prefix_alias(num_chars, prefix, shorten_names)
        if alias and prefix
        else None
    )

    flattened = []
    for field in schema.fields:
        full_name = f"{prefix}.{field.name}" if prefix else field.name
        within_depth = max_level is None or level <= max_level
        expandable = (
            isinstance(field.dataType, StructType)
            and full_name not in skipped
            and within_depth
        )

        if expandable:
            flattened.extend(
                SchemaUtils.schema_flattener(
                    schema=field.dataType,
                    prefix=full_name,
                    level=level + 1,
                    max_level=max_level,
                    shorten_names=shorten_names,
                    alias=alias,
                    num_chars=num_chars,
                    ignore_cols=skipped,
                )
            )
            continue

        column = col(full_name)
        if alias_prefix is not None:
            column = column.alias(f"{alias_prefix}{field.name}")
        flattened.append(column)

    return flattened
def _get_token(self) -> None:
    """Request an access token for the Sharepoint API and keep it on the instance.

    The token is requested from the MSAL application stored in `self.app` for
    the `/.default` scope of the configured Sharepoint API domain, and the
    result is stored in `self.token`. Failures are only logged: `self.token`
    is left untouched, so callers must validate it before use.
    """
    try:
        default_scope = f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/.default"
        self.token = self.app.acquire_token_for_client(scopes=[default_scope])
    except Exception as err:
        _logger.error(f"Token acquisition error: {err}")
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=30, min=30, max=150),
    retry=retry_if_exception_type(
        (RequestException, SharePointAPIError)
    ),  # Retry on these exceptions
)
def _make_request(
    self,
    endpoint: str,
    method: str = "GET",
    headers: dict = None,
    json_options: dict = None,
    data: object = None,
    stream: bool = False,
) -> requests.Response:
    """Execute API requests to Microsoft Graph API.

    A token is (re)acquired before every call. The request is retried up to 5
    times with exponential backoff when it raises `RequestException` or
    `SharePointAPIError`.

    !!! note
        If you try to upload large files sequentially, you may encounter
        a 503 "serviceNotAvailable" error. To mitigate this, consider using
        coalesce in the Acon transform specification. However, be aware that
        increasing the number of partitions also increases the likelihood of
        server throttling.

    Args:
        endpoint: The API endpoint to call.
        method: The HTTP method to use ('GET', 'POST', 'PUT', etc.).
        headers: A dictionary containing the necessary headers. When given, it
            is used as-is and the default `Authorization: Bearer <token>`
            header is NOT added.
        json_options: Optional; JSON data to include in the request body.
        data: Optional; additional data (e.g., file content) on request body.
        stream: Optional; forwarded to `requests.request` so the response body
            can be consumed in chunks instead of being loaded eagerly.

    Returns:
        A Response object from the request library. The HTTP status is not
        checked here; callers are expected to call `raise_for_status()`.

    Raises:
        SharePointAPIError: If the token is missing/invalid or the request
            itself fails.
    """
    # NOTE(review): `reraise` is not set on the retry decorator, so once the
    # attempts are exhausted tenacity presumably raises its own `RetryError`
    # wrapping the last exception - confirm callers expect that.
    self._get_token()  # Required to avoid cicd issue

    if not self.token or "access_token" not in self.token:
        raise SharePointAPIError("Authentication token is missing or invalid.")

    # The guard above guarantees the token is present, so no second check is
    # needed and every path either returns a response or raises.
    request_headers = (
        headers
        if headers
        else {"Authorization": "Bearer " + self.token["access_token"]}
    )

    try:
        return requests.request(
            method=method,
            url=endpoint,
            headers=request_headers,
            json=json_options,
            data=data,
            stream=stream,
        )
    except RequestException as error:
        # Chain the original exception so the root cause stays in the traceback.
        raise SharePointAPIError(f"{error}") from error
""" if self.site_id is not None: return self.site_id endpoint = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}" f"/sites/{ExecEnv.ENGINE_CONFIG.sharepoint_company_domain}:/" f"sites/{self.site_name}" ) try: response = self._make_request(endpoint=endpoint) response_data = self._parse_json( response, f"getting site id for site '{self.site_name}'" ) self.site_id = response_data.get("id") if not self.site_id: raise ValueError( f"Site ID not found for site '{self.site_name}' in the API " f"response: {response_data}" ) return self.site_id except RequestException as error: raise SharePointAPIError(f"{error}") except Exception as e: raise RuntimeError( f"Unexpected error while reading site ID for site '{self.site_name}':" f"{e}" ) def _get_drive_id(self) -> str: """Get drive ID from site ID and drive name, with caching. Returns: Drive ID as a string. Raises: SharepointAPIError: If the request fails. ValueError: If no drive is found. """ if self.drive_id is not None: return str(self.drive_id) site_id = self._get_site_id() endpoint = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/" f"{self.api_version}/sites/{site_id}/drives" ) try: response = self._make_request(endpoint=endpoint) response_data = self._parse_json(response, "listing drives for site") drives = response_data.get("value", []) if not drives: raise ValueError(f"No drives found for site '{self.site_id}'.") for drive in drives: if self.drive_name.strip().lower() == drive["name"].strip().lower(): drive_id = drive["id"] self.drive_id = drive_id return str(drive_id) raise ValueError( f"Drive '{self.drive_name}' could not be found in site '{site_id}'." ) except RequestException as error: raise SharePointAPIError(f"Request error: {error}") def check_if_endpoint_exists( self, folder_root_path: str = None, raise_error: bool = True ) -> bool: """Check if a Sharepoint drive or folder exists. Args: folder_root_path: Optional folder path to check. raise_error: Raise error if the folder doesn't exist. 
def check_if_local_path_exists(self, local_path: str) -> None:
    """Ensure a local directory can be listed.

    The check is performed by listing the directory; nothing is returned when
    the listing succeeds.

    Args:
        local_path: Local folder where files are temporarily stored.

    Raises:
        SharePointAPIError: If the path cannot be read.
    """
    try:
        os.listdir(local_path)
    except OSError as listing_error:  # IOError is an alias of OSError
        raise SharePointAPIError(str(listing_error))
def download_file_streaming(self, sp_file: SharepointFile) -> str:
    """Stream a Sharepoint file to the local staging path chunk by chunk.

    The file content endpoint is requested with `stream=True` and written to
    `<local_path>/<file_name>` in pieces of `self.chunk_size` bytes, so the whole
    file is never held in memory. Missing parent folders are created.

    Args:
        sp_file: File with remote path and name.

    Returns:
        Local file path.

    Raises:
        SharePointAPIError: If the download fails.
    """
    try:
        site_id = self._get_site_id()
        drive_id = self._get_drive_id()
        content_url = (
            f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
            f"sites/{site_id}/drives/{drive_id}/root:/{sp_file.file_path}:/content"
        )

        target_file = Path(self.local_path) / sp_file.file_name
        target_file.parent.mkdir(parents=True, exist_ok=True)

        with self._make_request(endpoint=content_url, stream=True) as response:
            response.raise_for_status()
            with open(target_file, "wb") as out_file:
                chunks = response.iter_content(chunk_size=self.chunk_size)
                # Empty chunks are skipped, exactly like an `if chunk` test.
                for chunk in filter(None, chunks):
                    out_file.write(chunk)

        return str(target_file)
    except requests.RequestException as error:
        raise SharePointAPIError(f"Failed to stream download: {error}")
) try: # Local base path (e.g., Unity Volumes, DBFS, or other mounted storage) local_base_path = Path(self.local_path) local_base_path.mkdir(parents=True, exist_ok=True) file_path = local_base_path / sp_file.file_name file_path.write_bytes(sp_file.content) return str(file_path) except Exception as e: raise RuntimeError( f"Failed to write file '{sp_file.file_name}' to Unity Volume: {e}" ) def write_to_local_path(self, df: DataFrame) -> None: """Write a Spark DataFrame to a local path (Volume) in CSV format. This method writes the provided Spark DataFrame to a specified local directory, saving it in CSV format. The method renames the output file from its default "part-*" naming convention to a specified file name. The dictionary local_options enables the customisation of the write action. The customizable options can be found here: https://spark.apache.org/docs/3.5.1/sql-data-sources-csv.html. Args: df: The Spark DataFrame to write to the local file system. Returns: None. Raises: IOError: If there is an issue during the file writing process. """ try: df.coalesce(1).write.mode("overwrite").save( path=self.local_path, format="csv", **self.local_options if self.local_options else {}, ) self._rename_local_file(self.local_path, self.file_name) except IOError as error: raise SharePointAPIError(f"{error}") def _rename_local_file(self, local_path: str, file_name: str) -> None: """Rename a local file that starts with 'part-' to the desired file name. Args: local_path: The directory where the file is located. file_name: The new file name for the local file. """ files_in_dir = os.listdir(local_path) part_file = [f for f in files_in_dir if f.startswith("part-")][0] try: os.rename( os.path.join(local_path, part_file), os.path.join(local_path, file_name) ) except IOError as error: raise SharePointAPIError(f"{error}") def write_to_sharepoint(self) -> None: """Upload a local file to Sharepoint in chunks using the Microsoft Graph API. 
def delete_local_path(self) -> None:
    """Reset the local staging folder to an empty directory.

    If `self.local_path` exists it is removed recursively; it is then created
    again (including missing parents), so the folder is empty afterwards.

    Raises:
        SharePointAPIError: If deletion or recreation fails.
    """
    try:
        staging_dir = Path(self.local_path)
        already_present = staging_dir.exists()
        if already_present:
            shutil.rmtree(staging_dir)
        staging_dir.mkdir(parents=True, exist_ok=True)
    except Exception as failure:
        raise SharePointAPIError(
            f"Failed to clear or recreate local path: {failure}"
        )
f"sites/{site_id}/drives/{drive_id}/items/" f"{current_item['id']}/children" ) data = self._parse_json(resp, f"listing children for '{component}'") items = cast(List[dict[str, Any]], data.get("value", [])) else: # it's a file; ensure we have downloadUrl if "@microsoft.graph.downloadUrl" not in current_item: resp = self._make_request( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/" f"{self.api_version}/sites/{site_id}/drives/{drive_id}/" f"items/{current_item['id']}" ) current_item = self._parse_json( resp, f"fetching file metadata for item id {current_item['id']}" ) return [current_item] return items def get_file_metadata(self, file_path: str) -> SharepointFile: """Fetch file metadata and content from Sharepoint. Args: file_path: Full Sharepoint path (e.g., 'folder/file.csv'). Returns: SharepointFile with metadata and bytes content. Raises: ValueError: If required metadata is missing or path is invalid. requests.HTTPError: On HTTP errors during retrieval. """ site_id = self._get_site_id() drive_id = self._get_drive_id() file_metadata_url = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/" f"{self.api_version}/sites/{site_id}/drives/{drive_id}/root:/{file_path}" ) # Get metadata metadata_response = self._make_request(endpoint=file_metadata_url, method="GET") metadata = self._parse_json( metadata_response, f"fetching metadata for '{file_path}'", ) file_name = metadata.get("name") time_created = metadata.get("createdDateTime", "") time_modified = metadata.get("lastModifiedDateTime", "") download_url = metadata.get("@microsoft.graph.downloadUrl") if not file_name or not download_url: raise ValueError( f"Missing required metadata for '{file_path}': " f"name={file_name!r}, " f"downloadUrl={'present' if download_url else 'absent'}" ) # Download file content (bytes) content_response = self._make_request(endpoint=download_url, method="GET") content_response.raise_for_status() file_content = content_response.content if "/" not in file_path: raise ValueError( f"Invalid 
file path: '{file_path}'. Expected a folder/file structure." ) folder = file_path.rsplit("/", 1)[0] return SharepointFile( file_name=file_name, time_created=time_created, time_modified=time_modified, content=file_content, _folder=folder, ) def archive_sharepoint_file( self, sp_file: SharepointFile, to_path: str | None, *, move_enabled: bool = True ) -> None: """Rename (timestamp) and optionally move a Sharepoint file. Args: sp_file: File to archive. to_path: Destination folder (if moving). move_enabled: Whether to move after rename. Raises: SharePointAPIError: If the request fails. """ # If already archived (renamed+moved before), don't repeat if getattr(sp_file, "_already_archived", False) and move_enabled and to_path: _logger.info( "Skipping archive: file already archived -> %s", sp_file.file_name ) return try: if not getattr(sp_file, "skip_rename", False): new_file_name = self._rename_sharepoint_file(sp_file) sp_file.file_name = new_file_name sp_file.skip_rename = True if not move_enabled or not to_path: _logger.info( """Archiving disabled or no target folder; Renamed only and left in place: '%s'.""", sp_file.file_path, ) return self._move_file_in_sharepoint(sp_file, to_path) sp_file._already_archived = True _logger.info("Archived '%s' to '%s'.", sp_file.file_name, to_path) except requests.RequestException as e: _logger.error( "Request failed while archiving '%s': %s", sp_file.file_name, e ) raise SharePointAPIError(f"Request failed: {e}") def _rename_sharepoint_file(self, sp_file: SharepointFile) -> str: """Prefix file name with a timestamp (skip if already renamed). Args: sp_file: File to rename. Returns: New file name. Raises: SharePointAPIError: If the rename request fails. 
""" try: if getattr(sp_file, "skip_rename", False): _logger.info( f"Skipping rename for already-prefixed file: {sp_file.file_name}" ) return sp_file.file_name _logger.info(f"Renaming file at '{sp_file.file_path}'.") site_id = self._get_site_id() drive_id = self._get_drive_id() current_date_formatted = datetime.now().strftime("%Y%m%d%H%M%S") new_file_name = f"{current_date_formatted}_{sp_file.file_name}" url_get_file = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/" f"sites/{site_id}/drives/{drive_id}/root:/{sp_file.file_path}" ) resp = self._make_request(endpoint=url_get_file, method="GET") file_info = self._parse_json( resp, f"fetching file info at '{sp_file.file_path}'" ) file_id = file_info.get("id") if not file_id: raise ValueError( f"File '{sp_file.file_name}' not found in '{sp_file.file_path}'." ) url_rename_file = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/" f"sites/{site_id}/drives/{drive_id}/items/{file_id}" ) rename_payload = {"name": new_file_name} rename_resp = self._make_request( endpoint=url_rename_file, method="PATCH", json_options=rename_payload ) rename_resp.raise_for_status() _logger.info(f"File '{sp_file.file_name}' renamed to '{new_file_name}'.") sp_file.file_name = new_file_name return new_file_name except requests.RequestException as e: _logger.error( f"Request failed while renaming file '{sp_file.file_name}': {e}" ) raise SharePointAPIError(f"Request failed: {e}") def _move_file_in_sharepoint(self, sp_file: SharepointFile, to_path: str) -> None: """Move a file to another folder in Sharepoint. Args: sp_file: File to move. to_path: Destination path. Raises: ValueError: If the file ID cannot be resolved. SharePointAPIError: If the move request fails. """ try: _logger.info( f"Moving file '{sp_file.file_name}' from '{sp_file.file_path}' to " f"'{to_path}'." 
) site_id = self._get_site_id() drive_id = self._get_drive_id() if not self.check_if_endpoint_exists( folder_root_path=to_path, raise_error=False ): self._create_folder_in_sharepoint(to_path) # Create the folder if it doesn't exist; raise_error = false so it # doesn't throw error _logger.info(f"Created archive folder: {to_path}") url_get_file = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/" f"sites/{site_id}/drives/{drive_id}/root:/{sp_file.file_path}" ) response = self._make_request(endpoint=url_get_file, method="GET") file_info = self._parse_json( response, f"getting file id for move '{sp_file.file_path}'", ) file_id = file_info.get("id") if not file_id: raise ValueError( f"File '{sp_file.file_name}' not found in '{sp_file.file_path}'." ) url_move_file = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/" f"sites/{site_id}/drives/{drive_id}/items/{file_id}" ) new_parent_reference = { "parentReference": {"path": f"/drive/root:/{to_path}"}, "name": sp_file.file_name, } response = self._make_request( endpoint=url_move_file, method="PATCH", json_options=new_parent_reference, ) response.raise_for_status() _logger.info( f"File '{sp_file.file_name}' successfully moved to '{to_path}'." ) except requests.RequestException as e: _logger.error( f"Request failed while moving file '{sp_file.file_name}': {e}" ) raise SharePointAPIError(f"Request failed: {e}") def _create_folder_in_sharepoint(self, folder_path: str) -> None: """Create the final folder in a Sharepoint path. Args: folder_path: Full folder path to create. Raises: SharePointAPIError: If folder creation fails. 
""" try: site_id = self._get_site_id() drive_id = self._get_drive_id() parent_path, folder_name = ( folder_path.rsplit("/", 1) if "/" in folder_path else ("", folder_path) ) parent_path = parent_path.strip("/") # Clean path just in case _logger.info( f"Creating folder '{folder_name}' inside '{parent_path or 'root'}'" ) if parent_path: endpoint = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/" f"sites/{site_id}/drives/{drive_id}/root:/{parent_path}:/children" ) else: endpoint = ( f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/" f"sites/{site_id}/drives/{drive_id}/root/children" ) folder_metadata = {"name": folder_name, "folder": {}} response = self._make_request( endpoint=endpoint, method="POST", json_options=folder_metadata ) response.raise_for_status() _logger.info(f"Folder '{folder_path}' created successfully.") except requests.RequestException as e: _logger.error(f"Failed to create folder '{folder_path}': {e}") raise SharePointAPIError(f"Error creating folder '{folder_path}': {e}") ================================================ FILE: lakehouse_engine/utils/spark_utils.py ================================================ """Utilities to facilitate spark dataframe management.""" from pyspark.sql import DataFrame from lakehouse_engine.core.exec_env import ExecEnv class SparkUtils(object): """Spark utils that help retrieve and manage dataframes.""" @staticmethod def create_temp_view( df: DataFrame, view_name: str, return_prefix: bool = False ) -> None | str: """Create a temporary view from a dataframe. If the execution environment is serverless, it creates a temporary view, otherwise it creates a global temporary view. Serverless environments don't support global temporary views, so we need to create a temporary view in that case, but it still gets accessible from other queries in the same session. In non-serverless environments, we create a global temporary view to make sure it is accessible from other sessions as well. 
Args: df: dataframe to create the view from. view_name: name of the view to create. return_prefix: whether to return the prefix to use in queries for this view or not. Returns: None or the prefix to use in queries for this view, depending on the value of return_prefix. """ if ExecEnv.IS_SERVERLESS: df.createOrReplaceTempView(view_name) prefix = "" else: df.createOrReplaceGlobalTempView(view_name) prefix = "global_temp." if return_prefix: return prefix return None ================================================ FILE: lakehouse_engine/utils/sql_parser_utils.py ================================================ """Module to parse sql files.""" from lakehouse_engine.core.definitions import SQLParser class SQLParserUtils(object): """Parser utilities class.""" def split_sql_commands( self, sql_commands: str, delimiter: str, advanced_parser: bool, ) -> list[str]: """Read the sql commands of a file to choose how to split them. Args: sql_commands: commands to be split. delimiter: delimiter to split the sql commands. advanced_parser: boolean to define if we need to use a complex split. Returns: List with the sql commands. """ if advanced_parser: self.sql_commands: str = sql_commands self.delimiter: str = delimiter self.separated_sql_commands: list[str] = [] self.split_index: int = 0 return self._split_sql_commands() else: return sql_commands.split(delimiter) def _split_sql_commands(self) -> list[str]: """Read the sql commands of a file to split them based on a delimiter. Returns: List with the sql commands. 
""" single_quotes: int = 0 double_quotes: int = 0 one_line_comment: int = 0 multiple_line_comment: int = 0 for index, char in enumerate(self.sql_commands): if char == SQLParser.SINGLE_QUOTES.value and self._character_validation( value=[double_quotes, one_line_comment, multiple_line_comment] ): single_quotes = self._update_value( value=single_quotes, condition=self._character_validation( value=self._get_substring(first_char=index - 1, last_char=index) ), operation="+-", ) elif char == SQLParser.DOUBLE_QUOTES.value and self._character_validation( value=[single_quotes, one_line_comment, multiple_line_comment] ): double_quotes = self._update_value( value=double_quotes, condition=self._character_validation( value=self._get_substring(first_char=index - 1, last_char=index) ), operation="+-", ) elif char == SQLParser.SINGLE_TRACE.value and self._character_validation( value=[double_quotes, single_quotes, multiple_line_comment] ): one_line_comment = self._update_value( value=one_line_comment, condition=( self._get_substring(first_char=index, last_char=index + 2) == SQLParser.DOUBLE_TRACES.value ), operation="+", ) elif ( char == SQLParser.SLASH.value or char == SQLParser.STAR.value ) and self._character_validation( value=[double_quotes, single_quotes, one_line_comment] ): multiple_line_comment = self._update_value( value=multiple_line_comment, condition=self._get_substring(first_char=index, last_char=index + 2) in SQLParser.MULTIPLE_LINE_COMMENT.value, operation="+-", ) one_line_comment = self._update_value( value=one_line_comment, condition=char == SQLParser.PARAGRAPH.value, operation="-", ) self._validate_command_is_closed( index=index, dependencies=self._character_validation( value=[ single_quotes, double_quotes, one_line_comment, multiple_line_comment, ] ), ) return self.separated_sql_commands def _get_substring(self, first_char: int = None, last_char: int = None) -> str: """Get the substring based on the indexes passed as arguments. 
Args: first_char: represents the first index of the string. last_char: represents the last index of the string. Returns: The substring based on the indexes passed as arguments. """ return self.sql_commands[first_char:last_char] def _validate_command_is_closed(self, index: int, dependencies: int) -> None: """Validate based on the delimiter if we have the closing of a sql command. Args: index: index of the character in a string. dependencies: represents an int to validate if we are outside of quotes,... """ if ( self._get_substring(first_char=index, last_char=index + len(self.delimiter)) == self.delimiter and dependencies ): self._add_new_command( sql_command=self._get_substring( first_char=self.split_index, last_char=index ) ) self.split_index = index + len(self.delimiter) if self._get_substring( first_char=index, last_char=index + len(self.delimiter) ) != self.delimiter and index + len(self.delimiter) == len(self.sql_commands): self._add_new_command( sql_command=self._get_substring( first_char=self.split_index, last_char=len(self.sql_commands) ) ) def _character_validation(self, value: str | list) -> bool: """Validate if character is the opening/closing/inside of a comment. Args: value: represent the value associated to different validated types or a character to be analyzed. Returns: Boolean that indicates if character found is the opening or closing of a comment, is inside of quotes, comments,... """ if value.__class__.__name__ == "list": return sum(value) == 0 else: return value != SQLParser.BACKSLASH.value def _add_new_command(self, sql_command: str) -> None: """Add a newly found command to list of sql commands to execute. Args: sql_command: command to be added to list. """ self.separated_sql_commands.append(str(sql_command)) def _update_value(self, value: int, operation: str, condition: bool = False) -> int: """Update value associated to different types of comments or quotes. 
Args: value: value to be updated operation: operation that we want to perform on the value. condition: validate if we have a condition associated to the value. Returns: A integer that represents the updated value. """ if condition and operation == "+-": value = value + 1 if value == 0 else value - 1 elif condition and operation == "+": value = value + 1 if value == 0 else value elif condition and operation == "-": value = value - 1 if value == 1 else value return value ================================================ FILE: lakehouse_engine/utils/storage/__init__.py ================================================ """Utilities to interact with storage systems.""" ================================================ FILE: lakehouse_engine/utils/storage/dbfs_storage.py ================================================ """Module to represent a DBFS file storage system.""" from typing import Any from urllib.parse import ParseResult, urlunparse from lakehouse_engine.utils.databricks_utils import DatabricksUtils from lakehouse_engine.utils.logging_handler import LoggingHandler from lakehouse_engine.utils.storage.file_storage import FileStorage class DBFSStorage(FileStorage): """Class to represent a DBFS file storage system.""" _LOGGER = LoggingHandler(__name__).get_logger() _MAX_INT = 2147483647 @classmethod def get_file_payload(cls, url: ParseResult) -> Any: """Get the content of a file. Args: url: url of the file. Returns: File payload/content. """ from lakehouse_engine.core.exec_env import ExecEnv str_url = urlunparse(url) cls._LOGGER.info(f"Trying with dbfs_storage: Reading from file: {str_url}") return DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs.head( str_url, cls._MAX_INT ) @classmethod def write_payload_to_file(cls, url: ParseResult, content: str) -> None: """Write payload into a file. Args: url: url of the file. content: content to write into the file. 
class FileStorageFunctions(ABC):  # noqa: B024
    """Class for common file storage functions.

    Dispatches reads and writes to the S3, DBFS or local file system storage
    implementations, based on the scheme/prefix of the path.
    """

    @classmethod
    def read_json(cls, path: str, disable_dbfs_retry: bool = False) -> Any:
        """Read a json file.

        The file should be in a supported file system (e.g., s3, dbfs or
        local filesystem).

        Args:
            path: path to the json file.
            disable_dbfs_retry: optional flag to disable file storage dbfs.
                When True the file is always read through the s3 storage.

        Returns:
            Dict with json file content.

        Raises:
            NotImplementedError: if the scheme of the path is not supported.
        """
        url = urlparse(path, allow_fragments=False)
        if disable_dbfs_retry:
            return json.load(S3Storage.get_file_payload(url))
        elif url.scheme == "s3" and cls.is_boto3_configured():
            try:
                return json.load(S3Storage.get_file_payload(url))
            except Exception:
                # Best effort: any failure reading through boto3 falls back
                # to reading the same url through dbfs.
                return json.loads(DBFSStorage.get_file_payload(url))
        elif url.scheme == "file":
            # The local storage returns an open file object, so make sure it
            # gets closed as soon as the content is parsed.
            with LocalFSStorage.get_file_payload(url) as local_file:
                return json.load(local_file)
        elif url.scheme in ["dbfs", "s3"]:
            return json.loads(DBFSStorage.get_file_payload(url))
        else:
            raise NotImplementedError(
                f"File storage protocol not implemented for {path}."
            )

    @classmethod
    def read_sql(cls, path: str, disable_dbfs_retry: bool = False) -> Any:
        """Read a sql file.

        The file should be in a supported file system (e.g., s3, dbfs or local
        filesystem).

        Args:
            path: path to the sql file.
            disable_dbfs_retry: optional flag to disable file storage dbfs.
                When True the file is always read through the s3 storage.

        Returns:
            Content of the SQL file.

        Raises:
            NotImplementedError: if the scheme of the path is not supported.
        """
        url = urlparse(path, allow_fragments=False)
        if disable_dbfs_retry:
            return S3Storage.get_file_payload(url).read().decode("utf-8")
        elif url.scheme == "s3" and cls.is_boto3_configured():
            try:
                return S3Storage.get_file_payload(url).read().decode("utf-8")
            except Exception:
                # Best effort: any failure reading through boto3 falls back
                # to reading the same url through dbfs.
                return DBFSStorage.get_file_payload(url)
        elif url.scheme == "file":
            # The local storage returns an open file object, so make sure it
            # gets closed as soon as the content is read.
            with LocalFSStorage.get_file_payload(url) as local_file:
                return local_file.read()
        elif url.scheme in ["dbfs", "s3"]:
            return DBFSStorage.get_file_payload(url)
        else:
            raise NotImplementedError(
                f"Object storage protocol not implemented for {path}."
            )

    @classmethod
    def write_payload(
        cls, path: str, url: ParseResult, content: str, disable_dbfs_retry: bool = False
    ) -> None:
        """Write payload into a file.

        The file should be in a supported file system (e.g., s3, dbfs or local
        filesystem). Any path that is neither s3 nor dbfs is written through
        the local file system storage.

        Args:
            path: path to validate the file type.
            url: url of the file.
            content: content to write into the file.
            disable_dbfs_retry: optional flag to disable file storage dbfs.
                When True the file is always written through the s3 storage.
        """
        if disable_dbfs_retry:
            S3Storage.write_payload_to_file(url, content)
        elif path.startswith("s3://") and cls.is_boto3_configured():
            try:
                S3Storage.write_payload_to_file(url, content)
            except Exception:
                # Best effort: any failure writing through boto3 falls back
                # to writing the same url through dbfs.
                DBFSStorage.write_payload_to_file(url, content)
        elif path.startswith(("s3://", "dbfs:/")):
            DBFSStorage.write_payload_to_file(url, content)
        else:
            LocalFSStorage.write_payload_to_file(url, content)

    @staticmethod
    def is_boto3_configured() -> bool:
        """Check if boto3 is able to locate credentials and properly configured.

        If boto3 is not properly configured, we might want to try a different reader.

        Returns:
            True if the sts get_caller_identity call succeeds, False if it
            raises any exception.
        """
        try:
            boto3.client("sts").get_caller_identity()
            return True
        except Exception:
            return False
class S3Storage(FileStorage):
    """File storage implementation backed by s3 (accessed through boto3)."""

    _LOGGER = LoggingHandler(__name__).get_logger()

    @classmethod
    def get_file_payload(cls, url: ParseResult) -> Any:
        """Fetch the body of the s3 object the url points to.

        The bucket is taken from the netloc of the url and the object key from
        its path (without the leading slash).

        Args:
            url: url of the file.

        Returns:
            The "Body" entry of the s3 get object response.
        """
        s3_object = boto3.resource("s3").Object(url.netloc, url.path.lstrip("/"))
        message = (
            f"Trying with s3_storage: "
            f"Reading from file: {url.scheme}://{url.netloc}{url.path}"
        )
        cls._LOGGER.info(message)
        response = s3_object.get()
        return response["Body"]

    @classmethod
    def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
        """Put the content into the s3 object the url points to.

        The bucket is taken from the netloc of the url and the object key from
        its path (without the leading slash).

        Args:
            url: url of the file.
            content: content to write into the file.
        """
        s3_object = boto3.resource("s3").Object(url.netloc, url.path.lstrip("/"))
        message = (
            f"Trying with s3_storage: "
            f"Writing into file: {url.scheme}://{url.netloc}{url.path}"
        )
        cls._LOGGER.info(message)
        s3_object.put(Body=content)
- [Data Loader](lakehouse_engine_usage/data_loader.html) - [Data Quality](lakehouse_engine_usage/data_quality.html) - [Reconciliator](lakehouse_engine_usage/reconciliator.html) - [Sensors - Sensor & Heartbeat Sensor](lakehouse_engine_usage/sensors.html) - [GAB](lakehouse_engine_usage/gab.html) """ ================================================ FILE: lakehouse_engine_usage/data_loader/__init__.py ================================================ """ .. include::data_loader.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/append_load_from_jdbc_with_permissive_mode/__init__.py ================================================ """ .. include::append_load_from_jdbc_with_permissive_mode.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/append_load_from_jdbc_with_permissive_mode/append_load_from_jdbc_with_permissive_mode.md ================================================ # Append Load from JDBC with PERMISSIVE mode (default) This scenario is an append load from a JDBC source (e.g., SAP BW, Oracle Database, SQL Server Database...). 
```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/append_load/jdbc_permissive/tests.db", "table": "jdbc_permissive", "properties": { "driver": "org.sqlite.JDBC" } }, "options": { "numPartitions": 1 } }, { "spec_id": "sales_bronze", "read_type": "batch", "db_table": "test_db.jdbc_permissive_table" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_date", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "date" } } ] }, { "spec_id": "appended_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "date", "increment_df": "max_sales_bronze_date" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "appended_sales", "write_type": "append", "db_table": "test_db.jdbc_permissive_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/append_load/jdbc_permissive/data" } ] } load_data(acon=acon) ``` ##### Relevant notes - The **ReadMode** is **PERMISSIVE** in this scenario, which **is the default in Spark**, hence we **don't need to specify it**. Permissive means don't enforce any schema on the input data. - From a JDBC source the ReadType needs to be "batch" always as "streaming" is not available for a JDBC source. - In this scenario we do an append load by getting the max date (transformer_spec ["get_max_value"](../../../reference/packages/transformers/aggregators.md#packages.transformers.aggregators.Aggregators.get_max_value)) on bronze and use that date to filter the source to only get data with a date greater than that max date on bronze (transformer_spec ["incremental_filter"](../../../reference/packages/transformers/filters.md#packages.transformers.filters.Filters.incremental_filter)). 
**That is the standard way we do incremental batch loads in the lakehouse engine.** For streaming incremental loads we rely on Spark Streaming checkpoint feature [(check a streaming append load ACON example)](../streaming_append_load_with_terminator/streaming_append_load_with_terminator.md). ================================================ FILE: lakehouse_engine_usage/data_loader/append_load_with_failfast/__init__.py ================================================ """ .. include::append_load_with_failfast.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/append_load_with_failfast/append_load_with_failfast.md ================================================ # Append Load with FAILFAST This scenario is an append load enforcing the schema (using the schema of the target table to enforce the schema of the source, i.e., the schema of the source needs to exactly match the schema of the target table) and FAILFASTING if the schema of the input data does not match the one we specified. 
```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "enforce_schema_from_table": "test_db.failfast_table", "options": { "header": True, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/append_load/failfast/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "db_table": "test_db.failfast_table" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_date", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "date" } } ] }, { "spec_id": "appended_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "date", "increment_df": "max_sales_bronze_date" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "appended_sales", "write_type": "append", "db_table": "test_db.failfast_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/append_load/failfast/data" } ] } load_data(acon=acon) ``` ##### Relevant notes - The **ReadMode** is **FAILFAST** in this scenario, i.e., fail the algorithm if the schema of the input data does not match the one we specified via schema_path, read_schema_from_table or schema Input_specs variables. - In this scenario we do an append load by getting the max date (transformer_spec ["get_max_value"](../../../reference/packages/transformers/aggregators.md#packages.transformers.aggregators.Aggregators.get_max_value)) on bronze and use that date to filter the source to only get data with a date greater than that max date on bronze (transformer_spec ["incremental_filter"](../../../reference/packages/transformers/filters.md#packages.transformers.filters.Filters.incremental_filter)). 
**That is the standard way we do incremental batch loads in the lakehouse engine.** For streaming incremental loads we rely on Spark Streaming checkpoint feature [(check a streaming append load ACON example)](../streaming_append_load_with_terminator/streaming_append_load_with_terminator.md). ================================================ FILE: lakehouse_engine_usage/data_loader/batch_delta_load_init_delta_backfill_with_merge/__init__.py ================================================ """ .. include::batch_delta_load_init_delta_backfill_with_merge.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/batch_delta_load_init_delta_backfill_with_merge/batch_delta_load_init_delta_backfill_with_merge.md ================================================ # Batch Delta Load Init, Delta and Backfill with Merge This scenario illustrates the process of implementing a delta load algorithm by first using an ACON to perform an initial load, then another one to perform the regular deltas that will be triggered on a recurrent basis, and finally an ACON for backfilling specific parcels if ever needed. 
## Init Load ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ] } load_data(acon=acon) ``` ##### Relevant Notes - We can see that even though this is an init load we still have chosen to condense the records through our ["condense_record_mode_cdc"](../../../reference/packages/transformers/condensers.md#packages.transformers.condensers.Condensers.condense_record_mode_cdc) transformer. This is a condensation step capable of handling SAP BW style changelogs based on actrequest_timestamps, datapakid, record_mode, etc... - In the init load we actually did a merge in this case because we wanted to test locally if a merge with an empty target table works, but you don't have to do it, as an init load usually can be just a full load. 
If a merge of init data with an empty table has any performance implications when compared to a regular insert remains to be tested, but we don't have any reason to recommend a merge over an insert for an init load, and as said, this was done solely for local testing purposes, you can just use `write_type: "overwrite"` ## Delta Load ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp" } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or 
new.recordmode not in ('R','D','X')" } } ] } load_data(acon=acon) ``` ##### Relevant Notes - The merge predicate and the insert, delete or update predicates should reflect the reality of your data, and it's up to each data product to figure out which predicates better match their reality: - The merge predicate usually involves making sure that the "primary key" for your data matches. !!! note "**Performance Tip!!!**" Ideally, in order to get a performance boost in your merges, you should also place a filter in your merge predicate (e.g., certain technical or business date in the target table >= x days ago), based on the assumption that the rows in that specified interval will never change in the future. This can drastically decrease the merge times of big tables. - The insert, delete and update predicates will always depend on the structure of your changelog, and also how you expect your updates to arrive (e.g., in certain data products you know that you will never get out of order data or late arriving data, while in other you can never ensure that). These predicates should reflect that in order to prevent you from doing unwanted changes to the target delta lake table. - For example, in this scenario, we delete rows that have the R, D or X record_mode values, because we know that if after condensing the rows that is the latest status of that row from the changelog, they should be deleted, and we never insert rows with those status (**note**: we use this guardrail in the insert to prevent out of order changes, which is likely not the case in SAP BW). - Because the `insert_predicate` is fully optional, in your scenario you may not require that. - In this scenario, we don't pass an `update_predicate` in the ACON, because both `insert_predicate` and update_predicate are fully optional, i.e., if you don't pass them the algorithm will update any data that matches the `merge_predicate` and insert any data that does not match it. 
The predicates in these cases just make sure the algorithm does not insert or update any data that you don't want, as in the late arriving changes scenario where a deleted row may arrive first from the changelog then the update row, and to prevent your target table to have inconsistent data for a certain period of time (it will eventually get consistent when you receive the latest correct status from the changelog though) you can have this guardrail in the insert or update predicates. Again, for most sources this will not happen but sources like Kafka for example cannot 100% ensure order, for example. - In order to understand how we can cover different scenarios (e.g., late arriving changes, out of order changes, etc.), please go [here](../streaming_delta_with_late_arriving_and_out_of_order_events/streaming_delta_with_late_arriving_and_out_of_order_events.md). - The order of the predicates in the ACON does not matter, is the logic in the lakehouse engine [DeltaMergeWriter's "_merge" function](../../../reference/packages/io/writers/delta_merge_writer.md#packages.io.writers.delta_merge_writer.DeltaMergeWriter.__init__) that matters. - Notice the "<=>" operator? In Spark SQL that's the null safe equal. 
## Backfilling ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_value": "20180110120052t", "greater_or_equal": True } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ] } load_data(acon=acon) ``` ##### Relevant Notes - The backfilling process depicted here is fairly similar to the init load, but it is relevant to highlight by using a static value (that can be modified accordingly to the backfilling needs) in the 
[incremental_filter](../../../reference/packages/transformers/filters.md#packages.transformers.filters.Filters.incremental_filter) function. - Other relevant functions for backfilling may include the [expression_filter](../../../reference/packages/transformers/filters.md#packages.transformers.filters.Filters.expression_filter) function, where you can use a custom SQL filter to filter the input data. ================================================ FILE: lakehouse_engine_usage/data_loader/custom_transformer/__init__.py ================================================ """ .. include::custom_transformer.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/custom_transformer/custom_transformer.md ================================================ # Custom Transformer There may appear a scenario where the data product dev team faces the need to perform complex data transformations that are either not yet available in the lakehouse engine or the logic is just too complex to chain in an ACON file. In the context of the lakehouse, the only layers that usually can impose that complexity are silver+ and gold. This page targets exactly those cases. Below you'll find a notebook where you can pass your own PySpark or Spark SQL logic into the ACON, by dynamically injecting a python function into the ACON dictionary. The lakehouse engine will take care of executing those transformations in the transformation step of the data loader algorithm. Please read the notebook's comments carefully to understand how it works, or simply open it in your notebook environment, which will make the notebook's code and comments more readable. !!! warning "Force Streaming Micro Batch Processing." When you use streaming mode, with a custom transformer, it’s highly advisable that you set the `force_streaming_foreach_batch_processing` flag to `True` in the transform specification, as shown in the ACON example below! 
## What is a custom transformer in the Lakehouse Engine and how can you use it to write your own pyspark logic? We highly promote the Lakehouse Engine for creating Data Products aligned with the data source (bronze/silver layer), pumping data into silver so our Data Scientists and Analysts can leverage the value of the data in silver, as close as it comes from the source. The low-code and configuration-driven nature of the lakehouse engine makes it a compelling framework to use in such cases, where the transformations that are done from bronze to silver are not that many, as we want to keep the data close to the source. However, when it comes to Data Products enriched in some way or for insights (silver+, gold), they are typically heavy on transformations (they are the T of the overall ELT process), so the low-code nature of the lakehouse engine may get in the way of adequately building them. Considering this, and considering our user base that prefers an ACON-based approach and all the nice off-the-shelf features of the lakehouse engine, we have developed a feature that allows us to **pass custom transformers where you put your entire pyspark logic and can pass it as an argument in the ACON** (the configuration file that configures every lakehouse engine algorithm). !!! note "Motivation" Doing that, you let the ACON guide your read, data quality, write and terminate processes, and you just focus on transforming data :) ## Custom transformation Function The function below is the one that encapsulates all your defined pyspark logic and sends it as a python function to the lakehouse engine. This function will then be invoked internally in the lakehouse engine via a df.transform() function. If you are interested in checking the internals of the lakehouse engine, our codebase is openly available here: https://github.com/adidas/lakehouse-engine !!! warning "Attention!!!" 
For this process to work, your function defined below needs to receive a DataFrame and return a DataFrame. Attempting any other method signature (e.g., defining more parameters) will not work, unless you use something like [python partials](https://docs.python.org/3/library/functools.html#functools.partial), for example. ```python def get_new_data(df: DataFrame) -> DataFrame: """Get the new data from the lakehouse engine reader and prepare it.""" return ( df.withColumn("amount", when(col("_change_type") == "delete", lit(0)).otherwise(col("amount"))) .select("article_id", "order_date", "amount") .groupBy("article_id", "order_date") .agg(sum("amount").alias("amount")) ) def get_joined_data(new_data_df: DataFrame, current_data_df: DataFrame) -> DataFrame: """Join the new data with the current data already existing in the target dataset.""" return ( new_data_df.alias("new_data") .join( current_data_df.alias("current_data"), [ new_data_df.article_id == current_data_df.article_id, new_data_df.order_date == current_data_df.order_date, ], "left_outer", ) .withColumn( "current_amount", when(col("current_data.amount").isNull(), lit(0)).otherwise("current_data.amount") ) .withColumn("final_amount", col("current_amount") + col("new_data.amount")) .select(col("new_data.article_id"), col("new_data.order_date"), col("final_amount").alias("amount")) ) def calculate_kpi(df: DataFrame) -> DataFrame: """Calculate KPI through a custom transformer that will be provided in the ACON. Args: df: DataFrame passed as input. Returns: DataFrame: the transformed DataFrame. """ new_data_df = get_new_data(df) # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because is the internal object the # lakehouse engine uses to refer to the spark session. But if you use 'spark' should also be fine. current_data_df = ExecEnv.SESSION.table( "my_database.my_table" ) transformed_df = get_joined_data(new_data_df, current_data_df) return transformed_df ``` ### Don't like pyspark API? 
Write SQL You don't have to comply with the pyspark API if you prefer SQL. Inside the function above (or any of the auxiliary functions you decide to develop) you can write something like: ````python def calculate_kpi(df: DataFrame) -> DataFrame: df.createOrReplaceTempView("new_data") # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because it is the internal object the # lakehouse engine uses to refer to the spark session. But if you use 'spark' should also be fine. ExecEnv.SESSION.sql( """ CREATE OR REPLACE TEMP VIEW my_kpi AS SELECT ... FROM new_data ... """ ) return ExecEnv.SESSION.table("my_kpi") ```` ## Just your regular ACON If you notice the ACON below, everything is the same as you would do in a Data Product, but the `transform_specs` section of the ACON has a difference, which is a function called `"custom_transformation"` where we supply as argument the function defined above with the pyspark code. !!! warning "Attention!!!" Do not pass the function as calculate_kpi(), but as calculate_kpi, otherwise you are telling python to invoke the function right away, as opposed to passing it as an argument to be invoked later by the lakehouse engine. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sales", "read_type": "streaming", "data_format": "delta", "db_table": "my_database.dummy_sales", "options": {"readChangeFeed": "true"}, } ], "transform_specs": [ { "spec_id": "transformed_sales_kpi", "input_id": "sales", # because we are using streaming, this allows us to make sure that # all the computation in our custom transformer gets pushed to # Spark's foreachBatch method in a stream, which allows us to # run all Spark functions in a micro batch DataFrame, as there # are some Spark functions that are not supported in streaming. 
"force_streaming_foreach_batch_processing": True, "transformers": [ { "function": "custom_transformation", "args": {"custom_transformer": calculate_kpi}, }, ], } ], "dq_specs": [ { "spec_id": "my_table_quality", "input_id": "transformed_sales_kpi", "dq_type": "validator", "bucket": "my_dq_bucket", "expectations_store_prefix": "dq/expectations/", "validations_store_prefix": "dq/validations/", "checkpoint_store_prefix": "dq/checkpoints/", "tbl_to_derive_pk": "my_table", "dq_functions": [ {"function": "expect_column_values_to_not_be_null", "args": {"column": "article_id"}}, ], }, ], "output_specs": [ { "spec_id": "sales_kpi", "input_id": "transformed_sales_kpi", "write_type": "merge", "data_format": "delta", "db_table": "my_database.my_table", "options": { "checkpointLocation": "s3://my_data_product_bucket/gold/my_table", }, "merge_opts": { "merge_predicate": "new.article_id = current.article_id AND new.order_date = current.order_date" }, } ], } load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/data_loader/custom_transformer/sql_custom_transformer.md ================================================ # SQL Custom Transformer The SQL Custom Transformer executes a SQL transformation provided by the user. This transformer can be very useful whenever the user wants to perform SQL-based transformations that are not natively supported by the lakehouse engine transformers. The transformer receives the SQL query to be executed. This can read from any table or view from the catalog, or any dataframe registered as a temp view. > To register a dataframe as a temp view you can use the "temp_view" config in the input_specs, as shown below. 
```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": {"mode": "FAILFAST", "header": True, "delimiter": "|"}, "schema_path": "file:///app/tests/lakehouse/in/feature/" "data_loader_custom_transformer/sql_transformation/" "source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/" "data_loader_custom_transformer/sql_transformation/data", "temp_view": "sales_sql", } ], "transform_specs": [ { "spec_id": "calculated_kpi", "input_id": "sales_source", "transformers": [ { "function": "sql_transformation", "args": {"sql": SQL}, } ], } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "calculated_kpi", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/" "data_loader_custom_transformer/sql_transformation/data", } ], } load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/data_loader/custom_transformer_sql/__init__.py ================================================ """ .. include::custom_transformer_sql.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/custom_transformer_sql/custom_transformer_sql.md ================================================ # SQL Custom Transformer The SQL Custom Transformer executes a SQL transformation provided by the user. This transformer can be very useful whenever the user wants to perform SQL-based transformations that are not natively supported by the lakehouse engine transformers. The transformer receives the SQL query to be executed. This can read from any table or view from the catalog, or any dataframe registered as a temp view. > To register a dataframe as a temp view you can use the "temp_view" config in the input_specs, as shown below. 
```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": {"mode": "FAILFAST", "header": True, "delimiter": "|"}, "schema_path": "file:///app/tests/lakehouse/in/feature/" "data_loader_custom_transformer/sql_transformation/" "source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/" "data_loader_custom_transformer/sql_transformation/data", "temp_view": "sales_sql", } ], "transform_specs": [ { "spec_id": "calculated_kpi", "input_id": "sales_source", "transformers": [ { "function": "sql_transformation", "args": {"sql": SQL}, } ], } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "calculated_kpi", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/" "data_loader_custom_transformer/sql_transformation/data", } ], } load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/data_loader/data_loader.md ================================================ # Data Loader ## How to configure a DataLoader algorithm in the lakehouse-engine by using an ACON file? An algorithm (e.g., data load) in the lakehouse-engine is configured using an ACON. The lakehouse-engine is a configuration-driven framework, so people don't have to write code to execute a Spark algorithm. In contrast, the algorithm is written in pyspark and accepts configurations through a JSON file (an ACON - algorithm configuration). The ACON is the configuration providing the behaviour of a lakehouse engine algorithm. [You can check the algorithm code, and how it interprets the ACON here](../../reference/packages/algorithms/algorithm.md). In this page we will go through the structure of an ACON file and what are the most suitable ACON files for common data engineering scenarios. 
Check the pages underneath to find several **ACON examples** that cover many data extraction, transformation and loading scenarios. ## Overview of the Structure of the ACON file for DataLoads An ACON-based algorithm needs several specifications to work properly, but some of them might be optional. The available specifications are: - **Input specifications (input_specs)**: specify how to read data. This is a **mandatory** keyword. - **Transform specifications (transform_specs)**: specify how to transform data. - **Data quality specifications (dq_specs)**: specify how to execute the data quality process. - **Output specifications (output_specs)**: specify how to write data to the target. This is a **mandatory** keyword. - **Terminate specifications (terminate_specs)**: specify what to do after writing into the target (e.g., optimising target table, vacuum, compute stats, expose change data feed to external location, etc.). - **Execution environment (exec_env)**: custom Spark session configurations to be provided for your algorithm (configurations can also be provided from your job/cluster configuration, which we highly advise you to do instead of passing performance related configs here for example). Below is an example of a complete ACON file that reads from an S3 folder with CSVs and incrementally loads that data (using a merge) into a delta lake table. !!! note "What is the **spec_id**?" **spec_id** is one of the main concepts to ensure you can chain the steps of the algorithm, so, for example, you can specify the transformations (in transform_specs) of a DataFrame that was read in the input_specs. Check the ACON below to see how the spec_id of the input_specs is used as input_id in one transform specification. 
```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "orders_bronze", "read_type": "streaming", "data_format": "csv", "schema_path": "s3://my-data-product-bucket/artefacts/metadata/bronze/schemas/orders.json", "with_filepath": True, "options": { "badRecordsPath": "s3://my-data-product-bucket/badrecords/order_events_with_dq/", "header": False, "delimiter": "\u005E", "dateFormat": "yyyyMMdd" }, "location": "s3://my-data-product-bucket/bronze/orders/" } ], "transform_specs": [ { "spec_id": "orders_bronze_with_extraction_date", "input_id": "orders_bronze", "transformers": [ { "function": "with_row_id" }, { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": True, "regex": ".*WE_SO_SCL_(\\d+).csv" } } ] } ], "dq_specs": [ { "spec_id": "check_orders_bronze_with_extraction_date", "input_id": "orders_bronze_with_extraction_date", "dq_type": "validator", "result_sink_db_table": "my_database.my_table_dq_checks", "fail_on_error": False, "dq_functions": [ { "dq_function": "expect_column_values_to_not_be_null", "args": { "column": "omnihub_locale_code" } }, { "dq_function": "expect_column_unique_value_count_to_be_between", "args": { "column": "product_division", "min_value": 10, "max_value": 100 } }, { "dq_function": "expect_column_max_to_be_between", "args": { "column": "so_net_value", "min_value": 10, "max_value": 1000 } }, { "dq_function": "expect_column_value_lengths_to_be_between", "args": { "column": "omnihub_locale_code", "min_value": 1, "max_value": 10 } }, { "dq_function": "expect_column_mean_to_be_between", "args": { "column": "coupon_code", "min_value": 15, "max_value": 20 } } ] } ], "output_specs": [ { "spec_id": "orders_silver", "input_id": "check_orders_bronze_with_extraction_date", "data_format": "delta", "write_type": "merge", "partitions": [ "order_date_header" ], "merge_opts": { "merge_predicate": """ new.sales_order_header = 
current.sales_order_header and new.sales_order_schedule = current.sales_order_schedule and new.sales_order_item=current.sales_order_item and new.epoch_status=current.epoch_status and new.changed_on=current.changed_on and new.extraction_date=current.extraction_date and new.lhe_batch_id=current.lhe_batch_id and new.lhe_row_id=current.lhe_row_id """, "insert_only": True }, "db_table": "my_database.my_table_with_dq", "location": "s3://my-data-product-bucket/silver/order_events_with_dq/", "with_batch_id": True, "options": { "checkpointLocation": "s3://my-data-product-bucket/checkpoints/order_events_with_dq/" } } ], "terminate_specs": [ { "function": "optimize_dataset", "args": { "db_table": "my_database.my_table_with_dq" } } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True } } load_data(acon=acon) ``` ## Input Specifications You specify how to read the data by providing a list of Input Specifications. Usually there's just one element in that list, as, in the lakehouse, you are generally focused on reading data from one layer (e.g., source, bronze, silver, gold) and put it on the next layer. However, there may be scenarios where you would like to combine two datasets (e.g., joins or incremental filtering on one dataset based on the values of another one), therefore you can use one or more elements. [More information about InputSpecs](../../reference/packages/core/definitions.md#packages.core.definitions.InputSpec). ##### Relevant notes - A spec id is fundamental, so you can use the input data later on in any step of the algorithm (transform, write, dq process, terminate). - You don't have to specify `db_table` and `location` at the same time. Depending on the data_format sometimes you read from a table (e.g., jdbc or deltalake table) sometimes you read from a location (e.g., files like deltalake, parquet, json, avro... or kafka topic). 
## Transform Specifications In the lakehouse engine, you transform data by providing a transform specification, which contains a list of transform functions (transformers). So the transform specification acts upon an input, and it can execute multiple lakehouse engine transformation functions (transformers) upon that input. If you look into the example above we ask the lakehouse engine to execute two functions on the `orders_bronze` input data: `with_row_id` and `with_regex_value`. Those functions can of course receive arguments. You can see a list of all available transformation functions (transformers) here `lakehouse_engine.transformers`. Then, you just invoke them in your ACON as demonstrated above, following exactly the same function name and parameter names as described in the code documentation. [More information about TransformSpec](../../reference/packages/core/definitions.md#packages.core.definitions.TransformSpec). ##### Relevant notes - This stage is fully optional, you can omit it from the ACON. - There is one relevant option `force_streaming_foreach_batch_processing` that can be used to force the transform to be executed in the foreachBatch function to ensure non-supported streaming operations can be properly executed. You don't have to worry about this if you are using regular lakehouse engine transformers. But if you are providing your custom logic in pyspark code via our lakehouse engine custom_transformation (`lakehouse_engine.transformers.custom_transformers`) then sometimes your logic may contain Spark functions that are not compatible with Spark Streaming, and therefore this flag can enable all of your computation to be streaming-compatible by pushing down all the logic into the foreachBatch() function. ## Data Quality Specifications One of the most relevant features of the lakehouse engine is that you can have data quality guardrails that prevent you from loading bad data into your target layer (e.g., bronze, silver or gold). 
The lakehouse engine data quality process includes one main feature at the moment: - **Validator**: The capability to perform data quality checks on that data (e.g., is the max value of a column bigger than x?) and even tag your data with the results of the DQ checks. The output of the data quality process can be written into a [**Result Sink**](../data_quality/result_sink/result_sink.md) target (e.g. table or files) and is integrated with a [Data Docs website](../data_quality/data_quality.md#3-data-docs-website), which can be a company-wide available website for people to check the quality of their data and share with others. To achieve all of this functionality the lakehouse engine uses [Great Expectations](https://greatexpectations.io/) internally. To hide the Great Expectations internals from our user base and provide friendlier abstractions using the ACON, we have developed the concept of DQSpec that can contain many DQFunctionSpec objects, which is very similar to the relationship between the TransformSpec and TransformerSpec, which means you can have multiple Great Expectations functions executed inside a single data quality specification (as in the ACON above). !!! note The names of the functions and args are a 1 to 1 match of [Great Expectations API](https://greatexpectations.io/expectations/). [More information about DQSpec](../../reference/packages/core/definitions.md#packages.core.definitions.DQSpec). ##### Relevant notes - You can write the outputs of the DQ process to a sink through the result_sink* parameters of the DQSpec. `result_sink_options` takes any Spark options for a DataFrame writer, which means you can specify the options according to your sink format (e.g., delta, parquet, json, etc.). We usually recommend using `"delta"` as format. - You can use the results of the DQ checks to tag the data that you are validating. When configured, these details will appear as a new column (like any other), as part of the tables of your Data Product. 
- To be able to make an analysis with the data of `result_sink*`, we have available an approach in which you set `result_sink_explode` as true (which is the default) and then you have some columns expanded. Those are: - General columns: Those are columns that have the basic information regarding `dq_specs` and will have always values and does not depend on the expectation types chosen. - Columns: `checkpoint_config`, `run_name`, `run_time`, `run_results`, `success`, `validation_result_identifier`, `spec_id`, `input_id`, `validation_results`, `run_time_year`, `run_time_month`, `run_time_day`. - Statistics columns: Those are columns that have information about the runs of expectations, being those values for the run and not for each expectation. Those columns come from `run_results.validation_result.statistics.*`. - Columns: `evaluated_expectations`, `success_percent`, `successful_expectations`, `unsuccessful_expectations`. - Expectations columns: Those are columns that have information about the expectation executed. - Columns: `expectation_type`, `batch_id`, `expectation_success`, `exception_info`. Those columns are exploded from `run_results.validation_result.results` inside `expectation_config.expectation_type`, `expectation_config.kwargs.batch_id`, `success as expectation_success`, and `exception_info`. Moreover, we also include `unexpected_index_list`, `observed_value` and `kwargs`. - Arguments of Expectations columns: Those are columns that will depend on the expectation_type selected. Those columns are exploded from `run_results.validation_result.results` inside `expectation_config.kwargs.*`. - We can have for example: `column`, `column_A`, `column_B`, `max_value`, `min_value`, `value`, `value_pairs_set`, `value_set`, and others. - More columns desired? Those can be added, using `result_sink_extra_columns` in which you can select columns like `` and/or explode columns like `.*`. - Use the parameter `"source"` to identify the data used for an easier analysis. 
- By default, Great Expectations will also provide a site presenting the history of the DQ validations that you have performed on your data. - You can make an analysis of all your expectations and create a dashboard aggregating all that information. - This stage is fully optional, you can omit it from the ACON. ## Output Specifications The output_specs section of an ACON is relatively similar to the input_specs section, but of course focusing on how to write the results of the algorithm, instead of specifying the input for the algorithm, hence the name output_specs (output specifications). [More information about OutputSpec](../../reference/packages/core/definitions.md#packages.core.definitions.OutputSpec). ##### Relevant notes - Respect the supported write types and output formats. - One of the most relevant options to specify in the options parameter is the `checkpointLocation` when in streaming read mode, because that location will be responsible for storing which data you already read and transformed from the source, **when the source is a Spark Streaming compatible source (e.g., Kafka or S3 files)**. ## Terminate Specifications The terminate_specs section of the ACON is responsible for some "wrapping up" activities like optimising a table, vacuuming old files in a delta table, etc. With time the list of available terminators will likely increase (e.g., reconciliation processes), but for now we have the [following terminators](../../reference/packages/terminators/index.md). This stage is fully optional, you can omit it from the ACON. 
The most relevant now in the context of the lakehouse initiative are the following: - [dataset_optimizer](../../reference/packages/terminators/dataset_optimizer.md) - [cdf_processor](../../reference/packages/terminators/cdf_processor.md) - [sensor_terminator](../../reference/packages/terminators/sensor_terminator.md) - [notifier_terminator](../../reference/packages/terminators/notifiers/email_notifier.md) [More information about TerminatorSpec](../../reference/packages/core/definitions.md#packages.core.definitions.TerminatorSpec). ## Execution Environment In the exec_env section of the ACON you can pass any Spark Session configuration that you want to define for the execution of your algorithm. This is basically just a JSON structure that takes in any Spark Session property, so no custom lakehouse engine logic. This stage is fully optional, you can omit it from the ACON. !!! note Please be aware that Spark Session configurations that are not allowed to be changed when the Spark cluster is already running need to be passed in the configuration of the job/cluster that runs this algorithm, not here in this section. This section only accepts Spark Session configs that can be changed in runtime. Whenever you introduce an option make sure that it takes effect during runtime, as to the best of our knowledge there's no list of allowed Spark properties to be changed after the cluster is already running. Moreover, typically Spark algorithms fail if you try to modify a config that can only be set up before the cluster is running. ================================================ FILE: lakehouse_engine_usage/data_loader/extract_from_sap_b4_adso/__init__.py ================================================ """ .. 
include::extract_from_sap_b4_adso.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/extract_from_sap_b4_adso/extract_from_sap_b4_adso.md ================================================ # Extract from SAP B4 ADSOs A custom sap_b4 reader and a few utils are offered in the lakehouse-engine framework so that consumption of data from SAP B4 DSOs can be easily created. The framework abstracts all the logic behind the init/delta extractions (AQ vs CL, active table, changelog table, requests status table, how to identify the next delta timestamp...), only requiring a few parameters that are explained and exemplified in the [template](#extraction-from-sap-b4-adsos-template) scenarios that we have created. !!! note This custom reader is very similar and uses most features from the sap_bw reader, so if you were using specific filters/parameters with the sap_bw reader, there is a high chance you can keep using it in a very similar way with the sap_b4 reader. The main concepts are applied to both readers, as the strategies on how to parallelize the extractions, for example. How can I find a good candidate column for [partitioning the extraction from S4Hana?](../extract_from_sap_bw_dso/extract_from_sap_bw_dso.md#how-can-we-decide-the-partitionColumn) !!! danger "**Parallelization Limitations**" There are no limits imposed by the Lakehouse-Engine framework, but you need to consider that there might be differences imposed by the source. E.g. Each User might be restricted on utilisation of about 100GB memory at a time from the source. Parallel extractions ***can bring a jdbc source down*** if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections. !!! 
danger **In case you want to perform further filtering in the REQTSN field, please be aware that it is not being pushed down to SAP B4 by default (meaning it will have bad performance).** In that case, you will need to use customSchema option while reading, so that you are able to enable filter push down for those. You can check the code documentation of the reader below: [**SAP B4 Reader**](../../../reference/packages/io/readers/sap_b4_reader.md) [**JDBC Extractions arguments**](../../../reference/packages/utils/extraction/jdbc_extraction_utils.md#packages.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__) [**SAP B4 Extractions arguments**](../../../reference/packages/utils/extraction/sap_b4_extraction_utils.md#packages.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction.__init__) !!! note For extractions using the SAP B4 reader, you can use the arguments listed in the SAP B4 arguments, but also the ones listed in the JDBC extractions, as those are inherited as well. ## Extraction from SAP B4 ADSOs Template This template covers the following scenarios of extractions from the SAP B4Hana ADSOs: - 1 - The Simplest Scenario (Not parallel - Not Recommended) - 2 - Parallel extraction - 2.1 - Simplest Scenario - 2.2 - Provide upperBound (Recommended) - 2.3 - Automatic upperBound (Recommended) - 2.4 - Provide predicates (Recommended) - 2.5 - Generate predicates (Recommended) !!! note The template will cover two ADSO Types: - **AQ**: ADSO which is of append type and for which a single ADSO/tables holds all the information, like an event table. For this type, the same ADSO is used for reading data both for the inits and deltas. Usually, these ADSOs end with the digit "6". - **CL**: ADSO which is split into two ADSOs, one holding the change log events, the other having the active data (current version of the truth for a particular source). 
For this type, the ADSO having the active data is used for the first extraction (init) and the change log ADSO is used for the subsequent extractions (deltas). Usually, these ADSOs are split into active table ending with the digit "2" and changelog table ending with digit "3". For each of these ADSO types, the lakehouse-engine abstracts the logic to get the delta extractions. This logic basically consists of joining the `db_table` (for `AQ`) or the `changelog_table` (for `CL`) with the table having the requests status (`my_database.requests_status_table`). One of the fields used for this joining is the `data_target`, which has a relationship with the ADSO (`db_table`/`changelog_table`), being basically the same identifier without considering parts of it. Based on the previous insights, the queries that the lakehouse-engine generates under the hood translate to (this is a simplified version, for more details please refer to the lakehouse-engine code documentation): **AQ Init Extraction:** `SELECT t.*, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp FROM my_database.my_table t` **AQ Delta Extraction:** `SELECT tbl.*, CAST({self._B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp FROM my_database.my_table AS tbl JOIN my_database.requests_status_table AS req WHERE STORAGE = 'AQ' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U') AND REQUEST_STATUS IN ('GG', 'GR') AND UPPER(DATATARGET) = UPPER('my_identifier') AND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table` **CL Init Extraction:** `SELECT t.*, {self._SAP_B4_EXTRACTION.extraction_timestamp}000000000 AS reqtsn, '0' AS datapakid, 0 AS record, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp FROM my_database.my_table_2 t` **CL Delta Extraction:** `SELECT tbl.*, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} 
AS DECIMAL(15,0)) AS extraction_start_timestamp FROM my_database.my_table_3 AS tbl JOIN my_database.requests_status_table AS req WHERE STORAGE = 'AT' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U') AND REQUEST_STATUS IN ('GG') AND UPPER(DATATARGET) = UPPER('my_data_target') AND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table` !!! note "Introductory Notes" If you want to have a better understanding about JDBC Spark optimizations, here you have a few useful links: - https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html - https://docs.databricks.com/en/connect/external-systems/jdbc.html - https://bit.ly/3x2eCEm - https://newbedev.com/how-to-optimize-partitioning-when-migrating-data-from-jdbc-source ### 1 - The Simplest Scenario (Not parallel - Not Recommended) This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source. It should only be used in case the ADSO you want to extract from SAP B4Hana is a small one, with no big requirements in terms of performance to fulfill. When extracting from the source ADSO, there are two options: - **Delta Init** - full extraction of the source ADSO. You should use it the first time you extract from the ADSO or any time you want to re-extract completely. Similar to a so-called full load. - **Delta** - extracts the portion of the data that is new or has changed in the source, since the last extraction (using the `max_timestamp` value in the location of the data already extracted `latest_timestamp_data_location`). The example below is composed of two cells. - The first cell is only responsible to define the variables `extraction_type` and `write_type`, depending on the extraction type: **Delta Init** (`load_type = "init"`) or a **Delta** (`load_type = "delta"`). 
The variables in this cell will also be referenced by other acons/examples in this notebook, similar to what you would do in your pipelines/jobs, defining this centrally and then re-using it. - The second cell is where the acon to be used is defined (which uses the two variables `extraction_type` and `write_type` defined) and the `load_data` algorithm is executed to perform the extraction. !!! note There may be cases where you might want to always extract fully from the source ADSO. In these cases, you only need to use a Delta Init every time, meaning you would use `"extraction_type": "init"` and `"write_type": "overwrite"` as it is shown below. The explanation about what it is a Delta Init/Delta is applicable for all the scenarios presented in this notebook. ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_b4", "options": { "url": "my_sap_b4_url", "user": "my_user", "password": "my_b4_hana_pwd", "dbtable": "my_database.my_table", "extraction_type": extraction_type, "latest_timestamp_data_location": "s3://my_path/my_identifier/", "adso_type": "AQ", }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ### 2 - Parallel extraction In this section, 5 possible scenarios for parallel extractions from SAP B4Hana ADSOs are presented. 
#### 2.1 - Parallel Extraction, Simplest Scenario This scenario provides the simplest example you can have for a parallel extraction from SAP B4Hana, only using the property `numPartitions`. The goal of the scenario is to cover the case in which people do not have much knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can be used to split the extraction in several tasks. This scenario can also be used if the use case does not have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of the extraction to its maximum potential. On the example below, `"numPartitions": 10` is specified, meaning that Spark will open 10 parallel connections to the source ADSO and automatically decide how to parallelize the extraction upon that requirement. This is the only change compared to the example provided in the scenario 1. ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_b4", "options": { "url": "my_sap_b4_url", "user": "my_user", "password": "my_sap_b4_pwd", "dbtable": "my_database.my_table", "extraction_type": extraction_type, "latest_timestamp_data_location": "s3://my_path/my_identifier_par_simple/", "adso_type": "AQ", "numPartitions": 10, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/my_identifier_par_simple/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.2 - Parallel Extraction, Provide 
upper_bound (Recommended) This scenario performs the extraction from the SAP B4 ADSO in parallel, but is more concerned with trying to optimize and have more control (compared to 2.1 example) on how the extraction is split and performed, using the following options: - `numPartitions` - number of Spark partitions to split the extraction. - `partitionColumn` - column used to split the extraction. It must be a numeric, date, or timestamp. It should be a column that is able to split the extraction evenly in several tasks. An auto-increment column is usually a very good candidate. - `lowerBound` - lower bound to decide the partition stride. - `upperBound` - upper bound to decide the partition stride. This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the `partitionColumn`. If you compare with the previous example, you'll notice that now `numPartitions` and three additional options are provided to fine tune the extraction (`partitionColumn`, `lowerBound`, `upperBound`). When these 4 properties are used, Spark will use them to build several queries to split the extraction. **Example:** for `"numPartitions": 10`, `"partitionColumn": "record"`, `"lowerBound": 1`, `"upperBound": 100`, Spark will generate 10 queries like this: - `SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL` - `SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20` - `SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30` - ... 
- `SELECT * FROM dummy_table WHERE RECORD >= 90` ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_b4", "options": { "url": "my_sap_b4_url", "user": "my_user", "password": "my_b4_hana_pwd", "dbtable": "my_database.my_table", "extraction_type": extraction_type, "latest_timestamp_data_location": "s3://my_path/my_identifier_par_prov_upper/", "adso_type": "AQ", "partitionColumn": "RECORD", "numPartitions": 10, "lowerBound": 1, "upperBound": 1000000, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/my_identifier_par_prov_upper/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.3 - Parallel Extraction, Automatic upper_bound (Recommended) This scenario is very similar to 2.2, the only difference being that **`upperBound` is not provided**. Instead, the property `calculate_upper_bound` equals to true is used to benefit from the automatic calculation of the `upperBound` (derived from the `partitionColumn`) offered by the lakehouse-engine framework, which is useful, as in most of the cases you will probably not be aware of the max value for the column. The only thing you need to consider is that if you use this automatic calculation of the upperBound you will be doing an initial query to the SAP B4 ADSO to retrieve the max value for the `partitionColumn`, before doing the actual query to perform the extraction. 
```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_b4", "calculate_upper_bound": True, "options": { "url": "my_sap_b4_url", "user": "my_user", "password": "my_b4_hana_pwd", "dbtable": "my_database.my_table", "extraction_type": extraction_type, "latest_timestamp_data_location": "s3://my_path/my_identifier_par_calc_upper/", "adso_type": "AQ", "partitionColumn": "RECORD", "numPartitions": 10, "lowerBound": 1, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/my_identifier_par_calc_upper/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.4 - Parallel Extraction, Provide Predicates (Recommended) This scenario performs the extraction from SAP B4 ADSO in parallel, useful in contexts in which there is no numeric, date or timestamp column to parallelize the extraction (e.g. when extracting from ADSO of Type `CL`, the active table does not have the `RECORD` column, which is usually a good option for scenarios 2.2 and 2.3): - `partitionColumn` - column used to split the extraction. It can be of any type. This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the `partitionColumn`, specially if these columns are not complying with the scenario 2.2 or 2.3. 
**When this property is used all predicates need to be provided to Spark, otherwise it will leave data behind.** Below the lakehouse function to generate predicate list automatically is presented. This function needs to be used carefully, specially on predicates_query and predicates_add_null variables. **predicates_query:** At the sample below the whole table is being considered (`select distinct(x) from table`), but it is possible to filter predicates list here, specially if you are applying filter on transformations spec, and you know entire table won't be necessary, so you can change it to something like this: `select distinct(x) from table where x > y`. **predicates_add_null:** You can decide if you want to consider null on predicates list or not, by default this property is `True`. **Example:** for `"partition_column": "CALMONTH"` ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" # import the lakehouse_engine ExecEnv class, so that you can use the functions it offers # import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.extraction.jdbc_extraction_utils import ( JDBCExtraction, JDBCExtractionUtils, ) ExecEnv.get_or_create() partition_column = "CALMONTH" dbtable = "my_database.my_table_3" predicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})""" user = "my_user" password = "my_b4_hana_pwd" url = "my_sap_b4_url" predicates_add_null = True jdbc_util = JDBCExtractionUtils( JDBCExtraction( user=user, password=password, url=url, predicates_add_null=predicates_add_null, partition_column=partition_column, dbtable=dbtable, ) ) predicates = jdbc_util.get_predicates(predicates_query) acon = { "input_specs": [ { "spec_id": "my_identifier_2_source", 
"read_type": "batch", "data_format": "sap_b4", "options": { "url": "my_sap_b4_url", "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", "dbtable": "my_database.my_table_2", "changelog_table": "my_database.my_table_3", "extraction_type": extraction_type, "latest_timestamp_data_location": "s3://my_path/my_identifier_2_prov_predicates/", "adso_type": "CL", "predicates": predicates, }, } ], "output_specs": [ { "spec_id": "my_identifier_2_bronze", "input_id": "my_identifier_2_source", "write_type": write_type, "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/my_identifier_2_prov_predicates/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.5 - Parallel Extraction, Generate Predicates This scenario is very similar to the scenario 2.4, with the only difference that it automatically generates the predicates (`"generate_predicates": True`). This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the `partitionColumn`, specially if these columns are not complying with the scenarios 2.2 and 2.3 (otherwise those would probably be recommended). When this property is used, the lakehouse engine will generate the predicates to be used to extract data from the source. What the lakehouse engine does is to check for the init/delta portion of the data, what are the distinct values of the `partitionColumn` serving that data. Then, these values will be used by Spark to generate several queries to extract from the source in a parallel fashion. Each distinct value of the `partitionColumn` will be a query, meaning that you will not have control over the number of partitions used for the extraction. 
For example, if you face a scenario in which you are using a `partitionColumn` `LOAD_DATE` and for today's delta, all the data (let's suppose 2 million rows) is served by a single `LOAD_DATE = 20200101`, that would mean Spark would use a single partition to extract everything. In this extreme case you would probably need to change your `partitionColumn`. **Note:** these extreme cases are harder to happen when you use the strategy of the scenarios 2.2/2.3. **Example:** for `"partitionColumn": "record"` Generate predicates: - `SELECT DISTINCT(RECORD) as RECORD FROM dummy_table` - `1` - `2` - `3` - ... - `100` - Predicates List: ['RECORD=1','RECORD=2','RECORD=3',...,'RECORD=100'] Spark will generate 100 queries like this: - `SELECT * FROM dummy_table WHERE RECORD = 1` - `SELECT * FROM dummy_table WHERE RECORD = 2` - `SELECT * FROM dummy_table WHERE RECORD = 3` - ... - `SELECT * FROM dummy_table WHERE RECORD = 100` Generate predicates will also consider null by default: - `SELECT * FROM dummy_table WHERE RECORD IS NULL` To disable this behaviour the following variable value should be changed to false: `"predicates_add_null": False` ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_2_source", "read_type": "batch", "data_format": "sap_b4", "generate_predicates": True, "options": { "url": "my_sap_b4_url", "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", "dbtable": "my_database.my_table_2", "changelog_table": "my_database.my_table_3", "extraction_type": extraction_type, "latest_timestamp_data_location": "s3://my_path/my_identifier_2_gen_predicates/", "adso_type": "CL", "partitionColumn": "CALMONTH", }, } ], "output_specs": [ { "spec_id": "my_identifier_2_bronze", "input_id": "my_identifier_2_source", "write_type": 
write_type, "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/my_identifier_2_gen_predicates/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/data_loader/extract_from_sap_bw_dso/__init__.py ================================================ """ .. include::extract_from_sap_bw_dso.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/extract_from_sap_bw_dso/extract_from_sap_bw_dso.md ================================================ # Extract from SAP BW DSOs !!! danger "**Parallelization Limitations**" Parallel extractions **can bring a jdbc source down** if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections. A custom sap_bw reader and a few utils are offered in the lakehouse-engine framework so that consumption of data from SAP BW DSOs can be easily created. The framework abstracts all the logic behind the init/delta extractions (active table, changelog table, activation requests table, how to identify the next delta timestamp...), only requiring a few parameters that are explained and exemplified in the [template](#extraction-from-sap-bw-template) scenarios that we have created. This page also provides you a section to help you figure out a good candidate for [partitioning the extraction from SAP BW](#how-can-we-decide-the-partitionColumn). 
You can check the code documentation of the reader below: [**SAP BW Reader**](../../../reference/packages/io/readers/sap_bw_reader.md) [**JDBC Extractions arguments**](../../../reference/packages/utils/extraction/jdbc_extraction_utils.md#packages.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__) [**SAP BW Extractions arguments**](../../../reference/packages/utils/extraction/sap_bw_extraction_utils.md#packages.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction.__init__) !!! note For extractions using the SAP BW reader, you can use the arguments listed in the SAP BW arguments, but also the ones listed in the JDBC extractions, as those are inherited as well. ## Extraction from SAP-BW template This template covers the following scenarios of extractions from the SAP BW DSOs: - 1 - The Simplest Scenario (Not parallel - Not Recommended) - 2 - Parallel extraction - 2.1 - Simplest Scenario - 2.2 - Provide upperBound (Recommended) - 2.3 - Automatic upperBound (Recommended) - 2.4 - Backfilling - 2.5 - Provide predicates (Recommended) - 2.6 - Generate predicates (Recommended) - 3 - Extraction from Write Optimized DSO - 3.1 - Get initial actrequest_timestamp from Activation Requests Table !!! note "Introductory Notes" If you want to have a better understanding about JDBC Spark optimizations, here you have a few useful links: - https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html - https://docs.databricks.com/en/connect/external-systems/jdbc.html - https://bit.ly/3x2eCEm - https://newbedev.com/how-to-optimize-partitioning-when-migrating-data-from-jdbc-source ### 1 - The Simplest Scenario (Not parallel - Not Recommended) This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source. It should only be used in case the DSO you want to extract from SAP BW is a small one, with no big requirements in terms of performance to fulfill. 
When extracting from the source DSO, there are two options: - **Delta Init** - full extraction of the source DSO. You should use it in the first time you extract from the DSO or any time you want to re-extract completely. Similar to a so-called full load. - **Delta** - extracts the portion of the data that is new or has changed in the source, since the last extraction (using the max `actrequest_timestamp` value in the location of the data already extracted, by default). Below example is composed of two cells. - The first cell is only responsible to define the variables `extraction_type` and `write_type`, depending on the extraction type **Delta Init** (`LOAD_TYPE = INIT`) or a **Delta** (`LOAD_TYPE = DELTA`). The variables in this cell will also be referenced by other acons/examples in this notebook, similar to what you would do in your pipelines/jobs, defining this centrally and then re-using it. - The second cell is where the acon to be used is defined (which uses the two variables `extraction_type` and `write_type` defined) and the `load_data` algorithm is executed to perform the extraction. !!! note There may be cases where you might want to always extract fully from the source DSO. In these cases, you only need to use a Delta Init every time, meaning you would use `"extraction_type": "init"` and `"write_type": "overwrite"` as it is shown below. The explanation about what it is a Delta Init/Delta is applicable for all the scenarios presented in this notebook. 
```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", # You should use this custom reader to benefit from the lakehouse-engine utils for extractions from SAP BW "data_format": "sap_bw", "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "odsobject": "my_ods_object", "changelog_table": "my_database.my_changelog_table", "latest_timestamp_data_location": "s3://my_path/my_identifier/", "extraction_type": extraction_type, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ### 2 - Parallel extraction In this section, 6 possible scenarios for parallel extractions from SAP BW DSOs are presented. #### 2.1 - Parallel Extraction, Simplest Scenario This scenario provides the simplest example you can have for a parallel extraction from SAP BW, only using the property `numPartitions`. The goal of the scenario is to cover the case in which people do not have much knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can be used to split the extraction in several tasks. This scenario can also be used if the use case does not have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of the extraction to its maximum potential. 
On the example below, `"numPartitions": 10` is specified, meaning that Spark will open 10 parallel connections to the source DSO and automatically decide how to parallelize the extraction upon that requirement. This is the only change compared to the example provided in the example 1. ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_bw", "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "odsobject": "my_ods_object", "changelog_table": "my_database.my_changelog_table", "latest_timestamp_data_location": "s3://my_path/my_identifier/", "extraction_type": extraction_type, "numPartitions": 10, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.2 - Parallel Extraction, Provide upper_bound (Recommended) This scenario performs the extraction from the SAP BW DSO in parallel, but is more concerned with trying to optimize and have more control (compared to 2.1 example) on how the extraction is split and performed, using the following options: - `numPartitions` - number of Spark partitions to split the extraction. - `partitionColumn` - column used to split the extraction. It must be a numeric, date, or timestamp. It should be a column that is able to split the extraction evenly in several tasks. An auto-increment column is usually a very good candidate. 
- `lowerBound` - lower bound to decide the partition stride. - `upperBound` - upper bound to decide the partition stride. It can either be **provided (as it is done in this example)** or derived automatically by our upperBound optimizer (example 2.3). This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the `partitionColumn`. If you compare with the previous example, you'll notice that now `numPartitions` and three additional options are provided to fine tune the extraction (`partitionColumn`, `lowerBound`, `upperBound`). When these 4 properties are used, Spark will use them to build several queries to split the extraction. **Example:** for `"numPartitions": 10`, `"partitionColumn": "record"`, `"lowerBound": 1`, `"upperBound": 100`, Spark will generate 10 queries like this: - `SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL` - `SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20` - `SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30` - ... 
- `SELECT * FROM dummy_table WHERE RECORD >= 90` ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_bw", "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "odsobject": "my_ods_object", "changelog_table": "my_database.my_changelog_table", "latest_timestamp_data_location": "s3://my_path/my_identifier/", "extraction_type": extraction_type, "numPartitions": 3, "partitionColumn": "my_partition_col", "lowerBound": 1, "upperBound": 42, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.3 - Parallel Extraction, Automatic upper_bound (Recommended) This scenario is very similar to 2.2, the only difference being that **upper_bound is not provided**. Instead, the property `calculate_upper_bound` equals to true is used to benefit from the automatic calculation of the upperBound (derived from the `partitionColumn`) offered by the lakehouse-engine framework, which is useful, as in most of the cases you will probably not be aware of the max value for the column. The only thing you need to consider is that if you use this automatic calculation of the upperBound you will be doing an initial query to the SAP BW DSO to retrieve the max value for the `partitionColumn`, before doing the actual query to perform the extraction. 
```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_bw", "calculate_upper_bound": True, "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "odsobject": "my_ods_object", "changelog_table": "my_database.my_changelog_table", "latest_timestamp_data_location": "s3://my_path/my_identifier/", "extraction_type": extraction_type, "numPartitions": 10, "partitionColumn": "my_partition_col", "lowerBound": 1, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.4 - Parallel Extraction, Backfilling This scenario covers the case, in which you might want to backfill the data extracted from a SAP BW DSO and made available in the bronze layer. By default, the delta extraction considers the max value of the column `actrequest_timestamp` on the data already extracted. However, there might be cases, in which you might want to extract a delta from a particular timestamp onwards or for a particular interval of time. For this, you can use the properties `min_timestamp` and `max_timestamp`. 
Below, a very similar example to the previous one is provided, the only differences being that the properties `"min_timestamp": "20210910000000"` and `"max_timestamp": "20210913235959"` are provided, meaning it will extract the data from the changelog table, using a filter `"20210910000000" < actrequest_timestamp <= "20210913235959"`, ignoring if some of the data is already available in the destination or not. Moreover, note that the property `latest_timestamp_data_location` does not need to be provided, as the timestamps to be considered are being directly provided (if both the timestamps and the `latest_timestamp_data_location` are provided, the last parameter will have no effect). Additionally, `"extraction_type": "delta"` and `"write_type": "append"` are forced, instead of using the variables as in the other examples, because the backfilling scenario only makes sense for delta extractions. !!! note Note: be aware that the backfilling example being shown has no mechanism to enforce that you don't generate duplicated data in bronze. For your scenarios, you can either use this example and solve any duplication in the silver layer or extract the delta with a merge strategy while writing to bronze, instead of appending. 
```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_bw", "calculate_upper_bound": True, "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "odsobject": "my_ods_object", "changelog_table": "my_database.my_changelog_table", "extraction_type": "delta", "numPartitions": 10, "partitionColumn": "my_partition_col", "lowerBound": 1, "min_timestamp": "20210910000000", "max_timestamp": "20210913235959", }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": "append", "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.5 - Parallel Extraction, Provide Predicates (Recommended) This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no numeric, date or timestamp column to parallelize the extraction: - `partitionColumn` - column used to split the extraction. It can be of any type. This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the `partitionColumn`, specially if these columns are not complying with the scenarios 2.2 and 2.3 (otherwise those would probably be recommended). **When this property is used all predicates need to be provided to Spark, otherwise it will leave data behind.** Below the lakehouse function to generate predicate list automatically is presented. 
This function needs to be used carefully, specially on predicates_query and predicates_add_null variables. **predicates_query:** At the sample below the whole table is being considered (`select distinct(x) from table`), but it is possible to filter predicates list here, specially if you are applying filter on transformations spec, and you know entire table won't be necessary, so you can change it to something like this: `select distinct(x) from table where x > y`. **predicates_add_null:** You can decide if you want to consider null on predicates list or not, by default this property is True. ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" # import the lakehouse_engine ExecEnv class, so that you can use the functions it offers # import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.extraction.jdbc_extraction_utils import ( JDBCExtraction, JDBCExtractionUtils, ) ExecEnv.get_or_create() partition_column = "my_partition_column" dbtable = "my_database.my_table" predicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})""" column_for_predicates = partition_column user = "my_user" password = "my_hana_pwd" url = "my_bw_url" predicates_add_null = True jdbc_util = JDBCExtractionUtils( JDBCExtraction( user=user, password=password, url=url, dbtable=dbtable, partition_column=partition_column, ) ) predicates = jdbc_util.get_predicates(predicates_query) acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_bw", "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "odsobject": "my_ods_object", "latest_timestamp_data_location": "s3://my_path/my_identifier/", 
"extraction_type": extraction_type, "predicates": predicates, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.6 - Parallel Extraction, Generate Predicates (Recommended) This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no numeric, date or timestamp column to parallelize the extraction: - `partitionColumn` - column used to split the extraction. It can be of any type. This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the `partitionColumn`, specially if these columns are not complying with the scenarios 2.2 and 2.3 (otherwise those would probably be recommended). When this property is used, the lakehouse engine will generate the predicates to be used to extract data from the source. What the lakehouse engine does is to check for the init/delta portion of the data, what are the distinct values of the `partitionColumn` serving that data. Then, these values will be used by Spark to generate several queries to extract from the source in a parallel fashion. Each distinct value of the `partitionColumn` will be a query, meaning that you will not have control over the number of partitions used for the extraction. For example, if you face a scenario in which you are using a `partitionColumn` `LOAD_DATE` and for today's delta, all the data (let's suppose 2 million rows) is served by a single `LOAD_DATE = 20200101`, that would mean Spark would use a single partition to extract everything. In this extreme case you would probably need to change your `partitionColumn`. 
**Note:** these extreme cases are harder to happen when you use the strategy of the scenarios 2.2/2.3. **Example:** for `"partitionColumn": "record"` Generate predicates: - `SELECT DISTINCT(RECORD) as RECORD FROM dummy_table` - `1` - `2` - `3` - ... - `100` - Predicates List: ['RECORD=1','RECORD=2','RECORD=3',...,'RECORD=100'] Spark will generate 100 queries like this: - `SELECT * FROM dummy_table WHERE RECORD = 1` - `SELECT * FROM dummy_table WHERE RECORD = 2` - `SELECT * FROM dummy_table WHERE RECORD = 3` - ... - `SELECT * FROM dummy_table WHERE RECORD = 100` Generate predicates will also consider null by default: - `SELECT * FROM dummy_table WHERE RECORD IS NULL` To disable this behaviour the following variable value should be changed to false: `"predicates_add_null": False` ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_bw", "generate_predicates": True, "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "odsobject": "my_ods_object", "latest_timestamp_data_location": "s3://my_path/my_identifier/", "extraction_type": extraction_type, "partitionColumn": "my_partition_col", }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ### 3 - Extraction from Write Optimized DSOs This scenario is based on the best practices of the scenario 2.2, but it is 
ready to extract data from Write Optimized DSOs, which have the changelog embedded in the active table, instead of having a separate changelog table. Due to this reason, you need to specify that the `changelog_table` parameter value is equal to the `dbtable` parameter value. Moreover, these tables usually already include the changelog technical columns like `RECORD` and `DATAPAKID`, for example, that the framework adds by default. Thus, you need to specify `"include_changelog_tech_cols": False` to change this behaviour. Finally, you also need to specify the name of the column in the table that can be used to join with the activation requests table to get the timestamp of the several requests/deltas, which is `"actrequest"` by default (`"request_col_name": 'request'`). ```python from lakehouse_engine.engine import load_data LOAD_TYPE = "INIT" or "DELTA" if LOAD_TYPE == "INIT": extraction_type = "init" write_type = "overwrite" else: extraction_type = "delta" write_type = "append" acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_bw", "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "changelog_table": "my_database.my_table", "odsobject": "my_ods_object", "request_col_name": "request", "include_changelog_tech_cols": False, "latest_timestamp_data_location": "s3://my_path/my_identifier/", "extraction_type": extraction_type, "numPartitions": 2, "partitionColumn": "RECORD", "lowerBound": 1, "upperBound": 50000, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": write_type, "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 
3.1 - Extraction from Write Optimized DSOs, Get ACTREQUEST_TIMESTAMP from Activation Requests Table By default, the act_request_timestamp has been hardcoded (either assumes a given extraction_timestamp or the current timestamp) in the init extraction, however this may cause problems when merging changes in silver, for write optimised DSOs. So, a new possibility to choose when to retrieve this timestamp from the act_req_table was added. This scenario performs the data extraction from Write Optimized DSOs, forcing the actrequest_timestamp to assume the value from the activation requests table (timestamp column). This feature is only available for WODSOs and to use it you need to specify `"get_timestamp_from_act_request": True`. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "sap_bw", "options": { "user": "my_user", "password": "my_hana_pwd", "url": "my_sap_bw_url", "dbtable": "my_database.my_table", "changelog_table": "my_database.my_table", "odsobject": "my_ods_object", "request_col_name": "request", "include_changelog_tech_cols": False, "latest_timestamp_data_location": "s3://my_path/my_identifier_ACTREQUEST_TIMESTAMP/", "extraction_type": "init", "numPartitions": 2, "partitionColumn": "RECORD", "lowerBound": 1, "upperBound": 50000, "get_timestamp_from_act_request": True, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": "overwrite", "data_format": "delta", "partitions": ["actrequest_timestamp"], "location": "s3://my_path/my_identifier_ACTREQUEST_TIMESTAMP", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ## How can we decide the partitionColumn?
**Compatible partitionColumn for upperBound/lowerBound Spark options:** It needs to be **int, date, timestamp** → https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html **If you don't have any column to partition on those formats, you can use predicates to partition the table** → https://docs.databricks.com/en/connect/external-systems/jdbc.html#manage-parallelism One of the most important parameters to optimise the extraction is the **partitionColumn**, as you can see in the template. Thus, this section helps you figure out if a column is a good candidate or not. Basically the partition column needs to be a column which is able to adequately split the processing, which means we can use it to "create" different queries with intervals/filters, so that the Spark tasks process similar amounts of rows/volume. Usually a good candidate is an integer auto-increment technical column. !!! note Although RECORD is usually a good candidate, it is usually available on the changelog table only, meaning that you would need to use a different strategy for the init. In case you don't have good candidates for partitionColumn, you can use the sample acon provided in the **scenario 2.1** in the template above. It might make sense to use **scenario 2.1** for the init and then **scenario 2.2 or 2.3** for the subsequent deltas. **When there is no good int, date or timestamp candidate for partitionColumn:** In this case you can opt for the **scenario 2.6 - Generate Predicates**, which supports any kind of column to be defined as **partitionColumn**. However, you should still analyse if the column you are thinking about is a good candidate or not. In this scenario, Spark will create one query per distinct value of the **partitionColumn**, so you can perform some analysis. ================================================ FILE: lakehouse_engine_usage/data_loader/extract_from_sftp/__init__.py ================================================ """ ..
include::extract_from_sftp.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/extract_from_sftp/extract_from_sftp.md ================================================ # Extract from SFTP Secure File Transfer Protocol (SFTP) is a file protocol for transferring files over the web. This feature is available in the Lakehouse Engine with the purpose of having a mechanism to read data directly from SFTP directories without moving those files manually/physically to an S3 bucket. The engine uses Pandas to read the files and converts them into a Spark dataframe, which makes the available resources of an Acon usable, such as `dq_specs`, `output_specs`, `terminator_specs` and `transform_specs`. Furthermore, this feature provides several filters on the directories that make it easier to control the extractions. #### **Introductory Notes**: There are important parameters that must be added to **input specs** in order to make the SFTP extraction work properly: !!! note "**Read type**" The engine supports only **BATCH** mode for this feature. **sftp_files_format** - File format that will be used to read data from SFTP. **The engine supports: CSV, FWF, JSON and XML**. **location** - The SFTP directory to be extracted. If it is necessary to filter a specific file, this can be done using the `file_name_contains` option. **options** - Arguments used to set the Paramiko SSH client connection (hostname, username, password, port...), set the filter to retrieve files and set the file parameters (separators, headers, cols...). For more information about the file parameters, please go to the Pandas link in the useful links section.
The options allowed are: | Property type | Detail | Example | Comment | |-------------------------------|--------------------------|------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Connection | add_auto_policy (str) | true or false | Indicates whether to allow an SFTP connection using no host key. When a connection attempt is being made using no host key, then the engine will throw an exception if the `add_auto_policy` property is false. The purpose of this flag is to make the user consciously choose a less secure connection. | | Connection | key_type (str) | "Ed25519" or "RSA" | Indicates the key type to be used for the connection (RSA, Ed25519). | | Connection | key_filename (str) | "/path/to/private_key/private_key.ppk" | The filename, or list of filenames, of optional private key(s), and/or certs to try for authentication. It must be used with a pkey in order to add a policy. If a pkey is not provided, then use `add_auto_policy`. | | Connection | pkey (str) | "AAAAC3MidD1lVBI1NTE5AAAAIKssLqd6hjahPi9FBH4GPDqMqwxOMsfxTgowqDCQAeX+" | Value to use for the host key when connecting to the remote SFTP server. | | Filter | date_time_gt (str) | "1900-01-01" or "1900-01-01 08:59:59" | Filter the files greater than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" | | Filter | date_time_lt (str) | "3999-12-31" or "3999-12-31 20:59:59" | Filter the files lower than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" | | Filter | earliest_file (bool) | true or false | Filter the earliest dated file in the directory. | | Filter | file_name_contains (str) | "part_of_filename" | Filter the files whose name matches the pattern.
| | Filter | latest_file (bool) | true or false | Filter the most recent dated file in the directory. | | Read data from subdirectories | sub_dir (bool) | true or false | The engine will search files into subdirectories of the **location**. It will consider one level below the root location given.
When `sub_dir` is used with **latest_file/earliest_file** argument, the engine will retrieve the latest/earliest file for each subdirectory. | | Add metadata info | file_metadata (bool) | true or false | When this option is set as True, the dataframe retrieves the **filename with location** and the **modification_time** from the original files in sftp. It attaches these two columns adding the information to respective records. | **Useful Info & Links**: 1. [Paramiko SSH Client](https://docs.paramiko.org/en/latest/api/client.html) 2. [Pandas documentation](https://pandas.pydata.org/docs/reference/io.html) ## Scenario 1 The scenario below shows the extraction of a CSV file using most part of the available filter options. Also, as an example, the column "created_on" is created in the transform_specs in order to store the processing date for every record. As the result, it will have in the output table the original file date (provided by the option `file_metadata`) and the processing date from the engine. For an incremental load approach, it is advised to use the "modification_time" column created by the option `file_metadata`. Since it has the original file date of modification, this date can be used in the logic to control what is new and has been changed recently. !!! note Below scenario uses **"add_auto_policy": true**, which is **not recommended**. 
```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sftp_source", "read_type": "batch", "data_format": "sftp", "sftp_files_format": "csv", "location": "my_sftp_data_path", "options": { "hostname": "my_sftp_hostname", "username": "my_sftp_username", "password": "my_sftp_password", "port": "my_port", "add_auto_policy": True, "file_name_contains": "test_pattern", "args": {"sep": "|"}, "latest_file": True, "file_metadata": True } }, ], "transform_specs": [ { "spec_id": "sftp_transformations", "input_id": "sftp_source", "transformers": [ { "function": "with_literals", "args": {"literals": {"created_on": datetime.now()}}, }, ], }, ], "output_specs": [ { "spec_id": "sftp_bronze", "input_id": "sftp_transformations", "write_type": "append", "data_format": "delta", "location": "s3://my_path/dummy_table" } ] } load_data(acon=acon) ``` ## Scenario 2 The following scenario shows the extraction of a JSON file using an RSA pkey authentication instead of `add_auto_policy`. The engine supports Ed25519Key and RSA for pkeys. For the pkey file location, it is important to have the file in a location accessible by the cluster. This can be achieved either by mounting the location or with volumes. !!! note This scenario uses a more secure authentication, thus it is the recommended option, instead of the previous scenario.
```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "sftp_source", "read_type": "batch", "data_format": "sftp", "sftp_files_format": "json", "location": "my_sftp_data_path", "options": { "hostname": "my_sftp_hostname", "username": "my_sftp_username", "password": "my_sftp_password", "port": "my_port", "key_type": "RSA", "key_filename": "dbfs_mount_location/my_file_key.ppk", "pkey": "my_key", "latest_file": True, "file_metadata": True, "args": {"lines": True, "orient": "columns"}, }, }, ], "transform_specs": [ { "spec_id": "sftp_transformations", "input_id": "sftp_source", "transformers": [ { "function": "with_literals", "args": {"literals": {"lh_created_on": datetime.now()}}, }, ], }, ], "output_specs": [ { "spec_id": "sftp_bronze", "input_id": "sftp_transformations", "write_type": "overwrite", "data_format": "delta", "location": "s3://my_path/dummy_table" } ] } load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/data_loader/extract_using_jdbc_connection/__init__.py ================================================ """ .. include::extract_using_jdbc_connection.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/extract_using_jdbc_connection/extract_using_jdbc_connection.md ================================================ # Extract using JDBC connection !!! danger "**SAP Extraction**" SAP is only used as an example to demonstrate how we can use a JDBC connection to extract data. **If you are looking to extract data from SAP, please use our sap_b4 or sap_bw reader.** You can find the **sap_b4 reader** documentation: [Extract from SAP B4 ADSOs](../../data_loader/extract_from_sap_b4_adso/extract_from_sap_b4_adso.md) and the **sap_bw reader** documentation: [Extract from SAP BW DSOs](../../data_loader/extract_from_sap_bw_dso/extract_from_sap_bw_dso.md) !!!
danger "**Parallel Extraction**" Parallel extractions **can bring a jdbc source down** if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections. ## Introduction Many databases allow a JDBC connection to extract data. Our engine has one reader where you can configure all the necessary definitions to connect to a database using JDBC. In the next section you will find several examples about how to do it. ## The Simplest Scenario using sqlite !!! warning "Not parallel" Recommended for smaller datasets only, or when stressing the source system is a high concern This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source. Here we use a sqlite database where any connection is allowed. Due to that, we do not specify any username or password. As in Spark, we provide two different ways to run the JDBC reader. 1 - We can use the **jdbc() function**, passing inside all the arguments needed for Spark to work, and we can even combine this with additional options passed through .options(). 2 - The other way is using **.format("jdbc")** and passing all the necessary arguments through .options(). It is important to note that, when choosing jdbc(), we can also add options() to the execution. **You can find and run the following code in our local test for the engine.** ### jdbc() function As we can see in the next cell, all the arguments necessary to establish the jdbc connection are passed inside the `jdbc_args` object. Here we find the url, the table, and the driver. Besides that, we can add options, such as the partition number. The partition number will impact the queries' parallelism. The code below is an example of how to use the jdbc() function in our ACON.
As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/jdbc_reader/jdbc_function/correct_arguments/batch_init.json!} ``` This is the same as using the following code in pyspark: ```python spark.read.jdbc( url="jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_function/correct_arguments/tests.db", table="jdbc_function", properties={"driver":"org.sqlite.JDBC"}) .option("numPartitions", 1) ``` ### .format("jdbc") In this example we do not use the `jdbc_args` object. All the jdbc connection parameters are inside the dictionary with the object options. As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/jdbc_reader/jdbc_format/correct_arguments/batch_init.json!} ``` This is the same as using the following code in pyspark: ```python spark.read.format("jdbc") .option("url", "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/correct_arguments/tests.db") .option("driver", "org.sqlite.JDBC") .option("dbtable", "jdbc_format") .option("numPartitions", 1) ``` ## Template with more complete and runnable examples In this template we will use **SAP as an example** for a more complete and runnable example. These definitions can be used in several databases that allow a JDBC connection. The following scenarios of extractions are covered: - 1 - The Simplest Scenario (Not parallel - Recommended for smaller datasets only, or when stressing the source system is a high concern) - 2 - Parallel extraction - 2.1 - Simplest Scenario - 2.2 - Provide upperBound (Recommended) - 2.3 - Provide predicates (Recommended) !!!
note "Disclaimer" This template only uses **SAP as demonstration example for JDBC connection.** **This isn't a SAP template!!!** **If you are looking to extract data from SAP, please use our sap_b4 reader or the sap_bw reader.** The JDBC connection has 2 main sections to be filled, the **jdbc_args** and **options**: - jdbc_args - Here you need to fill everything related to jdbc connection itself, like table/query, url, user, ..., password. - options - This section is more flexible, and you can provide additional options like "fetchSize", "batchSize", "numPartitions", ..., upper and "lowerBound". If you want to know more regarding jdbc spark options you can follow the link below: - https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html If you want to have a better understanding about JDBC Spark optimizations, you can find them in the following: - https://docs.databricks.com/en/connect/external-systems/jdbc.html - https://stackoverflow.com/questions/41085238/what-is-the-meaning-of-partitioncolumn-lowerbound-upperbound-numpartitions-pa - https://newbedev.com/how-to-optimize-partitioning-when-migrating-data-from-jdbc-source ### 1 - The Simplest Scenario (Not parallel - Recommended for smaller datasets, or for not stressing the source) This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source. It should only be used in case the data you want to extract from is a small one, with no big requirements in terms of performance to fulfill. When extracting from the source, we can have two options: - **Delta Init** - full extraction of the source. You should use it in the first time you extract from the source or any time you want to re-extract completely. Similar to a so-called full load. 
- **Delta** - extracts the portion of the data that is new or has changed in the source, since the last extraction (for that, the logic at the transformation step needs to be applied). On the examples below, the logic using REQTSN column is applied, which means that the maximum value on bronze is filtered and its value is used to filter incoming data from the data source. ##### Init - Load data into the Bronze Bucket ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "my_sap_b4_url", "table": "my_database.my_table", "properties": { "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", }, }, "options": { "fetchSize": 100000, "compress": True, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": "overwrite", "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ##### Delta - Load data into the Bronze Bucket ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "my_jdbc_url", "table": "my_database.my_table", "properties": { "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", }, }, "options": { "fetchSize": 100000, "compress": True, }, }, { "spec_id": "my_identifier_bronze", "read_type": "batch", "data_format": "delta", "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/", }, ], "transform_specs": [ { "spec_id": "max_my_identifier_bronze_date", "input_id": "my_identifier_bronze", "transformers": 
[{"function": "get_max_value", "args": {"input_col": "REQTSN"}}], }, { "spec_id": "appended_my_identifier", "input_id": "my_identifier_source", "transformers": [ { "function": "incremental_filter", "args": {"input_col": "REQTSN", "increment_df": "max_my_identifier_bronze_date"}, } ], }, ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "appended_my_identifier", "write_type": "append", "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/jdbc_template/no_parallel/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ### 2 - Parallel extraction In this section we present 3 possible scenarios for parallel extractions from JDBC sources. !!! note "Disclaimer for parallel extraction" Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. **Be careful when choosing the number of partitions. Spark is a distributed system and can lead to many connections.** #### 2.1 - Parallel Extraction, Simplest Scenario This scenario provides the simplest example you can have for a parallel extraction from JDBC sources, only using the property `numPartitions`. The goal of the scenario is to cover the case in which people do not have much experience around how to optimize the extraction from JDBC sources or cannot identify a column that can be used to split the extraction in several tasks. This scenario can also be used if the use case does not have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of the extraction to its maximum potential. In the example below, `"numPartitions": 10` is specified, meaning that Spark will open 10 parallel connections to the source and automatically decide how to parallelize the extraction upon that requirement.
This is the only change compared to the example provided in the scenario 1. ##### Delta Init - Load data into the Bronze Bucket ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "my_sap_b4_url", "table": "my_database.my_table", "properties": { "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", }, }, "options": { "fetchSize": 100000, "compress": True, "numPartitions": 10, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": "overwrite", "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ##### Delta - Load data into the Bronze Bucket ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "my_sap_b4_url", "table": "my_database.my_table", "properties": { "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", }, }, "options": { "fetchSize": 100000, "compress": True, "numPartitions": 10, }, }, { "spec_id": "my_identifier_bronze", "read_type": "batch", "data_format": "delta", "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/", }, ], "transform_specs": [ { "spec_id": "max_my_identifier_bronze_date", "input_id": "my_identifier_bronze", "transformers": [{"function": "get_max_value", "args": {"input_col": "REQTSN"}}], }, { "spec_id": "appended_my_identifier", "input_id": "my_identifier_source", "transformers": [ { "function": "incremental_filter", "args": {"input_col": "REQTSN", "increment_df": 
"max_my_identifier_bronze_date"}, } ], }, ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "appended_my_identifier", "write_type": "append", "data_format": "delta", "partitions": ["REQTSN"], "location": "s3://my_path/jdbc_template/parallel_1/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.2 - Parallel Extraction, Provide upper_bound (Recommended) This scenario performs the extraction from the JDBC source in parallel, but has more concerns trying to optimize and have more control (compared to 2.1 example) on how the extraction is split and performed, using the following options: - `numPartitions` - number of Spark partitions to split the extraction. - `partitionColumn` - column used to split the extraction. It must be a numeric, date, or timestamp. It should be a column that is able to split the extraction evenly in several tasks. An auto-increment column is usually a very good candidate. - `lowerBound` - lower bound to decide the partition stride. - `upperBound` - upper bound to decide the partition stride. This is an adequate example to be followed if there is a column in the data source that is good to be used as the `partitionColumn`. Comparing with the previous example, the `numPartitions` and three additional options to fine tune the extraction (`partitionColumn`, `lowerBound`, `upperBound`) are provided. When these 4 properties are used, Spark will use them to build several queries to split the extraction. **Example:** for `"numPartitions": 10`, `"partitionColumn": "record"`, `"lowerBound: 1"`, `"upperBound: 100"`, Spark will generate 10 queries like: - `SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL` - `SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20` - `SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30` - ... 
- `SELECT * FROM dummy_table WHERE RECORD >= 100` ##### Init - Load data into the Bronze Bucket ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "my_sap_b4_url", "table": "my_database.my_table", "properties": { "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", }, }, "options": { "partitionColumn": "RECORD", "numPartitions": 10, "lowerBound": 1, "upperBound": 2000, "fetchSize": 100000, "compress": True, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": "overwrite", "data_format": "delta", "partitions": ["RECORD"], "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ##### Delta - Load data into the Bronze Bucket ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "my_sap_b4_url", "table": "my_database.my_table", "properties": { "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", }, }, "options": { "partitionColumn": "RECORD", "numPartitions": 10, "lowerBound": 1, "upperBound": 2000, "fetchSize": 100000, "compress": True, }, }, { "spec_id": "my_identifier_bronze", "read_type": "batch", "data_format": "delta", "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/", }, ], "transform_specs": [ { "spec_id": "max_my_identifier_bronze_date", "input_id": "my_identifier_bronze", "transformers": [{"function": "get_max_value", "args": {"input_col": "RECORD"}}], }, { "spec_id": "appended_my_identifier", "input_id": "my_identifier_source", 
"transformers": [ { "function": "incremental_filter", "args": {"input_col": "RECORD", "increment_df": "max_my_identifier_bronze_date"}, } ], }, ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "appended_my_identifier", "write_type": "append", "data_format": "delta", "partitions": ["RECORD"], "location": "s3://my_path/jdbc_template/parallel_2/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` #### 2.3 - Parallel Extraction with Predicates (Recommended) This scenario performs the extraction from the JDBC source in parallel, which is useful in contexts where there aren't numeric, date or timestamp columns to parallelize the extraction: - `partitionColumn` - column used to split the extraction (can be of any type). - This is an adequate example to be followed if there is a column in the data source that is good to be used as the `partitionColumn`, especially if these columns do not comply with scenario 2.2. **When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.** Below, a lakehouse function to generate the predicate list automatically is presented. **When using this function, one needs to be especially careful with the predicates_query and predicates_add_null variables.** **predicates_query:** In the sample below the whole table (`select distinct(x) from table`) is being considered, but it is possible to filter the predicates list here, especially if you are applying a filter in the transformations spec and you know the entire table won't be necessary, so you can change it to something like this: `select distinct(x) from table where x > y`. **predicates_add_null:** One can choose whether or not to consider null in the predicates list. By default, this property is True.
**Example:** for `"partitionColumn": "record"` ##### Init - Load data into the Bronze Bucket ```python from lakehouse_engine.engine import load_data from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.extraction.jdbc_extraction_utils import ( JDBCExtraction, JDBCExtractionUtils, ) ExecEnv.get_or_create() partitionColumn = "my_partition_col" dbtable = "my_database.my_table" predicates_query = f"""(SELECT DISTINCT({partitionColumn}) FROM {dbtable})""" column_for_predicates = partitionColumn user = "my_user" password = "my_b4_hana_pwd" url = "my_sap_b4_url" driver = "com.sap.db.jdbc.Driver" predicates_add_null = True jdbc_util = JDBCExtractionUtils( JDBCExtraction( user=user, password=password, url=url, predicates_add_null=predicates_add_null, partition_column=partitionColumn, dbtable=dbtable, ) ) predicates = jdbc_util.get_predicates(predicates_query) acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "my_sap_b4_url", "table": "my_database.my_table", "predicates": predicates, "properties": { "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", }, }, "options": { "fetchSize": 100000, "compress": True, }, } ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "my_identifier_source", "write_type": "overwrite", "data_format": "delta", "partitions": ["RECORD"], "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ##### Delta - Load data into the Bronze Bucket ```python from lakehouse_engine.engine import load_data from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.extraction.jdbc_extraction_utils import ( JDBCExtraction, JDBCExtractionUtils, ) ExecEnv.get_or_create() 
partitionColumn = "my_partition_col" dbtable = "my_database.my_table" predicates_query = f"""(SELECT DISTINCT({partitionColumn}) FROM {dbtable})""" column_for_predicates = partitionColumn user = "my_user" password = "my_b4_hana_pwd" url = "my_sap_b4_url" driver = "com.sap.db.jdbc.Driver" predicates_add_null = True jdbc_util = JDBCExtractionUtils( JDBCExtraction( user=user, password=password, url=url, predicates_add_null=predicates_add_null, partition_column=partitionColumn, dbtable=dbtable, ) ) predicates = jdbc_util.get_predicates(predicates_query) acon = { "input_specs": [ { "spec_id": "my_identifier_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "my_sap_b4_url", "table": "my_database.my_table", "predicates": predicates, "properties": { "user": "my_user", "password": "my_b4_hana_pwd", "driver": "com.sap.db.jdbc.Driver", }, }, "options": { "fetchSize": 100000, "compress": True, }, }, { "spec_id": "my_identifier_bronze", "read_type": "batch", "data_format": "delta", "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/", }, ], "transform_specs": [ { "spec_id": "max_my_identifier_bronze_date", "input_id": "my_identifier_bronze", "transformers": [{"function": "get_max_value", "args": {"input_col": "RECORD"}}], }, { "spec_id": "appended_my_identifier", "input_id": "my_identifier_source", "transformers": [ { "function": "incremental_filter", "args": {"input_col": "RECORD", "increment_df": "max_my_identifier_bronze_date"}, } ], }, ], "output_specs": [ { "spec_id": "my_identifier_bronze", "input_id": "appended_my_identifier", "write_type": "append", "data_format": "delta", "partitions": ["RECORD"], "location": "s3://my_path/jdbc_template/parallel_3/my_identifier/", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=acon) ``` ================================================ FILE: 
lakehouse_engine_usage/data_loader/filtered_full_load/__init__.py ================================================ """ .. include::filtered_full_load.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/filtered_full_load/filtered_full_load.md ================================================ # Filtered Full Load This scenario is very similar to the [full load](../full_load/full_load.md), but it filters the data coming from the source, instead of doing a complete full load. As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/full_load/with_filter/batch.json!} ``` ##### Relevant notes: * As seen in the ACON, the filtering capabilities are provided by a transformer called `expression_filter`, where you can provide a custom Spark SQL filter. ================================================ FILE: lakehouse_engine_usage/data_loader/filtered_full_load_with_selective_replace/__init__.py ================================================ """ .. include::filtered_full_load_with_selective_replace.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/filtered_full_load_with_selective_replace/filtered_full_load_with_selective_replace.md ================================================ # Filtered Full Load with Selective Replace This scenario is very similar to the [Filtered Full Load](../filtered_full_load/filtered_full_load.md), but we only replace a subset of the partitions, leaving the other ones untouched, so we don't replace the entire table. This capability is very useful for backfilling scenarios. 
As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/full_load/with_filter_partition_overwrite/batch.json!} ``` ##### Relevant notes: * The key option for this scenario in the ACON is the `replaceWhere`, which we use to only overwrite a specific period of time, that realistically can match a subset of all the partitions of the table. Therefore, this capability is very useful for backfilling scenarios. ================================================ FILE: lakehouse_engine_usage/data_loader/flatten_schema_and_explode_columns/__init__.py ================================================ """ .. include::flatten_schema_and_explode_columns.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/flatten_schema_and_explode_columns/flatten_schema_and_explode_columns.md ================================================ # Flatten Schema and Explode Columns Regarding the schema, we can perform two kinds of operations: * **Flatten Schema**: transformer named "flatten_schema" used to flatten the schema of a dataframe. * Parameters to be defined: * max_level: 2 => this sets the level up to which you want to flatten the schema. * shorten_names: True => this flag is used when you want to shorten the name of the prefixes of the fields. * alias: True => this flag is used when you want to define a prefix for the column to be flattened. * num_chars: 7 => this sets the number of characters to consider when shortening the names of the fields. * ignore_cols: True => this list value should be set to specify the columns you don't want to flatten. * **Explode Columns**: transformer named "explode_columns" used to explode columns with types ArrayType and MapType.
* Parameters to be defined: * explode_arrays: True => this flag should be set to true to explode all array columns present in the dataframe. * array_cols_to_explode: ["sample_col"] => this list value should be set when to specify the array columns desired to explode. * explode_maps: True => this flag should be set to true to explode all map columns present in the dataframe. * map_cols_to_explode: ["map_col"] => this list value should be set when to specify the map columns desired to explode. * Recommendation: use array_cols_to_explode and map_cols_to_explode to specify the columns desired to explode and do not do it for all of them. The below scenario of **flatten_schema** is transforming one or more columns and dividing the content nested in more columns, as desired. We defined the number of levels we want to flatten in the schema, regarding the nested values. In this case, we are just setting `max_level` of `2`. As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json!} ``` The scenario of **explode_arrays** is transforming the arrays columns in one or more rows, depending on the number of elements, so, it replicates the row for each array value. In this case we are using explode to all array columns, using `explode_arrays` as `true`. As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/transformations/column_reshapers/explode_arrays/batch.json!} ``` The scenario of **flatten_and_explode_arrays_and_maps** is using `flatten_schema` and `explode_columns` to have the desired output. 
In this case, the desired output is to flatten all schema and explode maps and arrays, even having an array inside a struct. Steps: 1. In this case, we have an array column inside a struct column, so first we need to use the `flatten_schema` transformer to extract the columns inside that struct; 2. Then, we are able to explode all the array columns desired and map columns, using `explode_columns` transformer. 3. To be able to have the map column in 2 columns, we use again the `flatten_schema` transformer. As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/batch.json!} ``` ================================================ FILE: lakehouse_engine_usage/data_loader/full_load/__init__.py ================================================ """ .. include::full_load.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/full_load/full_load.md ================================================ # Full Load This scenario reads CSV data from a path and writes in full to another path with delta lake files. ##### Relevant notes - This ACON infers the schema automatically through the option `inferSchema` (we use it for local tests only). This is usually not a best practice using CSV files, and you should provide a schema through the InputSpec variables `schema_path`, `read_schema_from_table` or `schema`. - The `transform_specs` in this case are purely optional, and we basically use the repartition transformer to create one partition per combination of date and customer. This does not mean you have to use this in your algorithm. - A full load is also adequate for an init load (initial load). 
As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/full_load/full_overwrite/batch.json!} ``` ================================================ FILE: lakehouse_engine_usage/data_loader/read_from_dataframe/__init__.py ================================================ """ .. include::read_from_dataframe.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/read_from_dataframe/read_from_dataframe.md ================================================ # Read from Dataframe !!! danger Don't use this feature if the Lakehouse Engine already has a supported data format for your use case, as in that case it is preferred to use the dedicated data formats which are more extensively tested and predictable. Check the supported data formats [here](../../../reference/packages/core/definitions.md#packages.core.definitions.InputFormat). Reading from a Spark DataFrame is very simple using our framework. You just need to define the input_specs as follows: ```python { "input_spec": { "spec_id": "my_df", "read_type": "batch", "data_format": "dataframe", "df_name": df, } } ``` !!! note "**Why is it relevant?**" With this capability of reading a dataframe you can deal with sources that do not yet officially have a reader (e.g., REST api, XML files, etc.). ================================================ FILE: lakehouse_engine_usage/data_loader/read_from_sharepoint/__init__.py ================================================ """ .. 
include::read_from_sharepoint.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/read_from_sharepoint/read_from_sharepoint.md ================================================ # Read from Sharepoint There may be scenarios where data products must ingest curated datasets that business teams maintain directly in Sharepoint, for example exports from external systems or manually maintained reference files. The `SharepointReader` is a specialized reader module designed to load one or more files from a Sharepoint document library into the lakehouse. It abstracts away the complexity of accessing Sharepoint by: * Resolving the configured Sharepoint site, drive, and document path. * Downloading the target file or all files matching a configured pattern into a temporary local location. * Reading the downloaded file(s) into a Spark DataFrame using the configured format and options. * Optionally combining multiple files into a single DataFrame (for example, unioning all matching CSV files in a folder) and optionally archiving processed files back to Sharepoint (success and error folders). !!! note 📘 Tip: This reader integrates seamlessly into the lakehouse engine’s input step and can be triggered as part of the ACON-based pipeline, just like any other reader module. !!! warning When reading from text-based formats such as CSV, complex data types (arrays, maps, structs) are not preserved in the source file. If your downstream tables expect these types, you must reconstruct them from string columns after ingestion (for example using `from_json` or explicit casts). ### Usage Scenarios The examples below show how to read data from Sharepoint, ranging from simple single-file reads to more advanced multi-file and large-file scenarios. 1. [Configuration parameters](#1-configuration-parameters) 2. [**Simple:** Read one file from Sharepoint](#2-simple-read-one-file-from-sharepoint) 1. [Minimal configuration](#i-minimal-configuration) 2. 
[With optional configurations](#ii-with-optional-configurations) 3. [**Complex:** Read multiple files from Sharepoint](#3-complex-read-multiple-files-from-sharepoint) 1. [Read multiple files (standard size)](#i-read-multiple-files-standard-size) 2. [Read multiple large files with `chunk_size` and CSV options](#ii-read-multiple-large-files-with-chunk_size-and-csv-options) 4. [Delimiter handling](#4-delimiter-handling) 5. [Orchestrating multiple Sharepoint reads (loop pattern)](#5-orchestrating-multiple-sharepoint-reads-loop-pattern) ## 1. Configuration parameters ### The mandatory configuration parameters are: - **client_id** (str): Azure application (client) ID, available at the Azure Portal -> Azure Active Directory. - **tenant_id** (str): tenant ID associated with the Sharepoint site, available at the Azure Portal -> Azure Active Directory. - **site_name** (str): name of the Sharepoint site where the document library resides. Sharepoint URL naming convention is: **https://your_company_name.Sharepoint.com/sites/site_name** - **drive_name** (str): name of the document library from which the file(s) will be read. Sharepoint URL naming convention is: **https://your_company_name.Sharepoint.com/sites/site_name/drive_name** - **file_name** (str): name of the file to be read from Sharepoint when performing a **single-file** read. - In multi-file scenarios, `file_pattern` is typically used instead (see examples below). - **secret** (str): client secret for authentication, available at the Azure Portal -> Azure Active Directory. - **local_path** (str): temporary local storage path (Volume) where files are downloaded before being read. - Ensure the **path ends with "/"**. - The **specified sub-folder may be deleted during processing** (for example when cleaning up temporary files); it does not perform a recursive delete on parent directories. - **Avoid using a critical sub-folder.** - **api_version** (str): version of the Graph Sharepoint API to be used for operations.
This defaults to "v1.0". > 🔐 Authentication details (`client_id`, `secret`, etc.) should be handled > securely via lakehouse configuration or secret management tools, rather than > hard-coded in notebooks. ### The optional parameters are: - **folder_relative_path** (Optional[str]): relative folder path within the document library where the file(s) are located (for example, `"incoming/daily_exports"`). - **chunk_size** (Optional[int]): size (in bytes) of the file chunks used when downloading and archiving files. **Default is `5 * 1024 * 1024` (5 MB).** Useful when working with large files to avoid memory pressure. - **local_options** (Optional[dict]): additional options for customizing the **Spark read** from the temporary local file(s) (for example CSV options such as `header`, `delimiter`, `encoding`, etc.). See the Spark CSV options link below. - **conflict_behaviour** (Optional[str]): behavior to adopt when archiving files and a file with the same name already exists in the target location (for example, `"replace"`, `"fail"`). - **file_pattern** (Optional[str]): pattern to match **multiple files** in Sharepoint (for example, `"export_*.csv"`). Used by the multi-file reader flow to download and union all matching files. - **file_type** (Optional[str]): type of the files to be read from Sharepoint (for example, `"csv"`). The reader uses this to decide which Spark data source to use when reading from `local_path`. !!! note For more details about the Sharepoint framework, refer to Microsoft's official documentation: > 📖[ Microsoft Graph API - Sharepoint](https://learn.microsoft.com/en-us/graph/api/resources/sharepoint?view=graph-rest-1.0) > 🛠️ [Graph Explorer Tool](https://developer.microsoft.com/en-us/graph/graph-explorer) - this tool helps you explore available Sharepoint Graph API functionalities. > 📑 [Spark CSV options](https://spark.apache.org/docs/3.5.3/sql-data-sources-csv.html) ## 2. 
Simple: Read one file from Sharepoint This section demonstrates both minimal configuration and extended configurations when using the Sharepoint Reader. ### i. Minimal Configuration This approach uses only the mandatory parameters needed to connect to Sharepoint and read a single CSV file into the lakehouse. **Note:** In this minimal configuration: - The file is read from the configured `drive_name` (optionally under `folder_relative_path`). - No explicit archiving or custom CSV options are configured; those are covered in later sections. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "csv_read", "data_format": "sharepoint", "read_type": "batch", "sharepoint_opts": { "client_id": "dummy_client_id", "tenant_id": "dummy_tenant_id", "secret": "dummy_secret", "site_name": "dummy_site_name", "drive_name": "dummy_drive_name", "local_path": "/Volumes/my_volume/sharepoint_tmp/", # must end with "/" "folder_relative_path": "dummy_folder", # optional "file_name": "dummy_sales.csv", "file_type": "csv", }, }, ], "output_specs": [ { "spec_id": "dummy_output", "input_id": "csv_read", "data_format": "delta", "db_table": "dummy_sales", "write_type": "overwrite", "location": "s3://my_data_product_bucket/silver/dummy_sales/" }, ], } load_data(acon=acon) ``` ### ii. With optional configurations For more control over the read process, additional parameters can be specified on top of the minimal configuration: > **archive_enabled (Optional):** Enables archiving of the processed file in > Sharepoint. > > * If `True`, the reader moves the file out of the input folder after the read. > * Successful reads go to the *success* subfolder; failures go to the *error* > subfolder. > **archive_success_subfolder (Optional):** Name of the subfolder used to store > successfully processed files (default is `"done"`). > The folder is created under the same `folder_relative_path` and `drive_name`. 
> **archive_error_subfolder (Optional):** Name of the subfolder used to store > files that failed to be processed (default is `"error"`). > **local_options (Optional):** Additional options passed to Spark when reading > the downloaded CSV file(s) from `local_path` (for example `header`, `delimiter`, > `encoding`, etc.). > These options can be used in both **single-file** and **multi-file** read modes. > > * For available options, refer to: > [Apache Spark CSV Options](https://spark.apache.org/docs/3.5.4/sql-data-sources-csv.html). > **chunk_size (Optional):** Size (in bytes) of the chunks used when > downloading files. > > * Default: `5 * 1024 * 1024` (5 MB). > * Smaller chunks are safer for very large files or memory-constrained clusters. ```python from lakehouse_engine.engine import load_data # Optional CSV options for the local read LOCAL_OPTIONS = { "header": "true", "delimiter": ";", } acon = { "input_specs": [ { "spec_id": "csv_read", "data_format": "sharepoint", "read_type": "batch", "sharepoint_opts": { "client_id": "dummy_client_id", "tenant_id": "dummy_tenant_id", "secret": "dummy_secret", "site_name": "dummy_site_name", "drive_name": "dummy_drive_name", "local_path": "/Volumes/my_volume/sharepoint_tmp/", "folder_relative_path": "dummy_simple", "file_name": "dummy_sales.csv", "file_type": "csv", "archive_enabled": True, "archive_success_subfolder": "successful", "archive_error_subfolder": "with_error", "local_options": LOCAL_OPTIONS, "chunk_size": 5 * 1024 * 1024, }, }, ], "output_specs": [ { "spec_id": "dummy_output", "input_id": "csv_read", "data_format": "delta", "db_table": "dummy_sales", "write_type": "overwrite", "location": "s3://my_data_product_bucket/silver/dummy_sales/" }, ], } load_data(acon=acon) ``` ## 3. Complex: Read multiple files from Sharepoint In many cases, data in Sharepoint is split across multiple files within a folder or exported periodically. 
The `SharepointReader` can automatically locate and read all matching files based on a configured pattern, merging them into a single DataFrame. ### i. Read multiple files (standard size) Use `file_pattern` to match and load multiple files within the same folder. The reader downloads all matching files into the temporary local folder and performs a union of their contents before returning the DataFrame. ⚠️ **Schema consistency check:** All matched files must share the same schema. If a file with a different schema is encountered, the reader stops the ingestion, moves that file to the configured *error archive* folder, and logs the event. > **file_pattern (Optional):** Glob-style pattern for matching files, such as `"export_*.csv"`. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "csv_read_multi", "data_format": "sharepoint", "read_type": "batch", "sharepoint_opts": { "client_id": "dummy_client_id", "tenant_id": "dummy_tenant_id", "secret": "dummy_secret", "site_name": "dummy_site_name", "drive_name": "dummy_drive_name", "local_path": "/Volumes/my_volume/sharepoint_tmp/", "folder_relative_path": "dummy_sales/daily_exports", "file_pattern": "export_*.csv", "file_type": "csv", }, }, ], "output_specs": [ { "spec_id": "dummy_output", "input_id": "csv_read_multi", "data_format": "delta", "db_table": "dummy_sales_daily_exports", "write_type": "overwrite", "location": "s3://my_data_product_bucket/silver/dummy_sales/" }, ], } load_data(acon=acon) ``` ### ii. Read multiple large files with `chunk_size` and CSV options When reading multiple large CSV files, the reader can: - Download each file in chunks (to avoid memory pressure). - Apply custom CSV read options (delimiter, header, encoding, etc.) before unioning the data. > **chunk_size (Optional):** > Size (in bytes) of the chunks used when downloading and archiving files. > Default is `5 * 1024 * 1024` (5 MB).
Increase this for very large files to reduce the number of download operations. > **local_options (Optional):** > Spark CSV options used when reading the downloaded files from `local_path` > (for example `header`, `delimiter`, `encoding`, `quote`, etc.). ```python from lakehouse_engine.engine import load_data LOCAL_OPTIONS = { "header": "true", "delimiter": ";", "encoding": "utf-8", } acon = { "input_specs": [ { "spec_id": "csv_read_multi_large", "data_format": "sharepoint", "read_type": "batch", "sharepoint_opts": { "client_id": "dummy_client_id", "tenant_id": "dummy_tenant_id", "secret": "dummy_secret", "site_name": "dummy_site_name", "drive_name": "dummy_drive_name", "local_path": "/Volumes/my_volume/sharepoint_tmp/", "folder_relative_path": "dummy_sales/big_daily_exports/", "file_pattern": "big_export_*.csv", "file_type": "csv", "chunk_size": 50 * 1024 * 1024, # 50 MB per chunk "local_options": LOCAL_OPTIONS, }, }, ], "output_specs": [ { "spec_id": "dummy_output", "input_id": "csv_read_multi_large", "data_format": "delta", "db_table": "dummy_sales_daily_exports", "write_type": "overwrite", }, ], } load_data(acon=acon) ``` ## 4. Delimiter handling When reading CSV files (single-file or multi-file), the Sharepoint Reader: - Uses `sep` or `delimiter` from `local_options` as-is if provided (no auto-detection in this case). - If no delimiter is provided, it: - Tries to auto-detect one from `; , | \t` using `csv.Sniffer`. - Optionally compares the resulting column count with `expected_columns` (if set) and logs a warning if they do not match. - Falls back to comma (`,`) if detection fails. Internally, the final delimiter is always passed to Spark as `sep` (`delimiter` is mapped to `sep` and then removed). > 💡 Tip: You can use `local_options` (including `sep` / `delimiter`) in both > single-file and multi-file read modes. When in doubt, set `sep` explicitly. ## 5. 
Orchestrating multiple Sharepoint reads (loop pattern) If you need to read from multiple independent Sharepoint locations (different folders, drives, or file patterns), you can orchestrate a loop in your notebook and call `load_data` once per configuration. ```python from lakehouse_engine.engine import load_data sharepoint_sources = [ {"folder_relative_path": "dummy_sales/big_daily_exports", "file_pattern": "big_export_*.csv"}, {"folder_relative_path": "dummy_sales/daily_exports", "file_pattern": "export_*.csv"}, ] for src in sharepoint_sources: acon = { "input_specs": [ { "spec_id": "csv_read", "data_format": "sharepoint", "read_type": "batch", "sharepoint_opts": { "client_id": "...", "tenant_id": "...", "secret": "...", "site_name": "...", "drive_name": "...", "local_path": "/Volumes/my_volume/sharepoint_tmp/", "folder_relative_path": src["folder_relative_path"], "file_pattern": src["file_pattern"], "file_type": "csv", }, }, ], "output_specs": [ { "spec_id": "output", "input_id": "csv_read", "data_format": "delta", "db_table": "dummy_sales_daily_exports", "write_type": "append", }, ], } load_data(acon=acon) ``` ‼️ Caution: excessive parallelism - Running too many Sharepoint reads in parallel can trigger MS Graph API throttling (for example 429 or 503 responses). - Prefer a controlled level of parallelism when orchestrating multiple pipelines or loops that read from Sharepoint. - Monitor logs and retries to ensure stable performance, especially when working with large files or many files at once. The Lakehouse Engine framework uses retry logic with backoff to mitigate throttling, but it cannot fully replace sensible limits on concurrency. ================================================ FILE: lakehouse_engine_usage/data_loader/streaming_append_load_with_malformed/__init__.py ================================================ """ ..
include::streaming_append_load_with_malformed.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/streaming_append_load_with_malformed/streaming_append_load_with_malformed.md ================================================ # Streaming Append Load with DROPMALFORMED This scenario illustrates an append load done via streaming instead of batch, providing an efficient way of picking up new files from an S3 folder, instead of relying on the incremental filtering from the source needed from a batch based process (see append loads in batch from a JDBC source to understand the differences between streaming and batch append loads). However, not all sources (e.g., JDBC) allow streaming. As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/append_load/streaming_dropmalformed/streaming.json!} ``` ##### Relevant notes: * In this scenario, we use DROPMALFORMED read mode, which drops rows that do not comply with the provided schema; * In this scenario, the schema is provided through the `input_spec` "schema" variable. This removes the need of a separate JSON Spark schema file, which may be more convenient in certain cases. * As can be seen, we use the `output_spec` Spark option `checkpointLocation` to specify where to save the checkpoints indicating what we have already consumed from the input data. This allows fault-tolerance if the streaming job fails, but more importantly, it allows us to run a streaming job using [AvailableNow](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) and the next job automatically picks up the stream state since the last checkpoint, allowing us to do efficient append loads without having to manually specify incremental filters as we do for batch append loads. 
================================================ FILE: lakehouse_engine_usage/data_loader/streaming_append_load_with_terminator/__init__.py ================================================ """ .. include::streaming_append_load_with_terminator.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/streaming_append_load_with_terminator/streaming_append_load_with_terminator.md ================================================ # Streaming Append Load with Optimize Dataset Terminator This scenario includes a terminator which optimizes a dataset (table), being capable of vacuuming the table, optimising it with z-order or not, computing table statistics and more. You can find more details on the Terminator [here](../../../reference/packages/terminators/dataset_optimizer.md). As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/append_load/streaming_with_terminators/streaming.json!} ``` ================================================ FILE: lakehouse_engine_usage/data_loader/streaming_delta_load_with_group_and_rank_condensation/__init__.py ================================================ """ .. include::streaming_delta_load_with_group_and_rank_condensation.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/streaming_delta_load_with_group_and_rank_condensation/streaming_delta_load_with_group_and_rank_condensation.md ================================================ # Streaming Delta Load with Group and Rank Condensation This scenario is useful for when we want to do delta loads based on changelogs that need to be first condensed based on a group by and then a rank only, instead of the record mode logic in the record mode based change data capture. 
As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming_delta.json!} ``` ##### Relevant notes: * This type of delta load with this type of condensation is useful when the source changelog can be condensed based on dates, instead of technical fields like `datapakid`, `record`, `record_mode`, etc., as we see in SAP BW DSOs. An example of such a system is Omnihub Tibco orders and deliveries files. ================================================ FILE: lakehouse_engine_usage/data_loader/streaming_delta_with_late_arriving_and_out_of_order_events/__init__.py ================================================ """ .. include::streaming_delta_with_late_arriving_and_out_of_order_events.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/streaming_delta_with_late_arriving_and_out_of_order_events/streaming_delta_with_late_arriving_and_out_of_order_events.md ================================================ # Streaming Delta Load with Late Arriving and Out of Order Events (with and without watermarking) ## How to Deal with Late Arriving Data without using Watermark This scenario covers a delta load in streaming mode that is able to deal with late arriving and out of order events. As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```json {!../../../../tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming_delta.json!} ``` ##### Relevant notes: * The first question we can pose is: Do we need such a complicated update predicate to handle late arriving and out of order events? The simple answer is no. 
Because we expect that the latest event (e.g., latest status of a record in the source) will eventually arrive, and therefore the target delta lake table will eventually be consistent. However, when will that happen? Do we want to have our target table inconsistent until the next update comes along? This of course is only true when your source cannot ensure the order of the changes and cannot avoid late arriving changes (e.g., some changes that should have come in this changelog extraction, will only arrive in the next changelog extraction). From previous experiences, this is not the case with SAP BW, for example (as SAP BW is ACID compliant, and it will extract data from an SAP source and only have the updated changelog available when the extraction goes through, so theoretically we should not be able to extract data from the SAP BW changelog while SAP BW is still extracting data). * However, when the source cannot fully ensure ordering (e.g., Kafka) and we want to make sure we don't load temporarily inconsistent data into the target table, we can pay extra special attention, as we do here, to our update and insert predicates, that will enable us to only insert or update data if the new event meets the respective predicates: * In this scenario, we will only update if the `update_predicate` is true, and that long predicate we have here ensures that the change that we are receiving is likely the latest one; * In this scenario, we will only insert the record if the record is not marked for deletion (this can happen if the new event is a record that is marked for deletion, but the record was not in the target table (late arriving changes where the delete came before the insert), and therefore, without the `insert_predicate`, the algorithm would still try to insert the row, even if the `record_mode` indicates that that row is for deletion). By using the `insert_predicate` above we prevent that from happening. 
However, even in such scenario, to prevent the algorithm from inserting the data that comes later (which is old, as we said, the delete came before the insert and was actually the latest status), we would even need a more complex predicate based on your data's nature. Therefore, please read the disclaimer below. !!! note "**Disclaimer**!" The scenario illustrated in this page is purely fictional, designed for the Lakehouse Engine local tests specifically. Your data source changelogs may be different and the scenario and predicates discussed here may not make sense to you. Consequently, the data product team should reason about the adequate merge predicate and insert, update and delete predicates, that better reflect how they want to handle the delta loads for their data. * We use spark.sql.streaming.schemaInference in our local tests only. We don't encourage you to use it in your data product. !!! note "**Documentation**" [Feature Deep Dive: Watermarking in Apache Spark Structured Streaming - The Databricks Blog](https://www.databricks.com/blog/feature-deep-dive-watermarking-apache-spark-structured-streaming) [Structured Streaming Programming Guide - Spark 3.4.0 Documentation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) ## How to Deal with Late Arriving Data using Watermark When building real-time pipelines, one of the realities that teams have to work with is that distributed data ingestion is inherently unordered. Additionally, in the context of stateful streaming operations, teams need to be able to properly track event time progress in the stream of data they are ingesting for the proper calculation of time-window aggregations and other stateful operations. While working with real-time streaming data there will be delays between event time and processing time due to how data is ingested and whether the overall application experiences issues like downtime. 
Due to these potential variable delays, the engine that you use to process this data needs to have some mechanism to decide when to close the aggregate windows and produce the aggregate result. Imagine a scenario where we will need to perform stateful aggregations on the streaming data to understand and identify problems in the machines. **This is where we need to leverage Structured Streaming and Watermarking to produce the necessary stateful aggregations.** ##### Approach 1 - Use a pre-defined fixed window (Bad) Credits: [Image source](https://www.databricks.com/blog/feature-deep-dive-watermarking-apache-spark-structured-streaming) To explain this visually let’s take a scenario where we are receiving data at various times from around 10:50 AM → 11:20 AM. We are creating 10-minute tumbling windows that calculate the average of the temperature and pressure readings that came in during the windowed period. In this first picture, we have the tumbling windows trigger at 11:00 AM, 11:10 AM and 11:20 AM leading to the result tables shown at the respective times. When the second batch of data comes around 11:10 AM with data that has an event time of 10:53 AM this gets incorporated into the temperature and pressure averages calculated for the 11:00 AM → 11:10 AM window that closes at 11:10 AM, which does not give the correct result. ##### Approach 2 - Watermark We can define a **watermark** that will allow Spark to understand when to close the aggregate window and produce the correct aggregate result. In Structured Streaming applications, we can ensure that all relevant data for the aggregations we want to calculate is collected by using a feature called **watermarking**. In the most basic sense, by defining a **watermark** Spark Structured Streaming then knows when it has ingested all data up to some time, **T**, (based on a set lateness expectation) so that it can close and produce windowed aggregates up to timestamp **T**. 
Credits: [Image source](https://www.databricks.com/blog/feature-deep-dive-watermarking-apache-spark-structured-streaming) Unlike the first scenario where Spark will emit the windowed aggregation for the previous ten minutes every ten minutes (i.e. emit the 11:00 AM →11:10 AM window at 11:10 AM), Spark now waits to close and output the windowed aggregation once **the max event time seen minus the specified watermark is greater than the upper bound of the window**. In other words, Spark needed to wait until it saw data points where the latest event time seen minus 10 minutes was greater than 11:00 AM to emit the 10:50 AM → 11:00 AM aggregate window. At 11:00 AM, it does not see this, so it only initialises the aggregate calculation in Spark’s internal state store. At 11:10 AM, this condition is still not met, but we have a new data point for 10:53 AM so the internal state gets updated, just **not emitted**. Then finally by 11:20 AM Spark has seen a data point with an event time of 11:15 AM and since 11:15 AM minus 10 minutes is 11:05 AM which is later than 11:00 AM the 10:50 AM → 11:00 AM window can be emitted to the result table. This produces the correct result by properly incorporating the data based on the expected lateness defined by the watermark. Once the results are emitted the corresponding state is removed from the state store. ###### Watermarking and Different Output Modes It is important to understand how state, late-arriving records, and the different output modes could lead to different behaviours of your application running on Spark. The main takeaway here is that in both append and update modes, once the watermark indicates that all data is received for an aggregate time window, the engine can trim the window state. In append mode the aggregate is produced only at the closing of the time window plus the watermark delay while in update mode it is produced on every update to the window. 
Lastly, by increasing your watermark delay window you will cause the pipeline to wait longer for data and potentially drop less data – higher precision, but also higher latency to produce the aggregates. On the flip side, smaller watermark delay leads to lower precision but also lower latency to produce the aggregates. Watermarks can only be used when you are running your streaming application in **append** or **update** output modes. There is a third output mode, complete mode, in which the entire result table is written to storage. This mode cannot be used because it requires all aggregate data to be preserved, and hence cannot use watermarking to drop intermediate state. ###### Joins With Watermark There are three types of stream-stream joins that can be implemented in Structured Streaming: **inner, outer, and semi joins**. The main problem with doing joins in streaming applications is that you may have an incomplete picture of one side of the join. Giving Spark an understanding of when there are no future matches to expect is similar to the earlier problem with aggregations where Spark needed to understand when there were no new rows to incorporate into the calculation for the aggregation before emitting it. To allow Spark to handle this, we can leverage a combination of watermarks and event-time constraints within the join condition of the stream-stream join. This combination allows Spark to filter out late records and trim the state for the join operation through a time range condition on the join. Spark has a policy for handling multiple watermark definitions. Spark maintains **one global watermark** that is based on the slowest stream to ensure the highest amount of safety when it comes to not missing data. We can change this behaviour by changing *spark.sql.streaming.multipleWatermarkPolicy* to max; however, this means that data from the slower stream will be dropped. 
###### State Store Performance Considerations As of Spark 3.2, Spark offers RocksDB state store provider. If you have stateful operations in your streaming query (for example, streaming aggregation, streaming dropDuplicates, stream-stream joins, mapGroupsWithState, or flatMapGroupsWithState) and you want to maintain millions of keys in the state, then you may face issues related to large JVM garbage collection (GC) pauses causing high variations in the micro-batch processing times. This occurs because, by the implementation of HDFSBackedStateStore, the state data is maintained in the JVM memory of the executors and a large number of state objects puts memory pressure on the JVM causing high GC pauses. In such cases, you can choose to use a more optimized state management solution based on RocksDB. Rather than keeping the state in the JVM memory, this solution uses RocksDB to efficiently manage the state in the native memory and the local disk. Furthermore, any changes to this state are automatically saved by Structured Streaming to the checkpoint location you have provided, thus providing full fault-tolerance guarantees (the same as default state management). To enable the new built-in state store implementation, *set `spark.sql.streaming.stateStore.providerClass` to `org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider`*. For more details please visit Spark documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation You can enable this in your acons, by specifying it as part of the exec_env properties like below: ```json "exec_env": { "spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider" } ``` ================================================ FILE: lakehouse_engine_usage/data_loader/write_and_read_dataframe/__init__.py ================================================ """ .. 
include::write_and_read_dataframe.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/write_and_read_dataframe/write_and_read_dataframe.md ================================================ # Write and Read Dataframe DataFrame writer can give us some advantages by returning a dictionary containing the `spec_id` and the computed dataframe. In these examples we will cover the following scenarios of using the output `dataframe` format: 1. [**Write to dataframe**: Consuming the output spec as DataFrame;](#1-write-to-dataframe-consuming-the-output-spec-as-dataframe) 2. [**Write all dataframes**: Consuming all DataFrames generated per specs;](#2-write-all-dataframes-consuming-all-dataframes-generated-per-specs) 3. [**Read from and Write to dataframe**: Making use of the DataFrame output spec to compose silver data.](#3-read-from-and-write-to-dataframe-making-use-of-the-dataframe-output-spec-to-compose-silver-data) #### Main advantages of using this output writer: - **Debugging purposes**: as we can access any dataframe used in any part of our ACON we can observe what is happening with the computation and identify what might be wrong or can be improved. - **Flexibility**: in case we have some very specific need not covered yet by the lakehouse engine capabilities, example: return the Dataframe for further processing like using a machine learning model/prediction. - **Simplify ACONs**: instead of developing a single complex ACON, using the Dataframe writer, we can compose our ACON from the output of another ACON. This allows us to identify and split the notebook logic across ACONs. If you want/need, you can add as many dataframes as you want in the output spec referencing the spec_id you want to add. !!! 
warning **This is not intended to replace the other capabilities offered by the lakehouse-engine** and in case **another feature can cover your use case**, you should **use it instead of using the Dataframe writer**, as they are much **more extensively tested on different types of operations**. *Additionally, please always introspect if the problem that you are trying to resolve and for which no lakehouse-engine feature is available, could be a common problem and thus deserve a common solution and feature.* Moreover, **Dataframe writer is not supported for the streaming trigger types `processing time` and `continuous`.** ## 1. Write to dataframe: Consuming the output spec as DataFrame ### Silver Dummy Sales Write to DataFrame In this example we will cover the Dummy Sales write to a result containing the output DataFrame. - An ACON is used to read from bronze, apply silver transformations and write to a dictionary containing the output spec as key and the dataframe as value through the following steps: - 1 - Definition of how to read data (input data location, read type and data format); - 2 - Transformation of data (rename relevant columns); - 3 - Write the data to dict containing the dataframe; !!! note If you are trying to retrieve more than once the same data using checkpoint it will return an empty dataframe with empty schema as we don't have new data to read. 
```python from lakehouse_engine.engine import load_data cols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"} acon = { "input_specs": [ { "spec_id": "dummy_sales_bronze", "read_type": "streaming", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_sales", } ], "transform_specs": [ { "spec_id": "dummy_sales_transform", "input_id": "dummy_sales_bronze", "transformers": [ { "function": "rename", "args": { "cols": cols_to_rename, }, }, ], } ], "output_specs": [ { "spec_id": "dummy_sales_silver", "input_id": "dummy_sales_transform", "data_format": "dataframe", "options": { "checkpointLocation": "s3://my_data_product_bucket/checkpoints/bronze/dummy_sales", }, } ], } ``` ### Run the Load and Return the Dictionary with the DataFrames by OutputSpec This exploratory test will return a dictionary with the output spec and the dataframe that will be stored after transformations. ```python output = load_data(acon=acon) display(output.keys()) display(output.get("dummy_sales_silver")) ``` ## 2. Write all dataframes: Consuming all DataFrames generated per specs ### Silver Dummy Sales Write to DataFrame In this example we will cover the Dummy Sales write to a result containing the specs and related DataFrame. 
- An ACON is used to read from bronze, apply silver transformations and write to a dictionary containing the spec id as key and the DataFrames as value through the following steps: - Definition of how to read data (input data location, read type and data format); - Transformation of data (rename relevant columns); - Write the data to a dictionary containing all the spec ids and DataFrames computed per step; ```python from lakehouse_engine.engine import load_data cols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"} acon = { "input_specs": [ { "spec_id": "dummy_sales_bronze", "read_type": "batch", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_sales", } ], "transform_specs": [ { "spec_id": "dummy_sales_transform", "input_id": "dummy_sales_bronze", "transformers": [ { "function": "rename", "args": { "cols": cols_to_rename, }, }, ], } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dummy_sales_bronze", "data_format": "dataframe", }, { "spec_id": "sales_silver", "input_id": "dummy_sales_transform", "data_format": "dataframe", }, ], } ``` ### Run the Load and Return the Dictionary with the related DataFrames by Spec This exploratory test will return a dictionary with all specs and the related dataframe. You can access the DataFrame you need by `output.get()` for future developments and tests. ```python output = load_data(acon=acon) display(output.keys()) display(output.get("sales_bronze")) display(output.get("sales_silver")) ``` ## 3. 
Read from and Write to dataframe: Making use of the DataFrame output spec to compose silver data ### Silver Load Dummy Deliveries In this example we will cover the Dummy Deliveries table read and incremental load to silver composing the silver data to write using the DataFrame output spec: - First ACON is used to get the latest data from bronze, in this step we are using more than one output because we will need the bronze data with the latest data in the next step. - Second ACON is used to consume the bronze data and the latest data to perform silver transformation, in this ACON we are using as **input the two dataframes computed by the first ACON.** - Third ACON is used to write the silver computed data from the previous ACON to the target. !!! note This example is not a recommendation on how to deal with incremental loads, the ACON was split in 3 for demo purposes. Consume bronze data, generate the latest data and return a dictionary with bronze and transformed dataframes: ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "dummy_deliveries_bronze", "read_type": "batch", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_sales", }, { "spec_id": "dummy_deliveries_silver_source", "read_type": "batch", "data_format": "delta", "db_table": "my_database.dummy_deliveries", }, ], "transform_specs": [ { "spec_id": "dummy_deliveries_table_max_value", "input_id": "dummy_deliveries_silver_source", "transformers": [ { "function": "get_max_value", "args": {"input_col": "delivery_date", "output_col": "latest"}, }, { "function": "with_expressions", "args": { "cols_and_exprs": {"latest": "CASE WHEN latest IS NULL THEN 0 ELSE latest END"}, }, }, ], } ], "output_specs": [ { "spec_id": "deliveries_bronze", "input_id": "dummy_deliveries_bronze", "data_format": "dataframe", }, { "spec_id": "dummy_deliveries_transformed", "input_id": "dummy_deliveries_table_max_value", "data_format": "dataframe", }, ], } 
dummy_deliveries_transformed = load_data(acon=acon) dummy_deliveries_transformed_df = dummy_deliveries_transformed.get("dummy_deliveries_transformed") dummy_deliveries_bronze_df = dummy_deliveries_transformed.get("deliveries_bronze") ``` Consume previous dataframes generated by the first ACON (bronze and latest bronze data) to generate the silver data. In this acon we are only using **just one output** because we only need the dataframe from the output for the next step. ```python from lakehouse_engine.engine import load_data cols_to_rename = {"delivery_note_header": "delivery_note", "article": "article_id"} acon = { "input_specs": [ { "spec_id": "dummy_deliveries_bronze", "read_type": "batch", "data_format": "dataframe", "df_name": dummy_deliveries_bronze_df, }, { "spec_id": "dummy_deliveries_table_max_value", "read_type": "batch", "data_format": "dataframe", "df_name": dummy_deliveries_transformed_df, }, ], "transform_specs": [ { "spec_id": "dummy_deliveries_transform", "input_id": "dummy_deliveries_bronze", "transformers": [ { "function": "rename", "args": { "cols": cols_to_rename, }, }, { "function": "incremental_filter", "args": { "input_col": "delivery_date", "increment_df": "dummy_deliveries_table_max_value", "increment_col": "latest", "greater_or_equal": False, }, }, ], } ], "output_specs": [ { "spec_id": "dummy_deliveries_silver", "input_id": "dummy_deliveries_transform", "data_format": "dataframe", } ], } dummy_deliveries_silver = load_data(acon=acon) dummy_deliveries_silver_df = dummy_deliveries_silver.get("dummy_deliveries_silver") ``` Write the silver data generated by previous ACON into the target ```python from lakehouse_engine.engine import load_data write_silver_acon = { "input_specs": [ { "spec_id": "dummy_deliveries_silver", "read_type": "batch", "data_format": "dataframe", "df_name": dummy_deliveries_silver_df, }, ], "dq_specs": [ { "spec_id": "dummy_deliveries_quality", "input_id": "dummy_deliveries_silver", "dq_type": "validator", "bucket": 
"my_data_product_bucket", "expectations_store_prefix": "dq/expectations/", "validations_store_prefix": "dq/validations/", "checkpoint_store_prefix": "dq/checkpoints/", "result_sink_db_table": "my_database.dummy_deliveries_dq", "result_sink_location": "my_data_product_bucket/dq/dummy_deliveries", "fail_on_error": False, "tbl_to_derive_pk": "my_database.dummy_deliveries", "dq_functions": [ { "function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_note"}, }, { "function": "expect_table_row_count_to_be_between", "args": {"min_value": 19}, }, { "function": "expect_column_max_to_be_between", "args": {"column": "delivery_item", "min_value": 2}, }, ], }, ], "output_specs": [ { "spec_id": "dummy_deliveries_silver", "input_id": "dummy_deliveries_quality", "write_type": "append", "location": "s3://my_data_product_bucket/silver/dummy_deliveries_df_writer", "data_format": "delta", } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": True, "spark.databricks.delta.optimizeWrite.enabled": True, "spark.databricks.delta.autoCompact.enabled": True, }, } load_data(acon=write_silver_acon) ``` ================================================ FILE: lakehouse_engine_usage/data_loader/write_to_console/__init__.py ================================================ """ .. include::write_to_console.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/write_to_console/write_to_console.md ================================================ # Write to Console Console writer is an interesting feature to debug / validate what has been done in the lakehouse engine. Before moving forward and storing data somewhere, it is possible to show / print the final dataframe to the console, which means it is possible to transform the data as many times as you want and display the final result to validate if it is as expected. ## Silver Dummy Sales Write to Console Example In this template we will cover the Dummy Sales write to console. 
An ACON is used to read from bronze, apply silver transformations and write on console through the following steps: 1. Definition of how to read data (input data location, read type and data format); 2. Transformation of data (rename relevant columns); 3. Definition of how to print to console (limit, truncate, vertical options); For this, the ACON specs are : - **input_specs** (MANDATORY): specify how to read data; - **transform specs** (OPTIONAL): specify how to transform data; - **output_specs** (MANDATORY): specify how to write data to the target. !!! note Writer to console **is a wrapper for spark.show() function**, if you want to know more about the function itself or the available options, [please check the spark documentation here](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.show.html). ```python from lakehouse_engine.engine import load_data cols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"} acon = { "input_specs": [ { "spec_id": "dummy_sales_bronze", "read_type": "streaming", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_sales", } ], "transform_specs": [ { "spec_id": "dummy_sales_transform", "input_id": "dummy_sales_bronze", "transformers": [ { "function": "rename", "args": { "cols": cols_to_rename, }, }, ], } ], "output_specs": [ { "spec_id": "dummy_sales_silver", "input_id": "dummy_sales_transform", "data_format": "console", "options": {"limit": 8, "truncate": False, "vertical": False}, } ], } ``` And then, **Run the Load and Exit the Notebook**: This exploratory test will write to the console, which means the final dataframe will be displayed. ```python load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/data_loader/write_to_rest_api/__init__.py ================================================ """ .. 
include::write_to_rest_api.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/write_to_rest_api/write_to_rest_api.md ================================================ # Write to REST API REST API writer is an interesting feature to send data from Spark to a REST API within the data pipeline context. It uses the Python requests library to execute the REST calls. It is possible to configure a few aspects of the writer, like if the payload should be sent via JSON body or via file, or configure additional JSON body parameters to add to the payload generated via Spark. In the current implementation of the writer, each row will generate a request to the API, so it is important that you prepare your dataframe accordingly (check example below). ## Silver Dummy Sales Write to REST API Example In this template we will cover the Dummy Sales write to a REST API. An ACON is used to read from bronze, apply silver transformations to prepare the REST api payload and write to the API through the following steps: 1. Definition of how to read data (input data location, read type and data format); 2. Transformation of the data so that we form a payload column per each row. **Important Note:** In the current implementation of the writer, each row will generate a request to the API, so `create_payload` is a lakehouse engine custom transformer function that creates a JSON string with the **payload** to be sent to the API. The column name should be exactly **"payload"**, so that the lakehouse engine further processes that column accordingly, in order to correctly write the data to the REST API. 3. Definition of how to write to a REST api (url, authentication, payload format configuration, ...); For this, the ACON specs are : - **input_specs** (MANDATORY): specify how to read data; - **transform specs** (MANDATORY): specify how to transform data to prepare the payload; - **output_specs** (MANDATORY): specify how to write data to the target. 
```python from pyspark.sql import DataFrame from pyspark.sql.functions import lit from lakehouse_engine.engine import load_data def create_payload(df: DataFrame) -> DataFrame: payload_df = df.withColumn( "payload", lit('{"just a dummy key": "just a dummy value"}') ) return payload_df acon = { "input_specs": [ { "spec_id": "dummy_sales_bronze", "read_type": "streaming", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_sales", } ], "transform_specs": [ { "spec_id": "dummy_sales_transform", "input_id": "dummy_sales_bronze", "transformers": [ { "function": "custom_transformation", "args": { "custom_transformer": create_payload, }, } ], }, ], "output_specs": [ { "spec_id": "data_to_send_to_api", "input_id": "dummy_sales_transform", "data_format": "rest_api", "options": { "rest_api_url": "https://foo.bar.com", "rest_api_method": "post", "rest_api_basic_auth": { "username": "...", "password": "...", }, "rest_api_is_file_payload": False, # True if payload is to be sent via JSON file instead of JSON body (application/json) "rest_api_file_payload_name": "custom_file", # this is the name of the file to be sent in cases where the payload uses file uploads rather than JSON body. "rest_api_extra_json_payload": {"x": "y"} } } ], } load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/data_loader/write_to_sharepoint/__init__.py ================================================ """ .. include::write_to_sharepoint.md """ ================================================ FILE: lakehouse_engine_usage/data_loader/write_to_sharepoint/write_to_sharepoint.md ================================================ # Write to Sharepoint There may be scenarios where data products must deliver curated datasets to external platforms like Sharepoint, often to serve business users or reporting tools outside the lakehouse environment. The SharePointWriter is a specialized writer module designed to export a single file from the lakehouse to a Sharepoint document library. 
It handles the complexities of the export by: * Writing the dataset to a temporary local file. * Uploading that file to the configured Sharepoint location using authenticated APIs. * Since it is scoped to handle only a single file per execution, any logic for splitting or generating multiple files must be implemented within your notebook prior to invoking the writer. !!! note 📘 Tip: This writer integrates seamlessly into the lakehouse engine's output step and can be triggered as part of the ACON-based pipeline, just like any other writer module. !!! warning **CSV files do not support complex data types such as array, map, or struct.** If these fields exist in the dataset, they must be converted to string (e.g., via to_json(), cast, or similar) before using the Sharepoint Writer, as **these types will cause the export to fail.** ### Usage Scenarios The examples below show how to write data to Sharepoint, ranging from simple single-DataFrame writes to more complex multi-DataFrame workflows. 1. [Configuration parameters](#1-configuration-parameters) 2. [**Simple:** Write one Dataframe to Sharepoint](#2-simple-write-one-dataframe-to-sharepoint) 1. [Minimal configuration](#i-minimal-configuration) 2. [With optional configurations](#ii-with-optional-configurations) 3. [**Complex:** Write multiple Dataframes to Sharepoint](#3-complex-write-multiple-dataframes-to-sharepoint) 1. [Example: Partitioning function](#i-example-partitioning-function) 2. [Example: Detect Unsupported Column Types](#ii-detect-unsupported-columns-types) 2. [Without parallelism (sequential processing)](#iii-without-parallelism-sequential-processing) 3. [With parallelism (optimized for efficiency)](#iv-complex---with-parallelism-optimized-for-efficiency) ## 1. Configuration parameters ### The mandatory configuration parameters are: - **client_id** (str): azure client ID application, available at the Azure Portal -> Azure Active Directory. 
- **tenant_id** (str): tenant ID associated with the Sharepoint site, available at the Azure Portal -> Azure Active Directory. - **site_name** (str): name of the Sharepoint site where the document library resides. Sharepoint URL naming convention is: **https://your_company_name.sharepoint.com/sites/site_name** - **drive_name** (str): name of the document library where the file will be uploaded. Sharepoint URL naming convention is: **https://your_company_name.sharepoint.com/sites/site_name/drive_name** - **file_name** (str): name of the file to be uploaded to local path and to Sharepoint. - **secret** (str): client secret for authentication, available at the Azure Portal -> Azure Active Directory. - **local_path** (str): Temporary local storage path for the file before uploading. - Ensure the **path ends with "/"**. - Note: The **specified sub-folder is deleted during the process**; it does not perform a recursive delete on parent directories. - **Avoid using a critical sub-folder.** - **api_version** (str): version of the Graph Sharepoint API to be used for operations. This defaults to "v1.0". ### The optional parameters are: - **folder_relative_path** (Optional[str]): relative folder path within the document library to upload the file. - **chunk_size** (Optional[int]): Optional; size (in Bytes) of the file chunks for uploading to Sharepoint. **Default is 100 MB.** - **local_options** (Optional[dict]): Optional; additional options for customizing write to csv action to local path. You can check the available options below. - **conflict_behaviour** (Optional[str]): Optional; behavior to adopt in case of a conflict (e.g., 'replace', 'fail'). !!! 
note For more details about the Sharepoint framework, refer to Microsoft's official documentation: > 📖[ Microsoft Graph API - Sharepoint](https://learn.microsoft.com/en-us/graph/api/resources/sharepoint?view=graph-rest-1.0) > 🛠️ [Graph Explorer Tool](https://developer.microsoft.com/en-us/graph/graph-explorer) - this tool helps you explore available Sharepoint Graph API functionalities. > 📑 [Spark CSV options](https://spark.apache.org/docs/3.5.3/sql-data-sources-csv.html) ## 2. Simple: Write one Dataframe to Sharepoint This section demonstrates both minimal configuration and extended configurations when using the Sharepoint Writer. ### i. Minimal Configuration This approach uses only the mandatory parameters, making it the quickest way to write a DataFrame to Sharepoint. **Note:** With minimal configurations, not even the header is written on the table. Furthermore, the file is written on the Sharepoint Drive root folder. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "dummy_input", "read_type": "batch", "data_format": "delta", "db_table": "dummy_sales", }, ], "output_specs": [ { "spec_id": "dummy_output", "input_id": "dummy_input", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "dummy_client_id", "tenant_id": "dummy_tenant_id", "secret": "dummy_secret", "site_name": "dummy_site_name", "drive_name": "dummy_drive_name", "local_path": "s3://my_data_product_bucket/silver/dummy_sales/", # this path must end with an "/" "file_name": "dummy_sales", }, }, ], } load_data(acon=acon) ``` ### ii. With Optional Configurations For more control over the upload process, additional parameters can be specified: >**folder_relative_path (Optional):** Defines the subfolder inside the Sharepoint drive where the file should be stored. > > ‼️ **Important:** The drive within the site acts as the root. 
> > **Example:** > > * Site Name: "dummy_sharepoint" > * Drive Name: "dummy_drive" > * Folder Path: "dummy/test/" > * File Name: "test.csv" > * Final Destination: "dummy_sharepoint/dummy_drive/dummy/test/test.csv" > **chunk_size (Optional):** Defines the file chunk size (in bytes) for uploading. > > * Default: 100 MB (Recommended unless handling large files). > * Larger chunk sizes can improve performance but may increase memory usage. > **local_options (Optional):** Additional options for writing the DataFrame to a CSV file before upload. > > * For available options, refer to: [Apache Spark CSV Options](https://spark.apache.org/docs/3.5.4/sql-data-sources-csv.html). > **conflict_behaviour (Optional):** Determines the action taken if a file with the same name already exists. > > * Possible values: "replace", "fail", "rename", etc. > * Refer to Microsoft’s documentation: [Drive Item Conflict Behavior](https://learn.microsoft.com/en-us/dynamics365/business-central/application/system-application/enum/system.integration.graph.graph-conflictbehavior). 
```python from lakehouse_engine.engine import load_data # Set the optional parameters LOCAL_OPTIONS = {"mode": "overwrite", "header": "true"} acon = { "input_specs": [ { "spec_id": "dummy_input", "read_type": "batch", "data_format": "delta", "db_table": "dummy_sales", }, ], "transform_specs": [ { "spec_id": "dummy_transform", "input_id": "dummy_input", "transformers": [ { "function": "add_current_date", "args": {"output_col": "extraction_timestamp"}, }, # Add a new column with the current date if needed { "function": "expression_filter", "args": {"exp": "customer = 'customer 1'"}, }, # Filter the data if needed ], }, ], "output_specs": [ { "spec_id": "dummy_output", "input_id": "dummy_transform", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "dummy_client_id", "tenant_id": "dummy_tenant_id", "secret": "dummy_secret", "site_name": "dummy_site_name", "drive_name": "dummy_drive_name", "local_path": "s3://my_data_product_bucket/silver/dummy_sales/", # this path must end with an "/" "file_name": "dummy_sales", "folder_relative_path": "dummy_simple", # writes file in the folder ./dummy_simple "local_options": LOCAL_OPTIONS, "chunk_size": 300 * 1024 * 1024, # 300 MB }, }, ], } load_data(acon=acon) ``` ## 3. Complex: Write multiple Dataframes to Sharepoint This scenario illustrates how to write multiple files to Sharepoint within a loop. Some use cases may require uploading files categorized by season, customer type, product category, etc., depending on the business needs. Partitioning the data ensures better organization and optimized file management in Sharepoint. !!!warning ‼️ **Caution: Excessive Parallelism!** * Too many simultaneous uploads can trigger Graph API throttling, leading to 503 (Service Unavailable) errors. * Use a controlled level of parallelism (limit concurrent uploads) **if necessary**. * [Coalesce](https://spark.apache.org/docs/3.5.3/sql-performance-tuning.html#coalesce-hints-for-sql-queries) allows you to control Spark's parallelism. 
* **As the size of the files increases so does this concern,** so it’s important to test and monitor upload processes to avoid service disruptions and ensure smooth performance. **Nevertheless, a stress test with over 50 partition files with > 4GB each** was performed and parallelism issues were not detected. The Lakehouse Engine Framework uses an **exponential backoff retry logic to avoid throttling** issues. ### i. Example: Partitioning function This function is a mere example on how to fetch the distinct values of a column from a given table.\ It is not part of the lakehouse_engine framework. ```python def get_partitions( partition: str, bucket: Optional[str] = None, table: Optional[str] = None, filter_expression: Optional[str] = None ) -> List[Dict[str, str]]: """Fetch distinct values from a given partition column in a table or bucket. Parameters ---------- partition : str The name of the partition column. bucket : Optional[str], default=None The path to the S3 bucket (if applicable). table : Optional[str], default=None The name of the table (if applicable). filter_expression : Optional[str], default=None A filter condition to apply. Returns ------- List[Dict[str, str]] A list of dictionaries with unique partition values. """ if not bucket and not table: raise ValueError("Either 'bucket' or 'table' must be provided") df = spark.read.format("delta").load(bucket) if bucket else spark.table(table) partitions = df.select(partition).distinct() if filter_expression: partitions = partitions.filter(filter_expression) return [{partition: row[partition]} for row in partitions.collect()] ``` ### ii. Detect unsupported columns types This function exemplifies how to detect unsupported .csv column types. It is not part of the lakehouse_engine framework. ```python def detect_array_or_struct_fields(df: DataFrame) -> Dict[str, str]: """ Detect fields in a DataFrame that are arrays, structs, or maps. Args: df (DataFrame): The input DataFrame. 
Returns: Dict[str, str]: A dictionary with the names of the array, struct, and map fields as keys and the type each one should be cast to ('StringType') as values. """ field_types = {} type_mapping = {ArrayType: "StringType", StructType: "StringType", MapType: "StringType"} for field in df.schema.fields: for data_type, type_name in type_mapping.items(): if isinstance(field.dataType, data_type): field_types[field.name] = type_name break return field_types ``` ### iii. Without parallelism (sequential processing) ```python from lakehouse_engine.engine import load_data # Set the optional parameters LOCAL_OPTIONS = {"mode": "overwrite", "header": "true"} # Set the partition column PARTITION = "customer" # Fetch distinct values from the partition column partitions = get_partitions(partition=PARTITION, table="dummy_sales") # Sort the distinct values to ensure the correct order of the files # Note: # - If an error occurs during the process, by sorting beforehand, you guarantee the correct order of the files. # - It may come in handy if you want to restart the process (starting on a given file). 
partitions.sort(key=lambda x: x["customer"]) for partition in partitions: acon = { "input_specs": [ { "spec_id": "dummy_input", "read_type": "batch", "data_format": "delta", "db_table": "dummy_sales", }, ], "transform_specs": [ { "spec_id": "dummy_transform", "input_id": "dummy_input", "transformers": [ {"function": "add_current_date", "args": {"output_col": "extraction_timestamp"}}, {"function": "expression_filter", "args": {"exp": f"customer = '{partition['customer']}'"}}, { "function": "coalesce", "args": {"num_partitions": 1}, }, # Enforce that only 1 file is written - eliminating the parallelism ], }, ], "output_specs": [ { "spec_id": "dummy_output", "input_id": "dummy_transform", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "dummy_client_id", "tenant_id": "dummy_tenant_id", "secret": "dummy_secret", "site_name": "dummy_site_name", "drive_name": "dummy_drive_name", "local_path": "s3://my_data_product_bucket/silver/dummy_sales/", # this path must end with an "/" "folder_relative_path": "dummy_complex/wo_parallelism", "file_name": f"dummy_sales_{partition['customer']}", "local_options": LOCAL_OPTIONS, "chunk_size": 200 * 1024 * 1024, # 200 MB }, }, ], } load_data(acon=acon) ``` ### iv. Complex - With parallelism (optimized for efficiency) ```python from lakehouse_engine.engine import load_data # Set the optional parameters LOCAL_OPTIONS = {"mode": "overwrite", "header": "true"} # Set the partition column PARTITION = "customer" # Fetch distinct values from the partition column partitions = get_partitions(partition=PARTITION, table="dummy_sales") # Detect array, struct or map fields which cannot be written to .csv files columns_to_cast = detect_array_or_struct_fields(spark.table("dummy_sales")) # Sort the distinct values to ensure the correct order of the files # Note: # - If an error occurs during the process, by sorting beforehand, you guarantee the correct order of the files. 
# - It may come in handy if you want to restart the process (starting on a given file). partitions.sort(key=lambda x: x["customer"]) for partition in partitions: acon = { "input_specs": [ { "spec_id": "dummy_input", "read_type": "batch", "data_format": "delta", "db_table": "dummy_sales", }, ], "transform_specs": [ { "spec_id": "dummy_transform", "input_id": "dummy_input", "transformers": [ {"function": "add_current_date", "args": {"output_col": "extraction_timestamp"}}, {"function": "expression_filter", "args": {"exp": f"customer = '{partition['customer']}'"}}, # Coalesce removed guaranteeing maximum parallelism {"function": "cast", "args": {"cols": columns_to_cast}}, # Cast unsupported column types ], }, ], "output_specs": [ { "spec_id": "dummy_output", "input_id": "dummy_transform", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "dummy_client_id", "tenant_id": "dummy_tenant_id", "secret": "dummy_secret", "site_name": "dummy_site_name", "drive_name": "dummy_drive_name", "local_path": "s3://my_data_product_bucket/silver/dummy_sales/", # this path must end with an "/" "folder_relative_path": "dummy_complex/with_parallelism", "file_name": f"dummy_sales_{partition['customer']}", "local_options": LOCAL_OPTIONS, "chunk_size": 200 * 1024 * 1024, # 200 MB }, }, ], } load_data(acon=acon) ``` ### Relevant Notes - Multi-file export is not supported. For such use cases, loop through files manually and invoke SharePointWriter per file. - Authentication details should be handled securely via lakehouse configuration or secret management tools. ================================================ FILE: lakehouse_engine_usage/data_quality/__init__.py ================================================ """ .. include::data_quality.md """ ================================================ FILE: lakehouse_engine_usage/data_quality/custom_expectations/__init__.py ================================================ """ .. 
include::custom_expectations.md """ ================================================ FILE: lakehouse_engine_usage/data_quality/custom_expectations/custom_expectations.md ================================================ # Custom Expectations ## Defining Custom Expectations Custom expectations are defined in python and need to follow a structure to correctly integrate with Great Expectations. Follow the [documentation of GX on Creating Custom Expectations](https://docs.greatexpectations.io/docs/oss/guides/expectations/custom_expectations_lp/) and find information about [the existing types of expectations](https://docs.greatexpectations.io/docs/conceptual_guides/expectation_classes). Here is an example of custom expectation. As for other cases, the acon configuration should be executed with `load_data` using: ```python from lakehouse_engine.engine import load_data acon = {...} load_data(acon=acon) ``` Example of ACON configuration: ```python {!../../../../lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b.py!} ``` ### Naming Conventions Your expectation's name **should** start with expect. The name of the file **must** be the name of the expectation written in snake case. Ex: `expect_column_length_match_input_length` The name of the class **must** be the name of the expectation written in camel case. Ex: `ExpectColumnLengthMatchInputLength` ### File Structure The file contains two main sections: - the definition of the metric that we are tracking (where we define the logic of the expectation); - the definition of the expectation ### Metric Definition In this section we define the logic of the expectation. This needs to follow a certain structure: #### Code Structure 1) The class you define needs to extend one of the Metric Providers defined by Great Expectations that corresponds to your expectation's type. More info on the [metric providers](https://docs.greatexpectations.io/docs/conceptual_guides/metricproviders). 
2) You need to define the name of your metric. This name **must** be unique and **must** follow the following structure: type of expectation.name of metric. Ex.: `column_pair_values.a_smaller_or_equal_than_b` **Types of expectations:** `column_values`, `multicolumn_values`, `column_pair_values`, `table_rows`, `table_columns`. 3) Any [GX default parameters](#parameters) that are necessary to calculate your metric **must** be defined as "condition_domain_keys". 4) Any [additional parameters](#parameters) that are necessary to calculate your metric **must** be defined as "condition_value_keys". 5) The logic of your expectation **must** be defined for the SparkDFExecutionEngine in order to be run on the Lakehouse. ```python 1) class ColumnMapMetric(ColumnMapMetricProvider): """Asserts that a column matches a pattern.""" 2) condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b" 3) condition_domain_keys = ( "batch_id", "table", "column_A", "column_B", "ignore_row_if", ) 4) condition_value_keys = ("margin",) 5) @column_pair_condition_partial(engine=SparkDFExecutionEngine) def _spark( self: ColumnPairMapMetricProvider, column_A: Any, column_B: Any, margin: Any, **kwargs: dict, ) -> Any: """Implementation of the expectation's logic. Args: column_A: Value of the row of column_A. column_B: Value of the row of column_B. margin: margin value to be added to column_b. kwargs: dict with additional parameters. Returns: If the condition is met. """ if margin is None: approx = 0 elif not isinstance(margin, (int, float, complex)): raise TypeError( f"margin must be one of int, float, complex." f" Found: {margin} as {type(margin)}" ) else: approx = margin # type: ignore return column_A <= column_B + approx # type: ignore ``` ### Expectation Definition In this section we define the expectation. 
This needs to follow a certain structure: #### Code Structure 1) The class you define needs to extend one of the Expectations defined by Great Expectations that corresponds to your expectation's type. 2) You must define an "examples" object where you define at least one success and one failure of your expectation to demonstrate its logic. The result format must be set to complete, and you must set the [unexpected_index_column_names](#result-format) variable. !!! note For any examples where you will have unexpected results you must define unexpected_index_list in your "out" element. This will be validated during the testing phase. 3) The metric **must** be the same you defined in the metric definition. 4) You **must** define all [additional parameters](#parameters) that the user has to/should provide to the expectation. 5) You **should** define any default values for your expectations parameters. 6) You must **define** the `_validate` method like shown in the example. You **must** call the `validate_result` function inside your validate method, this process adds a validation to the unexpected index list in the examples. !!! note If your custom expectation requires any extra validations, or you require additional fields to be returned on the final dataframe, you can add them in this function. The validate_result method has two optional parameters (`partial_success` and `partial_result`) that can be used to pass the result of additional validations and add more information to the result key of the returned dict respectively. ```python 1) class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation): """Expect values in column A to be lower or equal than column B. Args: column_A: The first column name. column_B: The second column name. margin: additional approximation to column B value. Keyword Args: allow_cross_type_comparisons: If True, allow comparisons between types (e.g. integer and string). Otherwise, attempting such comparisons will raise an exception. 
ignore_row_if: "both_values_are_missing", "either_value_is_missing", "neither" (default). result_format: Which output mode to use: `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`. include_config: If True (default), then include the expectation config as part of the result object. catch_exceptions: If True, then catch exceptions and include them as part of the result object. Default: False. meta: A JSON-serializable dictionary (nesting allowed) that will be included in the output without modification. Returns: An ExpectationSuiteValidationResult. """ 2) examples = [ { "dataset_name": "Test Dataset", "data": { "a": [11, 22, 50], "b": [10, 21, 100], "c": [9, 21, 30], }, "schemas": { "spark": {"a": "IntegerType", "b": "IntegerType", "c": "IntegerType"} }, "tests": [ { "title": "negative_test", "exact_match_out": False, "include_in_gallery": True, "in": { "column_A": "a", "column_B": "c", "result_format": { "result_format": "COMPLETE", "unexpected_index_column_names": ["c"], "include_unexpected_rows": True, }, }, "out": { "success": False, "unexpected_index_list": [ {"c": 9, "a": 11}, {"c": 21, "a": 22}, {"c": 30, "a": 50}, ], }, }, { "title": "positive_test", "exact_match_out": False, "include_in_gallery": True, "in": { "column_A": "a", "column_B": "b", "margin": 1, "result_format": { "result_format": "COMPLETE", "unexpected_index_column_names": ["a"], }, }, "out": {"success": True}, }, ], }, ] 3) map_metric = "column_pair_values.a_smaller_or_equal_than_b" 4) success_keys = ( "margin", "mostly", ) 5) default_kwarg_values = { "ignore_row_if": "never", "result_format": "BASIC", "include_config": True, "catch_exceptions": False, "mostly": 1, } 6) def _validate( self, configuration: ExpectationConfiguration, metrics: Dict, runtime_configuration: Optional[dict] = None, execution_engine: Optional[ExecutionEngine] = None, ) -> dict: """Custom implementation of the GX _validate method. 
This method is used on the tests to validate both the result of the tests themselves and if the unexpected index list is correctly generated. The GX test logic does not do this validation, and thus we need to make it manually. Args: configuration: Configuration used in the test. metrics: Test result metrics. runtime_configuration: Configuration used when running the expectation. execution_engine: Execution Engine where the expectation was run. Returns: Dictionary with the result of the validation. """ return validate_result(self, configuration, metrics) ``` ### Printing the Expectation Diagnostics Your expectations **must** include the ability to call the Great Expectations diagnostic function in order to be validated. In order to do this, the following code **must** be present. ```python """Mandatory block of code. If it is removed the expectation will not be available.""" if __name__ == "__main__": # test the custom expectation with the function `print_diagnostic_checklist()` ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist() ``` ## Creation Process 1) Create a branch from lakehouse engine. 2) Create a custom expectation with your specific logic: 1. All new expectations must be placed inside folder `/lakehouse_engine/dq_processors/custom_expectations`. 2. The name of the expectation must be added to the file `/lakehouse_engine/core/definitions.py`, to the variable: `CUSTOM_EXPECTATION_LIST`. 3. All new expectations must be tested on `/tests/feature/custom_expectations/test_custom_expectations.py`. In order to create a new test for your custom expectation it is necessary to: - Copy one of the expectation folders in `tests/resources/feature/custom_expectations` renaming it to your custom expectation. - Make any necessary changes on the data/schema file present. - On `/tests/feature/custom_expectations/test_custom_expectations.py` add a scenario to test your expectation, all expectations must be tested on batch and streaming. 
The test is implemented to generate an acon based on each scenario data. - Test your developments to check that everything is working as intended. 3) When the development is completed, create a pull request with your changes. 4) Your expectation will be available with the next release of the lakehouse engine that happens after your pull request is approved. This means that you need to upgrade your version of the lakehouse engine in order to use it. ## Usage Custom Expectations are available to use like any other expectations provided by Great Expectations. ## Parameters Depending on the type of expectation you are defining some parameters are expected by default. Ex: A ColumnMapExpectation has a default "column" parameter. ### Mostly [Mostly](https://docs.greatexpectations.io/docs/reference/learn/expectations/standard_arguments/#mostly) is a standard parameter for a subset of expectations that is used to define a threshold for the failure of an expectation. Ex: A mostly value of 0.7 makes it so that the expectation only fails if more than 30% of records have a negative result (i.e., it succeeds as long as at least 70% of the records pass). ## Result Format Great Expectations has several different types of [result formats](https://docs.greatexpectations.io/docs/reference/learn/expectations/result_format/) for the expectations results. The lakehouse engine requires the result format to be set to "COMPLETE" in order to tag the lines where the expectations failed. ### `unexpected_index_column_names` Inside this key you must define what columns are used as an index inside your data. If this is set and the result format is set to "COMPLETE" a list with the indexes of the lines that failed the validation will be returned by Great Expectations. This information is used by the Lakehouse Engine to tag the lines in error after the fact. The additional tests inside the `_validate` method verify that the custom expectation is tagging these lines correctly. 
================================================ FILE: lakehouse_engine_usage/data_quality/data_quality.md ================================================ # Data Quality The Data Quality framework is based on [Great Expectations (GX)](https://greatexpectations.io/) and other custom-made developments, providing a very light abstraction on top of the GX open source framework and the Spark framework. ## How to use Data Quality? ### Data Loader You can define data quality rules inside the DataLoader algorithm that you use to load data. !!! note The DataLoader algorithm allows you to store the results of the data quality checks inside your custom location using the **result_sink** options (e.g., a delta table on your data product). Using result sink unlocks the capability to store DQ results having history over all the DQ executions, which can be used for debugging, to create **DQ dashboards** on top of the data, and much more. **Examples**: In these examples, dummy sales local data is used to cover a few example usages of the DQ Framework (based on Great Expectations). The main difference between the sample acons is on the usage of `dq_specs`. - 1 - [Minimal Example applying DQ with the Required Parameters](minimal_example/minimal_example.md) - 2 - [Configure Result Sink](result_sink/result_sink.md) - 3 - [Validations Failing](validations_failing/validations_failing.md) - 4 - [Row Tagging](row_tagging/row_tagging.md) **Disclaimer:** even though the `"dq_type": "validator"` is still supported (as presented on this template), our recommendation is to use `"dq_type": "prisma"`, which offers many more features end to end (from DQ Rules creation, execution until results analysis) and a configurable central observability with standard offering of Dashboarding on top. The DQ Type validator and the result_sink is still supported for very specific use cases that might still exist and for which it might make sense to keep using this approach. 
In case of doubt between the offerings, please feel free to reach out to us. ### Data Quality Validator The DQValidator algorithm focuses on validating data (e.g., spark DataFrames, Files or Tables). In contrast to the `dq_specs` inside the DataLoader algorithm, the DQValidator focuses on **validating data at rest (post-mortem)** instead of validating data in-transit (before it is loaded to the destination). !!! note The DQValidator algorithm allows you to store the results of the data quality checks inside your custom location using the **result_sink** options (e.g., a delta table on your data product). Using result sink unlocks the capability to store DQ results having history over all the DQ executions, which can be used for debugging, to create **DQ dashboards** on top of the data, and much more. [Here you can find more information regarding DQValidator and examples](data_quality_validator/data_quality_validator.md). ### Reconciliator Similarly to the [Data Quality Validator](#data-quality-validator) algorithm, the Reconciliator algorithm focuses on validating data at rest (post-mortem). In contrast to the DQValidator algorithm, the Reconciliator always compares a truth dataset (e.g., spark DataFrames, Files or Tables) with the current dataset (e.g., spark DataFrames, Files or Tables), instead of executing DQ rules defined by the teams. [Here you can find more information regarding reconciliator and examples](../reconciliator/reconciliator.md). !!! note Reconciliator does not use Great Expectations, therefore Data Docs, Result Sink and other native methods are not available. ### Custom Expectations If your data has a data quality check that cannot be done with the expectations provided by Great Expectations you can create a custom expectation to make this verification. !!! note Before creating a custom expectation check if there is an expectation already created to address your needs, both in Great Expectations and the Lakehouse Engine. 
Any Custom Expectation that is too specific (using hardcoded table/column names) will be rejected. **Expectations should be generic by definition.** [Here you can find more information regarding custom expectations and examples](custom_expectations/custom_expectations.md). ### Row Tagging The row tagging strategy allows users to tag the rows that failed to be easier to identify the problems in the validations. [Here you can find all the details and examples](row_tagging/row_tagging.md). ### Prisma Prisma is part of the Lakehouse Engine DQ Framework, and it allows users to read DQ functions dynamically from a table instead of writing them explicitly in the Acons. [Here you can find more information regarding Prisma](prisma/prisma.md). ## How to check the results of the Data Quality Process? ### 1. Table/location analysis The possibility to configure a **Result Sink** allows you to store the history of executions of the DQ process. You can query the table or the location to search through data and analyse history. ### 2. Power BI Dashboard With the information expanded, interactive analysis can be built on top of the history of the DQ process. A dashboard can be created with the results that we have in `dq_specs`. To be able to have this information you need to use arguments `result_sink_db_table` and/or `result_sink_location`. Through having a dashboard, the runs and expectations can be analysed, filtered by year, month, source and run name, and you will have information about the number of runs, some statistics, status of expectations and more. Analysis such as biggest failures per expectation type, biggest failures by columns, biggest failures per source, and others can be made, using the information in the `result_sink_db_table`/`result_sink_location`. !!! note The recommendation is to use the same result sink table/location for all your dq_specs and in the dashboard you will get a preview of the status of all of them. 
================================================ FILE: lakehouse_engine_usage/data_quality/data_quality_validator/__init__.py ================================================ """ .. include::data_quality_validator.md """ ================================================ FILE: lakehouse_engine_usage/data_quality/data_quality_validator/data_quality_validator.md ================================================ # Data Quality Validator DQValidator algorithm allows DQ Validations isolated from the data load (only read and apply data quality validations). With this algorithm you have the capacity to apply the Lakehouse-Engine Data Quality Process, using [Great Expectations](https://greatexpectations.io/expectations/) functions directly into a specific dataset also making use of all the [InputSpecs](../../../reference/packages/core/definitions.md#packages.core.definitions.InputSpec) available in the engine. Validating the Data Quality, using this algorithm, is a matter of defining the data you want to read and the validations you want to do to your data, detailing the great expectations functions you want to apply on the data to assess its quality. !!! warning **This algorithm also gives the possibility to restore a previous version of a delta table or delta files in case the DQ process raises any exception. Please use it carefully!!** You may lose important commits and data. Moreover, this will highly depend on the frequency that you run your Data Quality validations. If you run your data loads daily and Data Quality validations weekly, and you define the restore_prev_version to true, this means that the table will be restored to the previous version, but the error could have happened 4 or 5 versions before. ## When to use? - **Post-Load validation**: check quality of data already loaded to a table/location - **Pre-Load validation**: check quality of the data you want to load (check DQ by reading a set of files in a specific location...) 
- **Validation of a DataFrame computed in the notebook itself** (e.g. check data quality after joining or filtering datasets, using the computed DataFrame as input for the validation) This algorithm also gives teams some freedom to: - **Schedule isolated DQ Validations to run periodically**, with the frequency they need; - Define a DQ Validation process **as an end-to-end test** of the respective data product. ## How to use? All of these configurations are passed via the ACON to instantiate a [DQValidatorSpec object](../../../reference/packages/core/definitions.md#packages.core.definitions.DQValidatorSpec). The DQValidator algorithm uses an ACON to configure its execution. In [DQValidatorSpec](../../../reference/packages/core/definitions.md#packages.core.definitions.DQValidatorSpec) you can find the meaning of each ACON property. Here is an example of ACON configuration: ```python from lakehouse_engine.engine import execute_dq_validation acon = { "input_spec": { "spec_id": "sales_source", "read_type": "batch", "data_format": "table", "db_table": "my_database.my_table" }, "dq_spec": { "spec_id": "dq_sales", "input_id": "sales_source", "dq_type": "validator", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/in/feature/dq_validator/dq", "result_sink_db_table": "my_database.dq_validator", "result_sink_format": "json", "fail_on_error": False, "dq_functions": [ {"function": "expect_column_to_exist", "args": {"column": "article"}}, { "function": "expect_table_row_count_to_be_between", "args": {"min_value": 3, "max_value": 11}, }, ], }, "restore_prev_version": True, } execute_dq_validation(acon=acon) ``` On this page you will also find the following examples of usage: 1. Dataframe as input & Success on the DQ Validation 2. Table as input & Failure on DQ Validation & Restore previous version 3. Files as input & Failure on DQ Validation & Fail_on_error disabled 4. Files as input & Failure on DQ Validation & Critical functions defined 5.
Files as input & Failure on DQ Validation & Max failure percentage defined ### Example 1 : Dataframe as input & Success on the DQ Validation This example focuses on using a dataframe, computed in this notebook, directly in the input spec. First, a new DataFrame is generated as a result of the join of data from two tables (dummy_deliveries and dummy_pd_article) and some DQ Validations are applied on top of this dataframe. ```python from lakehouse_engine.engine import execute_dq_validation input_df = spark.sql(""" SELECT a.*, b.article_category, b.article_color FROM my_database.dummy_deliveries a JOIN my_database.dummy_pd_article b ON a.article_id = b.article_id """ ) acon = { "input_spec": { "spec_id": "deliveries_article_input", "read_type": "batch", "data_format": "dataframe", "df_name": input_df, }, "dq_spec": { "spec_id": "deliveries_article_dq", "input_id": "deliveries_article_input", "dq_type": "validator", "bucket": "my_data_product_bucket", "result_sink_db_table": "my_database.dq_validator_deliveries", "result_sink_location": "my_dq_path/dq_validator/dq_validator_deliveries/", "expectations_store_prefix": "dq/dq_validator/expectations/", "validations_store_prefix": "dq/dq_validator/validations/", "checkpoint_store_prefix": "dq/dq_validator/checkpoints/", "unexpected_rows_pk": ["salesorder", "delivery_item", "article_id"], "dq_functions": [{"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}}], }, "restore_prev_version": False, } execute_dq_validation(acon=acon) ``` ### Example 2: Table as input & Failure on DQ Validation & Restore previous version In this example we are using a table as input to validate the data that was loaded. Here, we are forcing the DQ Validations to fail in order to show the possibility of restoring the table to the previous version. !!! warning **Be careful when using the feature of restoring a previous version of a delta table or delta files.** You may lose important commits and data. 
Moreover, this will highly depend on the frequency that you run your Data Quality validations. If you run your data loads daily and Data Quality validations weekly, and you define the restore_prev_version to true, this means that the table will be restored to the previous version, but the error could have happened 4 or 5 versions before (because loads are daily, validations are weekly). Steps followed in this example to show how the restore_prev_version feature works. 1. **Insert rows into the dummy_deliveries table** to adjust the total numbers of rows and **make the DQ process fail**. 2. **Use the "DESCRIBE HISTORY" statement to check the number of versions available on the table** and check the version number resulting from the insertion to the table. 3. **Execute the DQ Validation**, using the configured acon (based on reading the dummy_deliveries table and setting the `restore_prev_version` to `true`). Checking the logs of the process, you can see that the data did not pass all the expectations defined and that the table version restore process was triggered. 4. **Re-run a "DESCRIBE HISTORY" statement to check that the previous version of the table was restored** and thus, the row inserted in the beginning of the process is no longer present in the table. 
```python from lakehouse_engine.engine import execute_dq_validation # Force failure of data quality by adding new row spark.sql("""INSERT INTO my_database.dummy_deliveries VALUES (7, 1, 20180601, 71, "article1", "delivered")""") # Check history of the table spark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""") acon = { "input_spec": { "spec_id": "deliveries_input", "read_type": "batch", "db_table": "my_database.dummy_deliveries", }, "dq_spec": { "spec_id": "dq_deliveries", "input_id": "deliveries_input", "dq_type": "validator", "bucket": "my_data_product_bucket", "tbl_to_derive_pk": "my_database.dummy_deliveries", "dq_functions": [ {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 19}}, ], }, "restore_prev_version": True, } execute_dq_validation(acon=acon) # Check that the previous version of the table was restored spark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""") ``` ### Example 3: Files as input & Failure on DQ Validation & Fail_on_error disabled In this example we are using a location as input to validate the files in a specific folder. Here, we are forcing the DQ Validations to fail, however disabling the "fail_on_error" configuration, so the algorithm warns about the expectations that failed but the process/the execution of the algorithm doesn't fail. 
```python from lakehouse_engine.engine import execute_dq_validation acon = { "input_spec": { "spec_id": "deliveries_input", "data_format": "delta", "read_type": "streaming", "location": "s3://my_data_product_bucket/silver/dummy_deliveries/", }, "dq_spec": { "spec_id": "dq_deliveries", "input_id": "deliveries_input", "dq_type": "validator", "bucket": "my_data_product_bucket", "tbl_to_derive_pk": "my_database.dummy_deliveries", "fail_on_error": False, "dq_functions": [ {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}}, ], }, "restore_prev_version": False, } execute_dq_validation(acon=acon) ``` ### Example 4: Files as input & Failure on DQ Validation & Critical functions defined In this example we are using a location as input to validate the files in a specific folder. Here, we are forcing the DQ Validations to fail by using the critical functions feature, which will throw an error if any of the functions fails. 
```python from lakehouse_engine.engine import execute_dq_validation acon = { "input_spec": { "spec_id": "deliveries_input", "data_format": "delta", "read_type": "streaming", "location": "s3://my_data_product_bucket/silver/dummy_deliveries/", }, "dq_spec": { "spec_id": "dq_deliveries", "input_id": "deliveries_input", "dq_type": "validator", "bucket": "my_data_product_bucket", "tbl_to_derive_pk": "my_database.dummy_deliveries", "fail_on_error": True, "dq_functions": [ {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}}, ], "critical_functions": [ {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}}, ], }, "restore_prev_version": False, } execute_dq_validation(acon=acon) ``` ### Example 5: Files as input & Failure on DQ Validation & Max failure percentage defined In this example we are using a location as input to validate the files in a specific folder. Here, we are forcing the DQ Validations to fail by using the max_percentage_failure, which will throw an error if the percentage of failures surpasses the defined maximum threshold. 
```python from lakehouse_engine.engine import execute_dq_validation acon = { "input_spec": { "spec_id": "deliveries_input", "data_format": "delta", "read_type": "streaming", "location": "s3://my_data_product_bucket/silver/dummy_deliveries/", }, "dq_spec": { "spec_id": "dq_deliveries", "input_id": "deliveries_input", "dq_type": "validator", "bucket": "my_data_product_bucket", "tbl_to_derive_pk": "my_database.dummy_deliveries", "fail_on_error": True, "dq_functions": [ {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}}, ], "max_percentage_failure": 0.2, }, "restore_prev_version": False, } execute_dq_validation(acon=acon) ``` ## Limitations Unlike DataLoader, this new DQValidator algorithm only allows, for now, one input_spec (instead of a list of input_specs) and one dq_spec (instead of a list of dq_specs). There are plans and efforts already initiated to support a list of input_specs and a list of dq_specs. However, you can prepare a Dataframe which joins more than one source, and use it as input, in case you need to assess the Data Quality from different sources at the same time. Alternatively, you can also show interest in any enhancement of this feature, as well as contributing yourself. ================================================ FILE: lakehouse_engine_usage/data_quality/minimal_example/__init__.py ================================================ """ .. include::minimal_example.md """ ================================================ FILE: lakehouse_engine_usage/data_quality/minimal_example/minimal_example.md ================================================ # Minimal Example This scenario illustrates the minimal configuration that you can have to use `dq_specs`, in which it uses required parameters: `spec_id, input_id, dq_type, bucket, dq_functions`.
Regarding the dq_functions, it uses 3 functions (retrieved from the expectations supported by GX), which check: - **expect_column_to_exist** - if a column exists in the data; - **expect_table_row_count_to_be_between** - if the row count of the data is between the defined interval; - **expect_table_column_count_to_be_between** - if the number of columns in the data is below the max value defined. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "dummy_deliveries_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True, }, "location": "s3://my_data_product_bucket/dummy_deliveries/", } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "dummy_deliveries_source", "dq_type": "validator", "bucket": "my_data_product_bucket", "tbl_to_derive_pk": "my_database.dummy_deliveries", "dq_functions": [ {"function": "expect_column_to_exist", "args": {"column": "salesorder"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}}, {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}}, ], } ], "output_specs": [ { "spec_id": "dummy_deliveries_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/", } ], } load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/data_quality/prisma/__init__.py ================================================ """ .. include::prisma.md """ ================================================ FILE: lakehouse_engine_usage/data_quality/prisma/prisma.md ================================================ # Prisma Prisma is part of the Lakehouse Engine DQ Framework, and it allows users to read DQ functions dynamically from a table instead of writing them explicitly in the Acons. ## How to use Prisma?
- Use the Lakehouse Engine version: 1.22.0 or later; - Use DBR 13.3 or later. If you are not using Databricks, ensure a similar environment with Spark 3.4.1 and Delta 2.4.0. - Create the DQ Checks in a table in your Data Product: - Each data quality check conducted in Prisma will be hosted within the bucket defined in the engine config file (lakehouse_engine/configs/engine.yaml). Consequently, the result sink location will receive the results of their assessments at the granularity of each "run", capturing all records generated during every operation. The DQ Checks table is located in the demanding data product and can have any name (i.e: data_quality_checks). - The idea is for it to be a central bucket for all DPs to ensure easier and better observability and unlock offering of easier insights over the Data Quality of the Lakehouse. Below you find a DDL example with the expected schema and description for the fields: ```sql DROP TABLE IF EXISTS my_database.data_quality_checks; CREATE EXTERNAL TABLE my_database.data_quality_checks ( dq_rule_id STRING COMMENT 'DQ Rule ID.', dq_tech_function STRING COMMENT 'Great Expectations function type to apply according to the DQ rules type. Example: expect_column_to_exist.', execution_point STRING COMMENT 'In motion/At rest.', schema STRING COMMENT 'The database schema on which the check is to be applied.', table STRING COMMENT 'The table on which the check is to be applied.', column STRING COMMENT 'The column (either on Lakehouse or in other accessible source systems, such as FDP or SAP BW) on which the check is to be applied.', filters STRING COMMENT 'General filters to the data set (where part of the statement). Note: this is purely descriptive at this point as there is no automated action/filtering of the Lakehouse Engine or PRISMA upon it.', arguments STRING COMMENT 'Additional arguments to run the Great Expectation Function in the same order as they appear in the function. 
Example: {"column": "amount", "min_value": 0}.', dimension STRING COMMENT 'Data Quality dimension.' ) USING DELTA LOCATION 's3://my-data-product-bucket/inbound/data_quality_checks' COMMENT 'Table with dummy data mapping DQ Checks.' TBLPROPERTIES( 'lakehouse.primary_key'='dq_rule_id', 'delta.enableChangeDataFeed'='true' ) ``` **Data sample:** | dq_rule_id | dq_tech_function | execution_point | schema | table | column | filters | arguments | dimension | |------------|:------------------------------------------|:----------------|:-------------------|:------------|:-------------|:--------|:--------------------------------------------------|--------------| | 1 | expect_column_values_to_not_be_null | at_rest | my_database_schema | dummy_sales | ordered_item | | {"column": "ordered_item"} | Completeness | | 2 | expect_column_min_to_be_between | in_motion | my_database_schema | dummy_sales | ordered_item | | {"column": "amount", "min_value": 0} | Completeness | | 3 | expect_column_values_to_not_be_in_set | in_motion | my_database_schema | dummy_sales | ordered_item | | {"column": "amount", "value_set": [1,2,3]} | Completeness | | 4 | expect_column_pair_a_to_be_not_equal_to_b | at_rest | my_database_schema | dummy_sales | ordered_item | | {"column_A": "amount","column_B": "ordered_item"} | Completeness | | 5 | expect_table_row_count_to_be_between | at_rest | my_database_schema | dummy_sales | ordered_item | | {"min_value": 1, "max_value": 10} | Completeness | **Table definition:** | Column Name | Definition | |------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | dq_rule_id | The identifier of a data quality rule. | | dq_tech_function | Type of Great Expectations function to apply according to the DQ rules type. 
See the values here: [Gallery of Expectations and Packages](https://greatexpectations.io/legacy/v1/expectations/?filterType=Backend+support&viewType=Summary&showFilters=true&subFilterValues=spark) | | execution_point | The way validations will be performed on top of the data set. List of values: at_rest, in_motion. | | schema | The schema on which the check is to be applied. | | table | The table on which the check is to be applied. | | column | The column on which the check is to be applied. | | filters | General filters to the data set (where part of the statement). **Note**: this is purely descriptive at this point as there is no automated action/filtering of the Lakehouse Engine or PRISMA upon it. | | arguments | Additional arguments to run the Great Expectation Function in the same order as they appear in the function. | | dimension | Categorisation of a DQ rule related to one of the dimensions. List of values: Completeness, Uniqueness, Timeliness, Validity, Consistency, Accuracy. **Note**: these values are purely descriptive. | **Execution behaviour** - The value of the **execution_point** column determines the type of Acon execution: - **For records at_rest**, they will only be processed when the Lakehouse engine is called by the execute_dq_validation() function. - **For records in_motion**, they will only be processed when the Lakehouse engine is called by load_data() function. ## What are the main changes on my ACON if I already implemented DQ? The following configurations represent the minimum requirements to make Prisma DQ work. - **dq_type:** "prisma" - the value must be set in order for the engine to process the DQ with Prisma; - **store_backend:** "file_system" or "s3" - which store backend to use; - **bucket** - the bucket name to consider for the store_backend (store DQ artefacts). **Note**: only applicable and mandatory for store_backend s3. - **local_fs_root_dir:** path of the root directory.
**Notes**: only applicable for store_backend file_system; - **dq_db_table:** the DQ Check table that is located in the demanding data product; - **dq_table_table_filter:** name of the table which rules are to be applied in the validations. The table name must match with the values inserted in the column "table" from dq_db_table; - **data_product_name:** the name of the data product; - **tbl_to_derive_pk or unexpected_rows_pk:** - tbl_to_derive_pk - automatically derive the primary keys from a given database table. **Note**: the primary keys are derived from the **lakehouse.primary_key** property of a table. - unexpected_rows_pk - the list of columns composing the primary key of the source data to identify the rows failing the DQ validations. **DQ Prisma Acon example** ```python "dq_specs": [ { "spec_id": "dq_validator_in_motion", "input_id": "dummy_sales_transform", "dq_type": "prisma", "store_backend": "file_system", "local_fs_root_dir": "/my-data-product/artefacts/dq", "dq_db_table": DQ_DB_TABLE, "dq_table_table_filter": "dummy_sales", "data_product_name": DATA_PRODUCT_NAME, "tbl_to_derive_pk": DB_TABLE, } ], ``` !!! note Available extra parameters to use in the DQ Specs for Prisma: - **data_docs_local_fs** - the path for data docs. The parameter is useful in case you want your DQ Results to be reflected on the automatic Data Docs site; - **data_docs_prefix** - prefix where to store data_docs' data. This parameter must be used together with `data_docs_local_fs`; - **dq_table_extra_filters** - extra filters to be used when deriving DQ functions. This is an SQL expression to be applied to `dq_db_table` which means that the statements must use one of the available columns in the table. For example: dq_rule_id in ('rule1','rule2'); - **data_docs_bucket** - the bucket name for data docs only. When defined, it will supersede bucket parameter. **Note:** only applicable for store_backend s3; - **expectations_store_prefix** - prefix where to store expectations' data. 
**Note:** only applicable for store_backend s3; - **validations_store_prefix** - prefix where to store validations' data. **Note:** only applicable for store_backend s3; - **checkpoint_store_prefix** - prefix where to store checkpoints' data. **Note:** only applicable for store_backend s3; ## End2End Example Below you can also find an End2End and detailed example of loading data into the DQ Checks table and then using PRISMA both with load_data() and execute_dq_validation(). ??? example "**1 - Load the DQ Checks Table**" This example shows how to insert data into the data_quality_checks table using an Acon with a csv file as a source. The location provided is just an example of a place to store the csv. It is also important that the source file contains the **data_quality_checks** schema. ```python acon = { "input_specs": [ { "spec_id": "read_dq_checks", "read_type": "batch", "data_format": "csv", "options": {"header": True, "delimiter": ";"}, "location": "s3://my-data-product/local_data/data_quality_checks/", } ], "output_specs": [ { "spec_id": "write_dq_checks", "input_id": "read_dq_checks", "write_type": "overwrite", "data_format": "delta", "location": "s3://my-data-product-bucket/inbound/data_quality_checks", } ], } load_data(acon=acon) ``` ??? 
example "**2 - PRISMA - IN MOTION (load_data)**" ```python cols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"} acon = { "input_specs": [ { "spec_id": "dummy_sales_bronze", "read_type": "batch", "data_format": "delta", "location": "s3://my-data-product-bucket/bronze/dummy_sales", } ], "transform_specs": [ { "spec_id": "dummy_sales_transform", "input_id": "dummy_sales_bronze", "transformers": [ { "function": "rename", "args": { "cols": cols_to_rename, }, }, ], } ], "dq_specs": [ { "spec_id": "dq_validator_in_motion", "input_id": "dummy_sales_transform", "dq_type": "prisma", "store_backend": "file_system", "local_fs_root_dir": "/my-data-product/artefacts/dq", "dq_db_table": DQ_DB_TABLE, "dq_table_table_filter": "dummy_sales", "dq_table_extra_filters": "1 = 1", "data_docs_local_fs": "my-data-product/my-data-product-dq-site", "data_docs_prefix": "{}/my-data-product-bucket/data_docs/site/".format(DQ_PREFIX), "data_product_name": DATA_PRODUCT_NAME, "tbl_to_derive_pk": DB_TABLE, } ], "output_specs": [ { "spec_id": "dummy_sales_silver", "input_id": "dq_validator_in_motion", "write_type": "overwrite", "data_format": "delta", "location": "s3://my-data-product-bucket/silver/dummy_sales_dq_template_in_motion", } ], } load_data(acon=acon) ``` ??? 
example "**3 - PRISMA - AT REST (execute_dq_validation)**" ```python acon = { "input_spec": { "spec_id": "dummy_sales_source", "read_type": "batch", "db_table": DB_TABLE, }, "dq_spec": { "spec_id": "dq_validator_at_rest", "input_id": "dummy_sales_source", "dq_type": "prisma", "store_backend": "file_system", "local_fs_root_dir": "/my-data-product/artefacts/dq", "dq_db_table": DQ_DB_TABLE, "dq_table_table_filter": "dummy_sales", "data_docs_local_fs": "my-data-product/my-data-product-dq-site", "data_docs_prefix": "{}/my-data-product-bucket/data_docs/site/".format(DQ_PREFIX), "data_product_name": DATA_PRODUCT_NAME, "tbl_to_derive_pk": DB_TABLE, }, } execute_dq_validation(acon=acon) ``` ## Troubleshooting/Common issues This section provides a summary of common issues and resolutions. ??? warning "**Error type: filter does not get rules from DQ Checks table.**" image **Solution**: make sure the records in your DQ Checks table are well-defined. In the Acon, ensure that you have the dq_table_table_filter with the correct table name. ??? warning "**Error type: missing expectation.**" image **Solution**: make sure that you are using a valid expectation. See the valid ones on: [Gallery of Expectations and Packages](https://greatexpectations.io/legacy/v1/expectations/?filterType=Backend+support&viewType=Summary&showFilters=true&subFilterValues=spark) ??? warning "**Error type: missing expectation parameters.**" image **Solution**: make sure that your "arguments" column in the DQ CHECKS table has all necessary parameters for the expectation. For example, the expectation [expect_column_values_to_not_be_null](https://greatexpectations.io/legacy/v1/expectations/expect_column_values_to_not_be_null?filterType=Backend%20support&gotoPage=1&showFilters=true&viewType=Summary&subFilterValues=spark) needs one argument (column (str): The column name).
================================================ FILE: lakehouse_engine_usage/data_quality/result_sink/__init__.py ================================================ """ .. include::result_sink.md """ ================================================ FILE: lakehouse_engine_usage/data_quality/result_sink/result_sink.md ================================================ # Result Sink These scenarios store the results of the dq_specs into a result sink. For that, both scenarios include parameters defining the specific table and location (`result_sink_db_table` and `result_sink_location`) where the results are expected to be stored. With this configuration, people can, later on, check the history of the DQ executions using the configured table/location, as shown below. You can configure saving the output of the results in the result sink following two approaches: - [**Denormalized/exploded Data Model (recommended)**](#1-result-sink-exploded-recommended) - the results are stored in a detailed format in which people are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments. | ... | source | column | max_value | min_value | expectation_type | expectation_success | observed_value | run_time_year | ... | |-----------------------------|------------|------------|-----------|-----------|-----------------------------------------|---------------------|----------------|---------------|-----| | all columns from raw + more | deliveries | salesorder | null | null | expect_column_to_exist | TRUE | null | 2023 | ... | | all columns from raw + more | deliveries | null | null | null | expect_table_row_count_to_be_between | TRUE | 23 | 2023 | ... | | all columns from raw + more | deliveries | null | null | null | expect_table_column_count_to_be_between | TRUE | 6 | 2023 | ... | - [**Raw Format Data Model (not recommended)**](#2-raw-result-sink) - the results are stored in the raw format that Great Expectations outputs.
This is not recommended as the data will be highly nested and in a string format (to prevent problems with schema changes), which makes analysis and the creation of a dashboard on top way harder. | checkpoint_config | run_name | run_time | run_results | success | validation_result_identifier | spec_id | input_id | |----------------------|----------------------------|----------------------------------|-------------------------------|------------------------|------------------------------|---------|----------| | entire configuration | 20230323-...-dq_validation | 2023-03-23T15:11:32.225354+00:00 | results of the 3 expectations | true/false for the run | identifier | spec_id | input_id | !!! note - More configurations can be applied in the result sink, such as the file format and partitions. - It is recommended to: - Use the same result sink table/location for all dq_specs across different data loads, from different sources, in the same Data Product. - Use the parameter `source` (only available with `"result_sink_explode": True`), in the dq_specs, as used in both scenarios, with the name of the data source, to make it easier to distinguish sources in the analysis. If not specified, the `input_id` of the dq_spec will be considered as the `source`. - These recommendations will enable richer analysis/dashboard at Data Product level, considering all the different sources and data loads that the Data Product is having. ## 1. Result Sink Exploded (Recommended) This scenario stores DQ Results (results produced by the execution of the dq_specs) in the Result Sink, in a detailed format, in which people are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments. This is the recommended approach since it makes the analysis on top of the result sink way easier and faster.
For achieving the exploded data model, this scenario introduces the parameter `result_sink_explode`, which is a flag to determine if the output table/location should have the columns exploded (as `True`) or not (as `False`). **Default:** `True`, but it is still provided explicitly in this scenario for demo purposes. The table/location will include a schema which contains general columns, statistic columns, arguments of expectations, and others, thus part of the schema will be always with values and other part will depend on the expectations chosen. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "dummy_deliveries_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True, }, "location": "s3://my_data_product_bucket/dummy_deliveries/", } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "dummy_deliveries_source", "dq_type": "validator", "bucket": "my_data_product_bucket", "result_sink_db_table": "my_database.dq_result_sink", "result_sink_location": "my_dq_path/dq_result_sink/", "result_sink_explode": True, "tbl_to_derive_pk": "my_database.dummy_deliveries", "source": "deliveries_success", "dq_functions": [ {"function": "expect_column_to_exist", "args": {"column": "salesorder"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}}, {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}}, ], } ], "output_specs": [ { "spec_id": "dummy_deliveries_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/", } ], } load_data(acon=acon) ``` To check the history of the DQ results, you can run commands like: - the table: `display(spark.table("my_database.dq_result_sink"))` - the location: `display(spark.read.format("delta").load("my_dq_path/dq_result_sink/"))` ## 2. 
Raw Result Sink This scenario is very similar to the previous one, but it changes the parameter `result_sink_explode` to `False` so that it produces a raw result sink output containing only one row representing the full run of `dq_specs` (no matter the amount of expectations/dq_functions defined there). Being a raw output, **it is not a recommended approach**, as it will be more complicated to analyse and make queries on top of it. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "dummy_deliveries_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True, }, "location": "s3://my_data_product_bucket/dummy_deliveries/", } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "dummy_deliveries_source", "dq_type": "validator", "bucket": "my_data_product_bucket", "result_sink_db_table": "my_database.dq_result_sink_raw", "result_sink_location": "my_dq_path/dq_result_sink_raw/", "result_sink_explode": False, "tbl_to_derive_pk": "my_database.dummy_deliveries", "source": "deliveries_success_raw", "dq_functions": [ {"function": "expect_column_to_exist", "args": {"column": "salesorder"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}}, {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}}, ], } ], "output_specs": [ { "spec_id": "dummy_deliveries_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/", } ], } load_data(acon=acon) ``` To check the history of the DQ results, you can run commands like: - the table: `display(spark.table("my_database.dq_result_sink_raw"))` - the location: `display(spark.read.format("delta").load("my_dq_path/dq_result_sink_raw/"))` ================================================ FILE: lakehouse_engine_usage/data_quality/row_tagging/__init__.py 
================================================ """ .. include::row_tagging.md """ ================================================ FILE: lakehouse_engine_usage/data_quality/row_tagging/row_tagging.md ================================================ # Row Tagging Data quality is essential for any organisation that relies on data to make informed decisions. High-quality data provides accurate, reliable, and timely information that enables organisations to identify opportunities, mitigate risks, and optimize their operations. In contrast, low-quality data can lead to incorrect conclusions, faulty decisions, and wasted resources. There are several common issues that can compromise data quality, such as: - data entry errors; - data duplication; - incomplete / inconsistent data; - changes where data is collected (e.g. sources); - faulty data processing, such as inaccurate data cleansing or transformations. Therefore, implementing data quality controls, such as data validation rules, and regularly monitoring data for accuracy and completeness is key for any organisation. One of these controls that can be applied is the **DQ Row Tagging Strategy** so that you not only apply validations on your data to ensure Data Quality, but you also tag your data with the results of the Data Quality validations providing advantages like: - Transparency for downstream and upstream consumers; - Data Observability and Reliability; - More trust over the data; - Anomaly Detection; - Easier and faster discovery of Data Quality problems, and, consequently faster resolution; - Makes it easier to deal with integrations with other systems and migrations (you can have validations capturing that a column was changed or simply disappeared); !!! note When using the DQ Row Tagging approach data availability will take precedence over Data Quality, meaning that all the data will be introduced into the final target (e.g. table or location) no matter what Data Quality issues it is having. 
Different Types of Expectations: - Table Level - Column Aggregated Level - Query Level - Column Values (**row level**) - Column Pair Value (**row level**) - Multicolumn Values (**row level**) The expectations highlighted as **row level** will be the ones enabling to Tag failures on specific rows and adding the details about each failure (they affect the field **run_row_result** inside **dq_validations**). The expectations with other levels (not row level) influence the overall result of the Data Quality execution, but won't be used to tag specific rows (they affect the field **run_success** only, so you can even have situations for which you get **run_success False** and **run_row_success True** for all rows). ## How does the Strategy work? The strategy relies mostly on the 6 arguments below. !!! note When you specify `"tag_source_data": True` the arguments **fail_on_error**, **gx_result_format** and **result_sink_explode** are set to the expected values. - **unexpected_rows_pk** - the list of columns composing the primary key of the source data to use to identify the rows failing the DQ validations. - **tbl_to_derive_pk** - `db.table` to automatically derive the unexpected_rows_pk from. - **gx_result_format** - great expectations result format. Default: `COMPLETE`. - **tag_source_data** - flag to enable the tagging strategy in the source data, adding the information of the DQ results in a column `dq_validations`. This column makes it possible to identify if the DQ run succeeded in general and, if not, it unlocks the insights to know what specific rows have made the DQ validations fail and why. Default: `False`. !!! note It only works if result_sink_explode is `True`, result_format is `COMPLETE` and fail_on_error is `False`. - **fail_on_error** - whether to fail the algorithm if the validations of your data in the DQ process failed. 
- **result_sink_explode** - flag to determine if the output table/location should have the columns exploded (as `True`) or not (as `False`). Default: `True`. !!! note It is mandatory to provide one of the arguments (**unexpected_rows_pk** or **tbl_to_derive_pk**) when using **tag_source_data** as **True**. When **tag_source_data** is **False**, this is not mandatory, but **still recommended**. !!! note The tagging strategy only works when `tag_source_data` is `True`, which automatically assigns the expected values for the parameters `result_sink_explode` (True), `fail_on_error` (False) and `gx_result_format` ("COMPLETE"). !!! note For the DQ Row Tagging to work, in addition to configuring the aforementioned arguments in the dq_specs, you will also need to add the **dq_validations** field into your table (your DDL statements, **recommended**) or enable schema evolution. !!! note Kwargs field is a string, because it can assume different schemas for different expectations and runs. It is useful to provide the complete picture of the **row level failure** and to allow filtering/joining with the result sink table, when there is one. Some examples of kwargs below: - `{"column": "country", "min_value": 1, "max_value": 2, "batch_id": "o723491yyr507ho4nf3"}` → example for expectations starting with `expect_column_values` (they always make use of "column", the other arguments vary). - `{"column_A": "country", "column_B": "city", "batch_id": "o723491yyr507ho4nf3"}` → example for expectations starting with `expect_column_pair` (they make use of "column_A" and "column_B", the other arguments vary). - `{"column_list": ["col1", "col2", "col3"], "batch_id": "o723491yyr507ho4nf3"}` → example for expectations starting with `expect_multicolumn` (they make use of "column_list", the other arguments vary). `batch_id` is common to all expectations, and it is an identifier for the batch of data being validated by Great Expectations. 
### Example This scenario uses the row tagging strategy, which allows users to tag the rows that failed, making it easier to identify the problems in the validations. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "dummy_deliveries_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True, }, "location": "s3://my_data_product_bucket/dummy_deliveries/", } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "dummy_deliveries_source", "dq_type": "validator", "bucket": "my_data_product_bucket", "result_sink_db_table": "my_database.dq_result_sink", "result_sink_location": "my_dq_path/dq_result_sink/", "tag_source_data": True, "tbl_to_derive_pk": "my_database.dummy_deliveries", "source": "deliveries_tag", "dq_functions": [ {"function": "expect_column_to_exist", "args": {"column": "salesorder"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}}, { "function": "expect_column_values_to_be_in_set", "args": {"column": "salesorder", "value_set": ["37"]}, }, { "function": "expect_column_pair_a_to_be_smaller_or_equal_than_b", "args": {"column_A": "salesorder", "column_B": "delivery_item"}, }, { "function": "expect_multicolumn_sum_to_equal", "args": {"column_list": ["salesorder", "delivery_item"], "sum_total": 100}, }, ], "critical_functions": [ {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 6}}, ], } ], "output_specs": [ { "spec_id": "dummy_deliveries_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/", } ], } load_data(acon=acon) ``` Running the cell below shows the new column created, named `dq_validations` with information about DQ validations. 
`display(spark.read.format("delta").load("s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/"))` ## Performance and Limitations Trade-offs When using the DQ Row Tagging Strategy, by default we are using Great Expectations Result Format "Complete" with Unexpected Index Column Names (a primary key for the failures), meaning that for each failure, we are getting all the distinct values for the primary key. After getting all the failures, we are applying some needed transformations and joining them with the source data, so that it can be tagged by filling the "dq_validations" column. Hence, this can definitely be a heavy and time-consuming operation on your data loads. To reduce this disadvantage you can cache the dataframe by passing the `"cache_df": True` in your DQ Specs. In addition to this, always have in mind that each expectation (dq_function) that you add into your DQ Specs, is more time that you are adding into your data loads, so always balance performance vs amount of validations that you need. Moreover, Great Expectations is currently relying on the driver node to capture the results of the execution and return/store them. Thus, in case you have huge amounts of rows failing (let's say 500k or more) Great Expectations might raise exceptions. In these situations, the data load will still happen and the data will still be tagged with the Data Quality validations information, however you won't have the complete picture of the failures, so the raised_exceptions field is filled as True, so that you can easily notice it and debug it. Most of the time, if you have such an amount of rows failing, it will probably mean that you did something wrong and want to fix it as soon as possible (you are not really caring about tagging specific rows, because you will not want your consumers to be consuming a million defective rows). 
However, if you still want to try to make it pass, you can try to increase your driver and play with some spark configurations like: - `spark.driver.maxResultSize` - `spark.task.maxFailures` For debugging purposes, you can also use a different [Great Expectations Result Format]( https://docs.greatexpectations.io/docs/reference/expectations/result_format/) like "SUMMARY" (adding in your DQ Spec `"gx_result_format": "SUMMARY"`), so that you get only a partial list of the failures, avoiding surpassing the driver capacity. !!! note When using a Result Format different from the default ("COMPLETE"), the flag "tag_source_data" will be overwritten to `False`, as the results of the tagging wouldn't be complete which could lead to erroneous conclusions from stakeholders (but you can always get the details about the result of the DQ execution in the `result_sink_location` or `result_sink_db_table` that you have configured). ================================================ FILE: lakehouse_engine_usage/data_quality/validations_failing/__init__.py ================================================ """ .. include::validations_failing.md """ ================================================ FILE: lakehouse_engine_usage/data_quality/validations_failing/validations_failing.md ================================================ # Validations Failing The scenarios presented on this page are similar, but their goal is to show what happens when a DQ expectation fails the validations. The logs generated by the execution of the code will contain information regarding which expectation(s) have failed and why. ## 1. Fail on Error In this scenario, the two parameters below are specified: - `"fail_on_error": False` - this parameter is what controls what happens if a DQ expectation fails. In case this is set to `true` (default), your job will fail/be aborted and an exception will be raised. 
In case this is set to `false`, a log message will be printed about the error (as shown in this scenario) and the result status will also be available in result sink (if configured) and in the [data docs great expectation site](../data_quality.html#3-data-docs-website). In this scenario it is set to `false` to avoid failing the execution of the notebook. - the `max_value` of the function `expect_table_column_count_to_be_between` is defined with specific value so that this expectation fails the validations. ```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "dummy_deliveries_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True, }, "location": "s3://my_data_product_bucket/dummy_deliveries/", } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "dummy_deliveries_source", "dq_type": "validator", "bucket": "my_data_product_bucket", "result_sink_db_table": "my_database.dq_result_sink", "result_sink_location": "my_dq_path/dq_result_sink/", "tbl_to_derive_pk": "my_database.dummy_deliveries", "source": "deliveries_fail", "fail_on_error": False, "dq_functions": [ {"function": "expect_column_to_exist", "args": {"column": "salesorder"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 20}}, {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}}, {"function": "expect_column_values_to_be_null", "args": {"column": "article"}}, {"function": "expect_column_values_to_be_unique", "args": {"column": "status"}}, { "function": "expect_column_min_to_be_between", "args": {"column": "delivery_item", "min_value": 1, "max_value": 15}, }, { "function": "expect_column_max_to_be_between", "args": {"column": "delivery_item", "min_value": 15, "max_value": 30}, }, ], } ], "output_specs": [ { "spec_id": "dummy_deliveries_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", 
"location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/", } ], } load_data(acon=acon) ``` If you run the command below, you will be able to see that the `success` column has the value `false` for the last execution. `display(spark.table(RENDER_UTILS.render_content("my_database.dq_result_sink")))` ## 2. Critical Functions In this scenario, alternative parameters to `fail_on_error` are used: - `critical_functions` - this parameter defaults to `None` if not defined. It controls what DQ functions are considered a priority and as such, it stops the validation and throws an execution error whenever a function defined as critical doesn't pass the test. If any other function that is not defined in this parameter fails, an error message is printed in the logs. This parameter has priority over `fail_on_error`. In this specific example, after defining the `expect_table_column_count_to_be_between` as critical, it is made sure that the execution is stopped whenever the conditions for the function are not met. Additionally, other parameters can also be defined, like: - `max_percentage_failure` - this parameter defaults to `None` if not defined. It controls what percentage of the total functions can fail without stopping the execution of the validation. If the threshold is surpassed the execution stops and a failure error is thrown. This parameter has priority over `fail_on_error` and `critical_functions`. You can also pair `critical_functions` with `max_percentage_failure` by defining something like a 0.6 max percentage of failure and also defining some critical function. In this case even if the threshold is respected, the list defined on `critical_functions` still is checked. 
```python from lakehouse_engine.engine import load_data acon = { "input_specs": [ { "spec_id": "dummy_deliveries_source", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": "|", "inferSchema": True, }, "location": "s3://my_data_product_bucket/dummy_deliveries/", } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "dummy_deliveries_source", "dq_type": "validator", "bucket": "my_data_product_bucket", "result_sink_db_table": "my_database.dq_result_sink", "result_sink_location": "my_dq_path/dq_result_sink/", "source": "deliveries_critical", "tbl_to_derive_pk": "my_database.dummy_deliveries", "dq_functions": [ {"function": "expect_column_to_exist", "args": {"column": "salesorder"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}}, ], "critical_functions": [ {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}}, ], } ], "output_specs": [ { "spec_id": "dummy_deliveries_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/", } ], } load_data(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/gab/__init__.py ================================================ """ .. include::gab.md """ ================================================ FILE: lakehouse_engine_usage/gab/gab.md ================================================ # GAB - Gold Asset Builder GAB stands for Gold Asset Builder and, technically, it is a SQL-first transformation workflow that allows teams to quickly and collaboratively deploy aggregate tables on top of base fact tables, which can then be used for empowering analytics over different perspectives on dashboards or exploratory queries. GAB provides the following benefits: - **Efficiency and speed**: It reduces the efforts and time to production for new aggregate tables (gold layer assets). 
- **Simple operation**: It simplifies the cluster decision by having just 3 cluster types (small, medium, large), there's no need to create a separate pipeline for each case. These cluster types are tied to the concept of workload priority in GAB (more on that later). - **Low-code first:** Focus on low-code aggregation configuration with capabilities to also orchestrate complex SQL. !!! warning Before deciding whether your use case can be supported by GAB or not, read the instructions in the sections below carefully. If there is any doubt about certain metrics which might deviate from the realm of GAB, reach out to us before starting your development and we will support you. GAB may not be a one-size-fits-all solution for your requirements, so use GAB only if it satisfies your requirements. image ## Main Advantages over Self-Orchestrated SQL - More flexibility to define any type of complex sql queries. - Only need to touch SQL, GAB takes care of all its orchestration. - Quick production rollout, adaptability and maintainability, without the need to define any complex aggregation orchestration, rerun logic, monitoring, etc. - Inner-sourcing model really works, as a data analyst can work on a SQL template and hand it over to the data engineering team, which can then adapt that SQL template and take it to production quickly after the data validation. - As shown in the image below, it's possible to generate different perspectives (dimensions - D1, D2, D3...) of different metrics (M1, M2, M3) for a specific use case: 1. **Grouping Set (dimensions D1, D2)** - Compute the same metrics at a higher grain from the finest grain. 2. **Grouping Set (dimensions D1, D2, D3)** - Compute the same metrics at the finest grain. 3. **Grouping Set (dimensions D1)** - Compute the same metrics at a higher grain. 
| D1 | D2 | D3 | M1 | M2 | M3 | | :------ | :-----:| :-----:| :-----:| :-----:| :-----:| | value 1 | value 2| NULL | 22 | 45 | 54 | | value 1 | value 2| value 3| 89 | 12 | 47 | | value 1 | NULL | NULL | 45 | 57 | 12 | ## When to use GAB? - When an aggregate result, constructed using SQL, is to be created for different levels of detail (AKA different grains) supporting analytics on dashboards or exploratory queries with some specific dimensions and metrics. - When metrics and dimensions are bound to configured *DAY, WEEK, MONTH, QUARTER, YEAR* cadences and you are not calculating the whole universe of data in your SQL query (e.g., you're looking back or forward on a specific time interval). ## When not to use GAB? - When metrics and dimensions are not bound to *DAY, WEEK, MONTH, QUARTER, YEAR* cadences. - When your result is not an aggregated result, i.e., the resulting table is at the transaction grain. - If your start and end dates for the time interval include dates into the future. - !!! warning This is for now a current limitation in the GAB engine codebase (`if new_end_date >= current_date: new_end_date = current_date`) that would require further testing to ensure it can be relaxed. - If your metrics are not calculated incrementally, you should consider the tradeoff of using GAB vs just writing a very simple "full load" SQL code that computes the whole universe of data all the time. - !!! note However, if the computation is not very intensive, the orchestration/automation that comes with GAB out of the box can actually provide you value. Moreover, even if the metrics are not computed incrementally, you can collect all the automation benefits from GAB and use a time filter in your SQL statements in GAB. You can take that into consideration for your use case. ## GAB Concepts and Features ### Cadence In which time grain you want the data to be aggregated: DAILY, WEEKLY, MONTHLY, QUARTERLY, YEARLY. 
The internal dynamics with the CADENCE concept in GAB heavily rely on an automatically generated dimension calendar for GAB's internal usage. ```python {'DAY':{},'WEEK':{},'MONTH':{},'YEAR':{}} ``` ### Dimensions & Metrics #### Dimensions It's just a regular dimension according to the OLAP concept. It will be used to aggregate the metrics, example: `product_category`. Usually it is directly mapped from the source tables without any transformation. #### Metrics Aggregated value at the dimension level. As part of the dimensions, GAB has an automatically generated calendar dimension at different grains (more on that below). There are some options to compute a metric: - **Using SQL to directly** query and aggregate a source table column. Example: `sum(product_amount)` - Compute it in the same cadence, but in **CADENCE - 1 time window**. Example: In a `MONTHLY` cadence it will compute for the previous month. - Compute it in the same cadence, but using **last year's reference value**. Example: In a `QUARTERLY` cadence it will compute it in the same quarter but from the previous year. - Compute it in the same cadence, but with a **custom window function**. Example: In a `QUARTER` cadence computing the last 2 quarters. - Compute it **using any SQL function**, using any of the available columns, deriving a metric from another, etc. Example: compute a metric by multiplying it by 0.56 for the last 6 months of data. !!! note Each computation derives a [new column on the output view](step_by_step/step_by_step.md#use-case-configuration-using-the-query_builder_helper). ### Extended Window Calculator, Reconciliation & Snapshotting #### Extended Window Calculator This feature aims to calculate the extended window of any cadence despite the user providing custom dates which are not the exact start and end dates of a cadence. 
For example, if the user wants to calculate the `MONTH` cadence but gives a date range of `2023-01-10` to `2023-01-29`, which is not exactly the start and/or end of the month, the computation window will be extended/adjusted to `2023-01-01`-`2023-01-31`, i.e., including the complete month. This ensures that GAB automatically handles any user error to efficiently integrate the complete data of the selected cadence. #### Reconciliation The concept of Extended Window Calculator is intertwined with the concept of Reconciliation. These enable the user to compute the data aggregated by the specified cadence, but leveraging 1) *"cadence to date"* calculations; or 2) Reconcile the data taking into account late events. ##### "*Cadence to Date*" Calculations For example, there can be a use case where the cadence is `WEEKLY`, but we want the aggregated data with a `DAILY` frequency, so by configuring the reconciliation window to be `DAILY`, it will compute the data on a `WEEK TO DATE` basis. In a case where the first day of week is Monday, on Monday it will have the data just for Monday; on Tuesday will be the computation of Monday + Tuesday; on Wednesday will have the results for Monday + Tuesday + Wednesday; and so on, until the end of week. That example would be configured as follows: ```python {'WEEK': {'recon_window': {'DAY'}}} ``` ##### Reconcile the Data to Account for Late Events Another example can be if we consider WEEK cadence with reconciliation MONTH and QUARTER enabled (`{'WEEK':{'recon_window':['MONTH','QUARTER']}}`). What this means is, at the start of a new month or a quarter, all the weeks that still belong to that month or that quarter are recalculated to consider the late events. For example, `2023-01-01` is the start of a month, quarter and a year. In this example, since month and quarter are given, and quarter is the higher grain among the two, all the weeks in Q4/22 (using the extended window explained above) are recalculated, i.e. 
instead of `2022-10-01` to `2022-12-31`, extended window to consider in the current GAB execution is `2022-09-26` to `2023-01-01`. This is true because the first day of Q1/23 was on a Sunday of the last week of Q4/22, and once we execute GAB on 01/01/2023, we are reconciling all the weeks of Q4/22, hence weekly cadence with quarterly reconciliation. You can find in the image below other illustrative examples of how the extended window and the reconciliation concept work together. In the first example, GAB will always extend the processing window and reconcile the results for all the weeks (yellow color) involved in that month (green color). In the second example, GAB will always extend the processing window and reconcile the results for all the months (yellow color) involved in the year (note that green color is quarter, not year, but since year is a higher grain than quarter GAB extends the window and reconciles the results for all the months involved in the year, not only the quarter). image ### Snapshot It creates a snapshot of the data on a specified cadence. For example: in a case where we have `MONTHLY` cadence and snapshot enabled on a `DAILY` basis, we are going to compute the aggregates for each day in the month: ```python {'MONTH': {'recon_window': {'DAY': {'snapshot': 'Y'}}}} ``` This is possible with the template column `{{ to_date }}`, which will tell us the end date of the snapshot. In the version without snapshot, there will be one record for the *MONTH* cadence, but when we enable the above configuration the number of entries for the *MONTH* cadence will be the same as the number of days in the month. This means there will be a separate entry for each day of the month, which enables to compare the data to the previous year on the same day from the start of the month. !!! 
note The snapshot feature will always write the snapshot entry for the given period (start date and end date), meaning if you have runs that overlap each other but for a different period (e.g., same start date but different end date) it will not rewrite past snapshot entries. The above configuration is just an example, and the snapshot can be enabled on any combination of cadences: ```python {'QUARTER': {'recon_window': {'WEEK': {'snapshot': 'Y'}}}} {'YEAR': {'recon_window': {'MONTH': {'snapshot': 'Y'}}}} {'MONTH': {'recon_window': {'WEEK': {'snapshot': 'Y'}}}} ``` ## Next Steps If you are interested in using GAB you can check our [step-by-step documentation](step_by_step/step_by_step.md) that aims to help in the use case configuration and make it easier to use GAB. ## FAQ ### Can we ensure past snapshots are not changed? When we use the snapshots feature, taking monthly cadence with daily reconciliation as example, the number of entries for the *MONTH* cadence will be the same as the number of days in the month, because every day, GAB will generate a snapshot of that month, providing a cumulative picture of the month throughout the several days. In this way, snapshots are immutable. There may be cases, where the date that you want to control the snapshots is different than the cadence date in GAB, and in this case you will have to inject custom snapshot gathering logic in your GAB SQL templates and potentially play around with GAB's filter date to achieve what you want, because as of now, GAB relies on the cadence date to control the snapshot logic. ### How exactly `lookback_window` works? Sometimes, `lookback_days` in [GAB execution notebook](../../assets/gab/notebooks/gab.py) and `lookback_window` get confused. `lookback_window` is only used for when you define derived metrics that use window functions (check [step-by-step documentation](step_by_step/step_by_step.md)), and it is used to configure the window. 
On the other hand, `lookback_days` are only part of [GAB execution notebook](../../assets/gab/notebooks/gab.py) to modify the provided `start_date` so that it considers `lookback_days` before that. ### Can I use GAB with cadence dates in the future? As mentioned in the ["When not to use GAB?"](#when-not-to-use-gab) section, this is currently not supported. ### What is the purpose of the `rerun` flag? If you run GAB for same start date and end date as it was run before, without the *rerun* flag, GAB will ignore the execution based on the `gab_events_log` table. The *rerun* flag ensures we can force such re-execution. ### Does my data product needs to be using a star schema (fact table and dimension tables) to use GAB? No, GAB can be used regardless of the underlying data model, as you should prepare your data with templated SQL (that can be as simple or as complex as your use case) before feeding it to the GAB execution engine. ================================================ FILE: lakehouse_engine_usage/gab/step_by_step/__init__.py ================================================ """ .. include::step_by_step.md """ ================================================ FILE: lakehouse_engine_usage/gab/step_by_step/step_by_step.md ================================================ # GAB Step-by-Step !!! note Requirements: Lakehouse engine: 1.20.0+ ## 1. 
Setup Data Product based on Templated Files - Copy GAB assets from the templated files to your data product: - GAB Tables: - [Calendar table - dim_calendar](../../../assets/gab/metadata/tables/dim_calendar.sql) - [Use case configuration table - lkp_query_builder](../../../assets/gab/metadata/tables/lkp_query_builder.sql) - [Unified data table - gab_use_case_results](../../../assets/gab/metadata/tables/gab_use_case_results.sql) - [GAB log events table - gab_log_events](../../../assets/gab/metadata/tables/gab_log_events.sql) - GAB Notebooks: - [Feed Calendar table - gab_dim_calendar](../../../assets/gab/notebooks/gab_dim_calendar.py) - [Use case creation - query_builder_helper](../../../assets/gab/notebooks/query_builder_helper.py) - [GAB execution - gab](../../../assets/gab/notebooks/gab.py) - [GAB job manager - gab_job_manager](../../../assets/gab/notebooks/gab_job_manager.py) ## 2. Set up the Use Case ### 2.1. Create the SQL Template Files Start by writing the SQL code for your use case. 
Here's an example where you will find several available placeholders (more on that below): ```sql SELECT {% if replace_offset_value == 0 %} {{ project_date_column }} {% else %} ({{ project_date_column }} + interval '{{offset_value}}' hour) {% endif %} AS order_date, # date aggregation: computed cadence start date {{ to_date }} AS to_date, # date aggregation: last day of the cadence or of the snapshot if enabled b.category_name, COUNT(a.article_id) qty_articles, SUM(amount) total_amount FROM {{ database }}.dummy_sales_kpi a # source database {{ joins }} # calendar table join: used to compute the cadence start and end date LEFT JOIN article_categories b ON a.article_id = b.article_id WHERE {{ partition_filter }} # filter: partition filter AND TO_DATE({{ filter_date_column }}, 'yyyyMMdd') >= ( '{{ start_date }}' + interval '{{ offset_value }}' hour ) # filter by date column configured in the use case for this file and timezone shift AND TO_DATE({{ filter_date_column }}, 'yyyyMMdd') < ( '{{ end_date }}' + interval '{{ offset_value }}' hour ) # filter by date column configured in the use case for this file and timezone shift GROUP BY 1,2,3 ``` #### Available SQL Template Placeholders You can use placeholders in your SQL queries to have them replaced at runtime by the GAB engine. There are several available placeholders that will be listed in this section. !!! warning The placeholder value will always be [injected as per the configurations of the use cases](#use-case-configuration-using-the-query_builder_helper) in the [lkp_query_builder table](../../../assets/gab/metadata/tables/lkp_query_builder.sql). ##### Reference Dates - *Start and End Dates*: - `{{ start_date }}` and `{{ end_date }}` are the dates that control the time window of the current GAB execution. These can be used to execute GAB on a certain schedule and have it incrementally compute the aggregated metrics. 
These dates are fundamental to control GAB executions and will be provided as arguments in the GAB notebook. - !!! warning Currently only past and present dates are supported. Future dates are not supported. - *Project Date*: - `{{ project_date_column }}` is the reference date used to compute the cadences and the extended window (together with `{{ start_date }}` and `{{ end_date }}`). ```python {% if replace_offset_value == 0 %} {{ project_date_column }} {% else %} ({{ project_date_column }} + interval '{{offset_value}}' hour) {% endif %} ``` - !!! note The `replace_offset_value` is a flag that has the responsibility to instruct GAB to either directly use the `{{ project_date_column }}` or shift it to the specified timezone according to the provided `offset_value` from the configured use case. - *To Date*: - `{{ to_date }}` is the last date of the cadence, if snapshots are disabled, or, if snapshots are enabled, then this date is the snapshot end date. ##### Filter Placeholders - `{{ partition_filter }}` the expression to filter the data according to a date partitioning scheme (year/month/day) and it replaces the placeholder with a filter like `year = **** and month = ** and day = **`: - !!! warning if your table does not have the Year, Month, Day columns you should not add this template - `{{ filter_date_column }}` and `{{offset_value}}` can be used to filter the data to be processed on your use case to be between the specified time range: ```python {{ filter_date_column }} >= ('{{ start_date }}' + interval '{{offset_value}}' hour) AND {{ filter_date_column }} < ('{{ end_date}}' + interval '{{offset_value}}' hour) ``` ##### Source Database From where the data comes from: `{{ database }}`. ##### Dim Calendar join Represented by the `{{ joins }}` placeholder. !!! warning It is mandatory! Can be added after any of the table names in the `from` statement. 
The framework renders these `joins` with an internal calendar join and populates the `to_date` and the `project_date_column` as per the configured cadences. #### Combining Multiple SQL Template Files for a Use Case For each use case, you can have just one SQL file or have multiple SQL files that depend on each other and need to be executed in a specific order. ##### If there's just one SQL file for the use case The file should start with 1_. Example: 1_xxxx.sql. ##### When the use case has several SQL files The different files will represent different intermediate stages/temp tables in GAB execution of the use case. Create the SQL files according to the sequence order (as shown in the image below) and a final combined script, example: image !!! note We suggest using the folder **metadata/gab** as the SQL use case folder, but this is a parameterized property that you can override with the property [gab_base_path in the GAB notebook](../../../assets/gab/notebooks/gab.py). This property is used in the [GAB Job Manager](../../../assets/gab/notebooks/gab_job_manager.py) as well. ### 2.2. Configure the Use Case using the Query Builder Helper Notebook GAB will pull information from **`lkp_query_builder`** in order to retrieve information/configuration to execute the process. To help you with this task, you can use the [query_builder_helper notebook](../../../assets/gab/notebooks/query_builder_helper.py). In this section, we will go step-by-step through the notebook instructions to configure a use case. #### 2.2.1.
General Configuration image | Variable | Default Value | Description | |-----------------------------|----------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **Complexity** | Low | Defines the complexity of your use case.
You should mainly consider the volume of the data or the complexity of the SQL potentially generating a high load.
Possible values: **Low**, **Medium** and **High**. These values are used by GAB's orchestration, i.e., [GAB job manager - gab_job_manager](../../../assets/gab/notebooks/gab_job_manager.py), which uses them to define the job cluster size/type based on the complexity of the query. | | **Database Name** | example_database | Refers to the name of the development environment database where the **lkp_query_builder** table resides.
This parameter is used at the end of the notebook to insert data into the **lkp_query_builder** table. | | **How many dimensions** | 1 | Number of dimension columns expected in the use case.
**Note: Do not consider the `project_date_column` or metrics**, as they have their own parameters. | | **How many views** | 1 | Defines how many output views to generate for the use case. It's possible to have as many as the use case needs.
All views will have the same structure (dimensions and metrics), the only difference possible to specify between the views is the `view filter`.
**Default value is 1.**
**Note**: This configuration has a direct impact in the `3. Configure View Name and Filters` configuration. | | **Is Active** | Y | Flag to make the use case active or not.
**Default value is Y**. | | **Market** | GLOBAL | Used in the **gab_job_manager** to execute the use cases for each **market**. If your business does not have the concept of Market, you can leave the `GLOBAL` default. | | **SQL File Names** | 1_article_category.sql,
2_f_agg_dummy_sales_kpi.sql | Name of the SQL files used in the use case, according to what you have configured in ***step 2.1***.
You can combine different layers of dependencies between them as shown in the example above, where the **2_combined.sql** file depends on **1_product_category.sql** file.
The file name should follow the pattern x_file_name (where x is an integer digit) and should be separated by a comma (e.g.: 1_first_query.sql, 2_second_query.sql). | | **Snapshot End Date** | to_date | This parameter is used in the template, by default its value must be ***to_date***.
You can change it if you have managed this in your SQL files.
The values stored in this column depend on the use case behavior:
  • if snapshots are enabled, it will contain the snapshot end date.
  • If no snapshot is enabled, it will contain the last date of the cadence.
The snapshot behavior is set in the reconciliation steps (more on that later). | | **Timezone Offset** | 0 | The timezone offset that you want to apply to the date columns (`project_date_column` or `filter_date_column`).
It should be a number to decrement or add to the date (e.g., -8 or 8).
**The default value is 0**, which means that, by default, no timezone transformation will be applied to the date. | | **Use Case Name** | f_agg_dummy_sales_kpi | Name of the use case.
The suggestion is to use lowercase alphanumeric characters separated by underscores. | | **Use Case Reference Date** | order_date | Reference date of the use case, i.e., `project_date_column`.
The parameter should be the column name and the selected column should have the date/datetime format. | | **Week Start** | MONDAY | The start of the business week of the use case.
Possible values: **SUNDAY** or **MONDAY**. #### 2.2.2. Configure Dimension Names image #### 2.2.3. Configure View Name and Filters This will be the name of the output view at the end of the process. Filters can be applied at this step, if needed. image | Variable | Default Value | Description | |-----------------|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **View Filter** | | A SQL *WHERE* clause expression based on the dimensions defined in the previous step.
**Example**: if you have set the country as `D1`, the filter here could be **D1 = "Germany"**. The syntax allowed here is the same as the syntax of the *WHERE* clause in SQL. | | **View Name** | vw_f_agg_dummy_sales_kpi | Name of the view to query the resulting aggregated data. This will contain the results produced by GAB for the configured use case. #### 2.2.4. Configure the Cadence, Reconciliation and Snapshot This step is where we define which will be the cadence displayed at the view. image | Variable | Default Value | Description | |----------------------------|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------| | **Reconciliation Cadence** | YEAR | Compute the data aggregated by the specified cadence, optionally defined with reconciliation and snapshotting.
[Check more about it here](../gab.md#reconciliation). | #### 2.2.5. Configure METRICS First question to ask regarding metrics is how many metrics do you have on our SQL use case query. On our template we have two metrics (`qty_articles` and `total_amount`). image image Next, we will define if we want GAB to create secondary calculations for us based on the metric name. !!! warning Metrics should follow the same order as defined on the SQL use case query. image | Variable | Description | |---------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | [**Calculated Metric**](../gab.md#Metrics) | It's possible to derive (add secondary calculations) 4 new columns based on each metric.
Those new columns will be based on cadences like ***last_cadence***, ***last_year_cadence*** and ***window function***.
Moreover, you can create a derived column, which is a custom SQL statement that you can write by selecting the ***derived_metric*** option. | | **Metric Name** | Name of the base metric. Should have the same name as on the SQL use case query in the SQL template files defined previously. | After that, it's where you configure secondary calculations. image | Variable | Description | |-------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------| | **derived_metric.Formula** | Formula to calculate the metric referring any of previous configured metrics by the **Metric Name**.
**Example**: `total_amount*0.56` | | **derived_metric.Label** | Name of the generated metric by ***derived_metric***. | | **last_cadence.Label** | Name of the generated metric by ***last_cadence***. | | **last_cadence.Window** | Cadence lookback window, which means in this example, a lookback from the previous year (as the use case is on **YEARLY** cadence) | | **window_function.Agg Func** | SQL Function to calculate the metric.
Possible values: ***sum***, ***avg***, ***max***, ***min***, ***count*** | | **window_function.Label** | Name of the generated metric by ***window_function***. | | **window_function.Window Interval** | Window interval to use on the metric generation. #### 2.2.6. Configure Stages Stages are related to each SQL file in the use case. image | Variable | Description | |--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **Filter Date Column** | It will be used to filter the data of your use case.
This information will be replaced in the placeholder of the GAB template `{{ filter_date_column }}`. | | **Project Date Column** | It will be used as reference date for the given query.
This information will be replaced in the placeholder of the GAB template `{{ project_date_column }}`. | | **Repartition Type** | Type of repartitioning of the data of the query.
Possible values: ***Key*** and ***Number***.
When you use Key, it expects column names separated by a comma.
When you use Number, it expects an integer of how many partitions the user wants. | | **Repartition Value** | This parameter only has effect when used with **Repartition Type parameter**.
It sets the value for the repartitioning type set by the parameter above selected. | | **Storage Level** | Defines the Spark persistence storage level you want (e.g. ***Memory Only***, ***Memory and Disk*** etc). | | **Table Alias** | The alias of the SQL file that will be executed. This name can be used to consume the output of a SQL stage (corresponding to a SQL file) in the next stage (the next SQL file). #### 2.2.7. Build and Execute the SQL Commands to populate the lkp_query_builder Table image image After configuring the use case, it would generate a SQL command to create it on the `lkp_query_builder`: ```sql DELETE FROM example_database.lkp_query_builder WHERE QUERY_LABEL = 'f_agg_dummy_sales_kpi'; INSERT INTO example_database.lkp_query_builder VALUES ( 1, 'f_agg_dummy_sales_kpi', 'GLOBAL', """{ 'vw_f_agg_dummy_sales_kpi': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'category_name' }, 'metric': { 'm1': { 'metric_name': 'qty_articles', 'calculated_metric': {}, 'derived_metric': {} }, 'm2': { 'metric_name': 'total_amount', 'calculated_metric': { 'last_cadence': [ { 'label': 'total_amount_last_year', 'window': '1' } ], 'window_function': [ { 'label': 'avg_total_amount_last_2_years', 'window': [2, 1], 'agg_func': 'avg' } ] }, 'derived_metric': [ { 'label': 'discounted_total_amount', 'formula': 'total_amount*0.56' } ] } }, 'filter': {} } }""", """{ '1': { 'file_path': 'f_agg_dummy_sales_kpi/1_article_category.sql', 'table_alias': 'article_categories', 'storage_level': 'MEMORY_ONLY', 'project_date_column': '', 'filter_date_column': '', 'repartition': {} }, '2': { 'file_path': 'f_agg_dummy_sales_kpi/2_f_agg_dummy_sales_kpi.sql', 'table_alias': 'dummy_sales_kpi', 'storage_level': 'MEMORY_ONLY', 'project_date_column': 'order_date', 'filter_date_column': 'order_date', 'repartition': {} } }""", """{'YEAR': {}}""", '0', 'MONDAY', 'Y', 'Low', current_timestamp() ) ``` ## 3. 
Use case execution After the initial setup and adding your use case to the ***lkp_query_builder*** you can schedule the [gab_job_manager](../../../assets/gab/notebooks/gab_job_manager.py) to manage the use case execution in any schedule you want. You can repeat these steps for each use case you have. ## 4. Consuming the data The data is available in the view you specified as output from the use case in ***step 2***, so you can normally consume the view as you would consume any other data asset (e.g., Report, Dashboard, ML model, Data Pipeline). ================================================ FILE: lakehouse_engine_usage/lakehouse_engine_usage.md ================================================ # How to use the Lakehouse Engine? Lakehouse engine usage examples for all the algorithms and other core functionalities. - [Data Loader](data_loader/data_loader.md) - [Data Quality](data_quality/data_quality.md) - [Reconciliator](reconciliator/reconciliator.md) - [Sensors](sensors/sensors.md) - [GAB](gab/gab.md) ================================================ FILE: lakehouse_engine_usage/managerhelper/managerhelper.md ================================================ # Table and File Manager Operations Generator Generate JSON configurations for TableManager and FileManager operations with an interactive form.

Table Manager Operations

Select an operation above to see its configuration options

File Manager Operations

Select an operation above to see its configuration options

Operations Queue

No operations added yet. Configure and add operations to build your JSON.

Generated JSON Configuration


        
================================================ FILE: lakehouse_engine_usage/managerhelper/operations-script.js ================================================ // ============================================================================ // LAKEHOUSE ENGINE OPERATIONS GENERATOR - MAIN JAVASCRIPT // ============================================================================ // This script manages the interactive UI for generating JSON configurations // for Lakehouse Engine table and file manager operations. // ============================================================================ // ============================================================================ // DOM ELEMENT REFERENCES // ============================================================================ // Cache frequently accessed DOM elements for better performance /** Tab navigation buttons for switching between table and file managers */ const tabButtons = document.querySelectorAll('.tab-button'); /** Tab content containers for table and file manager sections */ const tabContents = document.querySelectorAll('.tab-content'); /** Dropdown select for choosing table manager operations */ const tableOperationSelect = document.getElementById('table-operation-select'); /** Dropdown select for choosing file manager operations */ const fileOperationSelect = document.getElementById('file-operation-select'); /** Container for dynamically generated table operation parameter fields */ const tableDynamicFields = document.getElementById('table-dynamic-fields'); /** Container for dynamically generated file operation parameter fields */ const fileDynamicFields = document.getElementById('file-dynamic-fields'); /** Button to add the currently configured operation to the list */ const addOperationBtn = document.getElementById('add-operation'); /** Button to clear all operations from the list */ const clearOperationsBtn = document.getElementById('clear-operations'); /** Container displaying the list of added 
operations */ const operationsList = document.getElementById('operations-list'); /** Button to generate JSON configuration from operations list */ const generateBtn = document.getElementById('generate-json'); /** Button to copy generated JSON to clipboard */ const copyBtn = document.getElementById('copy-json'); /** Button to download generated JSON as a file */ const downloadBtn = document.getElementById('download-json'); /** Button to format the displayed JSON */ const formatBtn = document.getElementById('format-json'); /** Button to validate the generated JSON configuration */ const validateBtn = document.getElementById('validate-json'); /** Pre-formatted text area displaying the generated JSON output */ const jsonOutput = document.getElementById('json-output'); /** Element displaying validation results and messages */ const validationResult = document.getElementById('validation-result'); /** Loading spinner overlay element */ const loading = document.getElementById('loading'); /** Toast notification element for user feedback */ const toast = document.getElementById('toast'); // ============================================================================ // APPLICATION STATE // ============================================================================ // Global state variables that track the application's current status /** Current active tab ('table-manager' or 'file-manager') */ let currentTab = 'table-manager'; /** Array of operation objects added by the user */ let operations = []; /** Generated JSON configuration object */ let generatedConfig = null; // ============================================================================ // OPERATION DEFINITIONS - TABLE MANAGER // ============================================================================ // Defines all available table manager operations with their parameters, // validation rules, and UI presentation details /** * Table Manager Operations Configuration * Each operation includes: * - name: Display 
name for the UI * - icon: FontAwesome icon class * - fields: Array of field definitions with type, validation, and help text */ const TABLE_OPERATIONS = { 'compute_table_statistics': { name: 'Compute Table Statistics', icon: 'fas fa-chart-bar', fields: [ { name: 'table_or_view', label: 'Table or View Name', type: 'text', required: true, help: 'Name of the table or view to compute statistics for' } ] }, 'create_table': { name: 'Create Table', icon: 'fas fa-plus-square', fields: [ { name: 'path', label: 'SQL File Path', type: 'text', required: true, help: 'Path to the SQL file containing the CREATE TABLE statement' }, { name: 'disable_dbfs_retry', label: 'Disable DBFS Retry', type: 'select', options: ['True', 'False'], default: 'False', help: 'Whether to disable DBFS retry mechanism' }, { name: 'delimiter', label: 'SQL Delimiter', type: 'text', default: ';', help: 'Delimiter to separate SQL commands' }, { name: 'advanced_parser', label: 'Advanced Parser', type: 'select', options: ['True', 'False'], default: 'False', help: 'Use advanced SQL parser' } ] }, 'create_tables': { name: 'Create Multiple Tables', icon: 'fas fa-layer-group', fields: [ { name: 'path', label: 'SQL File Paths', type: 'textarea', required: true, help: 'Comma-separated paths to SQL files containing CREATE TABLE statements' }, { name: 'disable_dbfs_retry', label: 'Disable DBFS Retry', type: 'select', options: ['True', 'False'], default: 'False', help: 'Whether to disable DBFS retry mechanism' }, { name: 'delimiter', label: 'SQL Delimiter', type: 'text', default: ';', help: 'Delimiter to separate SQL commands' }, { name: 'advanced_parser', label: 'Advanced Parser', type: 'select', options: ['True', 'False'], default: 'False', help: 'Use advanced SQL parser' } ] }, 'create_view': { name: 'Create View', icon: 'fas fa-eye', fields: [ { name: 'path', label: 'SQL File Path', type: 'text', required: true, help: 'Path to the SQL file containing the CREATE VIEW statement' }, { name: 'disable_dbfs_retry', 
label: 'Disable DBFS Retry', type: 'select', options: ['True', 'False'], default: 'False', help: 'Whether to disable DBFS retry mechanism' }, { name: 'delimiter', label: 'SQL Delimiter', type: 'text', default: ';', help: 'Delimiter to separate SQL commands' }, { name: 'advanced_parser', label: 'Advanced Parser', type: 'select', options: ['True', 'False'], default: 'False', help: 'Use advanced SQL parser' } ] }, 'drop_table': { name: 'Drop Table', icon: 'fas fa-trash-alt', fields: [ { name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to drop' } ] }, 'drop_view': { name: 'Drop View', icon: 'fas fa-eye-slash', fields: [ { name: 'table_or_view', label: 'View Name', type: 'text', required: true, help: 'Name of the view to drop' } ] }, 'execute_sql': { name: 'Execute SQL', icon: 'fas fa-code', fields: [ { name: 'sql', label: 'SQL Commands', type: 'textarea', required: true, help: 'SQL commands to execute (separated by delimiter)' }, { name: 'delimiter', label: 'SQL Delimiter', type: 'text', default: ';', help: 'Delimiter to separate SQL commands' }, { name: 'advanced_parser', label: 'Advanced Parser', type: 'select', options: ['True', 'False'], default: 'False', help: 'Use advanced SQL parser' } ] }, 'truncate': { name: 'Truncate Table', icon: 'fas fa-cut', fields: [ { name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to truncate' } ] }, 'vacuum': { name: 'Vacuum Table', icon: 'fas fa-broom', fields: [ { name: 'table_or_view', label: 'Table Name', type: 'text', help: 'Name of the table to vacuum (leave empty to use path)' }, { name: 'path', label: 'Table Path', type: 'text', help: 'Path to the Delta table location (use if table_or_view is empty)' }, { name: 'vacuum_hours', label: 'Retention Hours', type: 'number', default: '168', help: 'Number of hours to retain old versions (default: 168 hours = 7 days)' } ] }, 'describe': { name: 'Describe Table', icon: 'fas 
fa-info-circle', fields: [ { name: 'table_or_view', label: 'Table or View Name', type: 'text', required: true, help: 'Name of the table or view to describe' } ] }, 'optimize': { name: 'Optimize Table', icon: 'fas fa-tachometer-alt', fields: [ { name: 'table_or_view', label: 'Table Name', type: 'text', help: 'Name of the table to optimize (leave empty to use path)' }, { name: 'path', label: 'Table Path', type: 'text', help: 'Path to the Delta table location (use if table_or_view is empty)' }, { name: 'where_clause', label: 'Where Clause', type: 'text', help: 'Optional WHERE clause to limit optimization scope' }, { name: 'optimize_zorder_col_list', label: 'Z-Order Columns', type: 'text', help: 'Comma-separated list of columns for Z-ORDER optimization' } ] }, 'show_tbl_properties': { name: 'Show Table Properties', icon: 'fas fa-cogs', fields: [ { name: 'table_or_view', label: 'Table or View Name', type: 'text', required: true, help: 'Name of the table or view to show properties for' } ] }, 'get_tbl_pk': { name: 'Get Table Primary Key', icon: 'fas fa-key', fields: [ { name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to get primary key from' } ] }, 'repair_table': { name: 'Repair Table', icon: 'fas fa-wrench', fields: [ { name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to repair' }, { name: 'sync_metadata', label: 'Sync Metadata', type: 'select', options: ['True', 'False'], default: 'False', help: 'Whether to sync metadata during repair' } ] }, 'delete_where': { name: 'Delete Where', icon: 'fas fa-eraser', fields: [ { name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to delete from' }, { name: 'where_clause', label: 'Where Clause', type: 'text', required: true, help: 'WHERE condition for deletion (without WHERE keyword)' } ] } }; // ============================================================================ // OPERATION 
DEFINITIONS - FILE MANAGER // ============================================================================ // Defines all available file manager operations for S3 and DBFS file systems /** * File Manager Operations Configuration * Supports operations for: * - S3: delete, copy, move, restore from Glacier * - DBFS: delete, copy, move */ const FILE_OPERATIONS = { 'delete_objects': { name: 'Delete Objects', icon: 'fas fa-trash', fields: [ { name: 'bucket', label: 'Bucket Name', type: 'text', help: 'S3 bucket name (leave empty for DBFS paths)' }, { name: 'object_paths', label: 'Object Paths', type: 'textarea', required: true, help: 'Comma-separated list of object paths to delete' }, { name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be deleted without actually deleting' } ] }, 'copy_objects': { name: 'Copy Objects', icon: 'fas fa-copy', fields: [ { name: 'bucket', label: 'Source Bucket', type: 'text', help: 'Source S3 bucket name (leave empty for DBFS paths)' }, { name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the source object or directory' }, { name: 'destination_bucket', label: 'Destination Bucket', type: 'text', help: 'Destination S3 bucket name (leave empty for DBFS paths)' }, { name: 'destination_object', label: 'Destination Object Path', type: 'text', required: true, help: 'Path of the destination object or directory' }, { name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be copied without actually copying' } ] }, 'move_objects': { name: 'Move Objects', icon: 'fas fa-arrows-alt', fields: [ { name: 'bucket', label: 'Source Bucket', type: 'text', help: 'Source S3 bucket name (leave empty for DBFS paths)' }, { name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the source object or directory' }, { name: 'destination_bucket', 
label: 'Destination Bucket', type: 'text', help: 'Destination S3 bucket name (leave empty for DBFS paths)' }, { name: 'destination_object', label: 'Destination Object Path', type: 'text', required: true, help: 'Path of the destination object or directory' }, { name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be moved without actually moving' } ] }, 'request_restore': { name: 'Request Restore (S3)', icon: 'fas fa-undo', fields: [ { name: 'bucket', label: 'S3 Bucket', type: 'text', required: true, help: 'S3 bucket containing archived objects' }, { name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the archived object to restore' }, { name: 'restore_expiration', label: 'Restore Expiration (days)', type: 'number', required: true, default: '7', help: 'Number of days to keep restored objects available' }, { name: 'retrieval_tier', label: 'Retrieval Tier', type: 'select', options: ['Expedited', 'Standard', 'Bulk'], default: 'Standard', help: 'Speed and cost tier for restoration' }, { name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be restored without actually restoring' } ] }, 'check_restore_status': { name: 'Check Restore Status (S3)', icon: 'fas fa-search', fields: [ { name: 'bucket', label: 'S3 Bucket', type: 'text', required: true, help: 'S3 bucket containing archived objects' }, { name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the object to check restore status' } ] }, 'request_restore_to_destination_and_wait': { name: 'Request Restore and Copy (S3)', icon: 'fas fa-sync-alt', fields: [ { name: 'bucket', label: 'Source S3 Bucket', type: 'text', required: true, help: 'S3 bucket containing archived objects' }, { name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the archived object to 
restore' }, { name: 'destination_bucket', label: 'Destination S3 Bucket', type: 'text', required: true, help: 'Destination S3 bucket for restored objects' }, { name: 'destination_object', label: 'Destination Object Path', type: 'text', required: true, help: 'Path of the destination for restored objects' }, { name: 'restore_expiration', label: 'Restore Expiration (days)', type: 'number', required: true, default: '7', help: 'Number of days to keep restored objects available' }, { name: 'retrieval_tier', label: 'Retrieval Tier', type: 'select', options: ['Expedited'], default: 'Expedited', help: 'Only Expedited tier supported for this operation' }, { name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be restored without actually restoring' } ] } }; // ============================================================================ // INITIALIZATION // ============================================================================ // Set up the application when the DOM is fully loaded /** * Initialize the application on page load * Sets up tabs, event listeners, and loads any saved state */ document.addEventListener('DOMContentLoaded', function() { initializeTabs(); initializeEventListeners(); loadFromLocalStorage(); }); // ============================================================================ // TAB MANAGEMENT // ============================================================================ /** * Initialize tab navigation functionality * Sets up click handlers for switching between table and file manager tabs */ function initializeTabs() { tabButtons.forEach(button => { button.addEventListener('click', () => { const tabId = button.getAttribute('data-tab'); switchTab(tabId); }); }); } /** * Switch to a different tab * @param {string} tabId - The ID of the tab to activate ('table-manager' or 'file-manager') */ function switchTab(tabId) { // Update button active states tabButtons.forEach(btn => 
btn.classList.remove('active')); document.querySelector(`[data-tab="${tabId}"]`).classList.add('active'); // Update content visibility tabContents.forEach(content => content.classList.remove('active')); document.getElementById(tabId).classList.add('active'); // Update application state currentTab = tabId; updateAddButtonState(); } // ============================================================================ // EVENT LISTENERS SETUP // ============================================================================ /** * Initialize all event listeners for interactive elements * Connects UI actions to their handler functions */ function initializeEventListeners() { // Operation selection change handlers tableOperationSelect.addEventListener('change', handleTableOperationChange); fileOperationSelect.addEventListener('change', handleFileOperationChange); // Button click handlers addOperationBtn.addEventListener('click', addCurrentOperation); clearOperationsBtn.addEventListener('click', clearAllOperations); generateBtn.addEventListener('click', generateJSON); copyBtn.addEventListener('click', copyToClipboard); downloadBtn.addEventListener('click', downloadJSON); formatBtn.addEventListener('click', formatJSON); validateBtn.addEventListener('click', validateJSON); } // ============================================================================ // DYNAMIC FIELD GENERATION // ============================================================================ /** * Handle table operation selection change * Renders the appropriate parameter fields for the selected table operation */ function handleTableOperationChange() { const operation = tableOperationSelect.value; if (operation && TABLE_OPERATIONS[operation]) { renderDynamicFields(tableDynamicFields, TABLE_OPERATIONS[operation], 'table'); updateAddButtonState(); } else { showNoOperationSelected(tableDynamicFields); updateAddButtonState(); } } /** * Handle file operation selection change * Renders the appropriate parameter fields 
for the selected file operation */ function handleFileOperationChange() { const operation = fileOperationSelect.value; if (operation && FILE_OPERATIONS[operation]) { renderDynamicFields(fileDynamicFields, FILE_OPERATIONS[operation], 'file'); updateAddButtonState(); } else { showNoOperationSelected(fileDynamicFields); updateAddButtonState(); } } /** * Display a message when no operation is selected * @param {HTMLElement} container - The container to display the message in */ function showNoOperationSelected(container) { container.innerHTML = `

Select an operation above to see its configuration options

`; } /** * Render dynamic parameter fields for the selected operation * @param {HTMLElement} container - The container to render fields into * @param {Object} operationDef - The operation definition with field specifications * @param {string} type - The operation type ('table' or 'file') */ function renderDynamicFields(container, operationDef, type) { const html = `

${operationDef.name} Configuration

${operationDef.fields.map(field => renderField(field, type)).join('')}
`; container.innerHTML = html; // Attach validation event listeners to all input fields container.querySelectorAll('input, select, textarea').forEach(input => { input.addEventListener('blur', () => validateField(input)); input.addEventListener('input', () => clearFieldValidation(input)); }); } /** * Render a single input field based on its definition * @param {Object} field - Field definition with name, type, label, etc. * @param {string} type - The operation type for generating unique field IDs * @returns {string} HTML string for the field */ function renderField(field, type) { const fieldId = `${type}-${field.name}`; const required = field.required ? 'required' : ''; const requiredMarker = field.required ? '*' : ''; let inputHtml = ''; // Generate appropriate input HTML based on field type switch (field.type) { case 'text': case 'number': inputHtml = ``; break; case 'textarea': inputHtml = ``; break; case 'select': const options = field.options.map(option => `` ).join(''); inputHtml = ``; break; } return `
${inputHtml}
${field.help}
`; } // ============================================================================ // FIELD VALIDATION // ============================================================================ /** * Validate a single input field * @param {HTMLInputElement} input - The input element to validate * @returns {boolean} True if field is valid, false otherwise */ function validateField(input) { const validationDiv = document.getElementById(`${input.id}-validation`); const isRequired = input.hasAttribute('required'); const value = input.value.trim(); // Clear previous validation state input.classList.remove('valid', 'invalid'); validationDiv.textContent = ''; validationDiv.className = 'validation-message'; // Check if required field is empty if (isRequired && !value) { input.classList.add('invalid'); validationDiv.textContent = 'This field is required'; validationDiv.classList.add('error'); return false; } // Type-specific validation for number fields if (value && input.type === 'number') { const numValue = parseFloat(value); if (isNaN(numValue) || numValue < 0) { input.classList.add('invalid'); validationDiv.textContent = 'Please enter a valid positive number'; validationDiv.classList.add('error'); return false; } } // Mark field as valid if it has a value if (value) { input.classList.add('valid'); validationDiv.textContent = '✓ Valid'; validationDiv.classList.add('success'); } return true; } /** * Clear validation state from an input field * @param {HTMLInputElement} input - The input element to clear validation from */ function clearFieldValidation(input) { input.classList.remove('valid', 'invalid'); const validationDiv = document.getElementById(`${input.id}-validation`); if (validationDiv) { validationDiv.textContent = ''; validationDiv.className = 'validation-message'; } } // ============================================================================ // OPERATION MANAGEMENT // ============================================================================ /** * Update the 
enabled/disabled state of the Add Operation button * Button is only enabled when an operation is selected */ function updateAddButtonState() { const currentSelect = currentTab === 'table-manager' ? tableOperationSelect : fileOperationSelect; const hasSelection = currentSelect.value !== ''; addOperationBtn.disabled = !hasSelection; } /** * Add the currently configured operation to the operations list * Validates all fields before adding */ function addCurrentOperation() { const currentSelect = currentTab === 'table-manager' ? tableOperationSelect : fileOperationSelect; const operationKey = currentSelect.value; if (!operationKey) return; const operationDef = currentTab === 'table-manager' ? TABLE_OPERATIONS[operationKey] : FILE_OPERATIONS[operationKey]; // Collect and validate field values const config = { function: operationKey }; const container = currentTab === 'table-manager' ? tableDynamicFields : fileDynamicFields; let isValid = true; container.querySelectorAll('input, select, textarea').forEach(input => { if (!validateField(input)) { isValid = false; } const value = input.value.trim(); if (value) { // Handle different field types and convert values appropriately if (input.name === 'object_paths' && value.includes(',')) { config[input.name] = value.split(',').map(s => s.trim()); } else if (input.type === 'number') { config[input.name] = parseInt(value, 10); } else if (value === 'True') { config[input.name] = true; } else if (value === 'False') { config[input.name] = false; } else { config[input.name] = value; } } }); // Abort if validation failed if (!isValid) { showToast('Please fix validation errors before adding the operation', 'error'); return; } // Create and add operation object const operation = { id: Date.now(), type: currentTab === 'table-manager' ? 'table' : 'file', manager: currentTab === 'table-manager' ? 
'table' : 'file', functionName: operationKey, displayName: operationDef.name, icon: operationDef.icon, config: config }; operations.push(operation); renderOperationsList(); updateGenerateButtonState(); saveToLocalStorage(); showToast(`${operationDef.name} operation added successfully!`, 'success'); } /** * Remove an operation from the operations list * @param {number} id - The unique ID of the operation to remove */ function removeOperation(id) { operations = operations.filter(op => op.id !== id); renderOperationsList(); updateGenerateButtonState(); saveToLocalStorage(); showToast('Operation removed', 'success'); } /** * Clear all operations from the list after confirmation */ function clearAllOperations() { if (operations.length === 0) return; if (confirm('Are you sure you want to remove all operations?')) { operations = []; renderOperationsList(); updateGenerateButtonState(); saveToLocalStorage(); showToast('All operations cleared', 'success'); } } /** * Render the list of added operations in the UI * Shows empty state if no operations exist */ function renderOperationsList() { if (operations.length === 0) { operationsList.innerHTML = `

No operations added yet. Configure and add operations to build your JSON.

`; return; } const html = operations.map(operation => `
${operation.type} ${operation.displayName}
Function: ${operation.functionName} | Parameters: ${Object.keys(operation.config).filter(k => k !== 'function').length}
`).join(''); operationsList.innerHTML = html; } /** * Update the enabled/disabled state of the Generate JSON button * Button is only enabled when at least one operation exists */ function updateGenerateButtonState() { generateBtn.disabled = operations.length === 0; } // ============================================================================ // JSON GENERATION AND OUTPUT // ============================================================================ /** * Generate JSON configuration from the operations list * Creates the final configuration object in Lakehouse Engine format */ function generateJSON() { if (operations.length === 0) { showToast('No operations to generate. Please add at least one operation.', 'error'); return; } showLoading(); // Use setTimeout to show loading animation setTimeout(() => { try { const config = { operations: operations.map(op => ({ manager: op.manager, ...op.config })) }; generatedConfig = config; displayJSON(config); enableActionButtons(); showToast('JSON configuration generated successfully!', 'success'); } catch (error) { console.error('Generation error:', error); showToast('Error generating JSON: ' + error.message, 'error'); } finally { hideLoading(); } }, 500); } /** * Display formatted JSON in the output area * @param {Object} config - The configuration object to display */ function displayJSON(config) { const formattedJSON = JSON.stringify(config, null, 2); jsonOutput.textContent = formattedJSON; highlightJSON(); } /** * Apply syntax highlighting to the displayed JSON * Colors different JSON elements (keys, strings, numbers, booleans) */ function highlightJSON() { const content = jsonOutput.textContent; const highlighted = content .replace(/"([^"]+)":/g, '"$1":') .replace(/: "([^"]+)"/g, ': "$1"') .replace(/: (\d+)/g, ': $1') .replace(/: (true|false)/g, ': $1') .replace(/: null/g, ': null'); jsonOutput.innerHTML = highlighted; } /** * Format the generated JSON with proper indentation * Re-formats and re-highlights the JSON 
output */ function formatJSON() { if (!generatedConfig) { showToast('No JSON to format. Generate configuration first.', 'error'); return; } try { const formatted = JSON.stringify(generatedConfig, null, 2); jsonOutput.textContent = formatted; highlightJSON(); showToast('JSON formatted successfully!', 'success'); } catch (error) { showToast('Error formatting JSON: ' + error.message, 'error'); } } /** * Validate the generated JSON configuration * Checks for required fields and proper structure */ function validateJSON() { if (!generatedConfig) { showValidationResult(false, 'No JSON to validate. Generate configuration first.'); return; } try { const config = generatedConfig; const errors = []; // Check for operations array if (!config.operations || !Array.isArray(config.operations)) { errors.push('Missing or invalid operations array'); } else { // Validate each operation config.operations.forEach((op, index) => { if (!op.manager) { errors.push(`Operation ${index + 1}: Missing manager field`); } if (!op.function) { errors.push(`Operation ${index + 1}: Missing function field`); } }); } // Display validation results if (errors.length === 0) { showValidationResult(true, `JSON configuration is valid! Contains ${config.operations.length} operation(s).`); } else { showValidationResult(false, 'Validation errors: ' + errors.join(', ')); } } catch (error) { showValidationResult(false, 'Validation error: ' + error.message); } } /** * Display validation results to the user * @param {boolean} isValid - Whether the validation passed * @param {string} message - The validation message to display */ function showValidationResult(isValid, message) { validationResult.className = `validation-result ${isValid ? 'valid' : 'invalid'}`; validationResult.textContent = isValid ? 
'✅ ' + message : '❌ ' + message; } /** * Copy the generated JSON to the clipboard * Uses modern Clipboard API with fallback for older browsers */ async function copyToClipboard() { if (!generatedConfig) { showToast('No JSON to copy. Generate configuration first.', 'error'); return; } try { const jsonString = JSON.stringify(generatedConfig, null, 2); await navigator.clipboard.writeText(jsonString); showToast('JSON copied to clipboard!', 'success'); } catch (error) { // Fallback for older browsers const textArea = document.createElement('textarea'); textArea.value = JSON.stringify(generatedConfig, null, 2); document.body.appendChild(textArea); textArea.select(); document.execCommand('copy'); document.body.removeChild(textArea); showToast('JSON copied to clipboard!', 'success'); } } /** * Download the generated JSON as a file * Creates a timestamped filename and triggers browser download */ function downloadJSON() { if (!generatedConfig) { showToast('No JSON to download. Generate configuration first.', 'error'); return; } const jsonString = JSON.stringify(generatedConfig, null, 2); const blob = new Blob([jsonString], { type: 'application/json' }); const url = URL.createObjectURL(blob); // Generate filename with timestamp const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); const filename = `lakehouse-operations-${timestamp}.json`; // Trigger download const a = document.createElement('a'); a.href = url; a.download = filename; document.body.appendChild(a); a.click(); document.body.removeChild(a); URL.revokeObjectURL(url); showToast(`Configuration downloaded as ${filename}`, 'success'); } // ============================================================================ // UI HELPER FUNCTIONS // ============================================================================ /** * Enable the JSON action buttons (copy, download) * Called after JSON is successfully generated */ function enableActionButtons() { copyBtn.disabled = false; downloadBtn.disabled = false; } 
/** * Show the loading spinner overlay */ function showLoading() { loading.style.display = 'flex'; } /** * Hide the loading spinner overlay */ function hideLoading() { loading.style.display = 'none'; } /** * Display a toast notification message * @param {string} message - The message to display * @param {string} type - The toast type ('success' or 'error') */ function showToast(message, type = 'success') { toast.textContent = message; toast.className = `toast ${type}`; toast.classList.add('show'); // Auto-hide after 3 seconds setTimeout(() => { toast.classList.remove('show'); }, 3000); } // ============================================================================ // LOCAL STORAGE PERSISTENCE // ============================================================================ /** * Save the current operations and state to localStorage * Allows users to resume work after page reload */ function saveToLocalStorage() { const data = { operations: operations, currentTab: currentTab, timestamp: Date.now() }; localStorage.setItem('lakehouse-operations-generator', JSON.stringify(data)); } /** * Load previously saved operations and state from localStorage * Only loads data saved within the last 24 hours */ function loadFromLocalStorage() { try { const saved = localStorage.getItem('lakehouse-operations-generator'); if (saved) { const data = JSON.parse(saved); // Only load if saved within last 24 hours if (Date.now() - data.timestamp < 24 * 60 * 60 * 1000) { operations = data.operations || []; renderOperationsList(); updateGenerateButtonState(); if (data.currentTab) { switchTab(data.currentTab); } } } } catch (error) { console.warn('Could not load saved data:', error); } } // ============================================================================ // KEYBOARD SHORTCUTS // ============================================================================ /** * Handle keyboard shortcuts for common actions * - Ctrl/Cmd + G: Generate JSON * - Ctrl/Cmd + A: Add operation (when 
operation selector focused) * - Ctrl + Delete: Clear all operations */ document.addEventListener('keydown', function(event) { // Ctrl+G or Cmd+G - Generate JSON if ((event.ctrlKey || event.metaKey) && event.key === 'g') { event.preventDefault(); generateJSON(); } // Ctrl+A or Cmd+A when focused on operation selector - Add operation if ((event.ctrlKey || event.metaKey) && event.key === 'a' && (event.target === tableOperationSelect || event.target === fileOperationSelect)) { event.preventDefault(); addCurrentOperation(); } // Ctrl + Delete - Clear operations if (event.key === 'Delete' && event.ctrlKey && operations.length > 0) { event.preventDefault(); clearAllOperations(); } }); // ============================================================================ // FINAL INITIALIZATION // ============================================================================ /** * Initialize button states when page loads * Ensures all buttons are in the correct enabled/disabled state */ document.addEventListener('DOMContentLoaded', function() { updateAddButtonState(); updateGenerateButtonState(); }); ================================================ FILE: lakehouse_engine_usage/managerhelper/operations-styles-mkdocs.css ================================================ /* Import base styles */ /* Operations-specific styles for MkDocs */ .managerhelper-wrapper .operation-selector { background: #e3f2fd; padding: 1.5rem; border-radius: 4px; margin-bottom: 2rem; border-left: 4px solid #2196f3; } .managerhelper-wrapper .operation-selector label { display: block; margin-bottom: 0.5rem; font-weight: 500; color: #1565c0; } .managerhelper-wrapper .operation-selector select { width: 100%; padding: 12px 15px; border: 1px solid #90caf9; border-radius: 4px; font-size: 0.875rem; background: white; color: #1565c0; transition: all 0.3s ease; } .managerhelper-wrapper .operation-selector select:focus { border-color: #2196f3; outline: none; box-shadow: 0 0 0 2px rgba(33, 150, 243, 0.2); } /* Operations 
List */ .managerhelper-wrapper .operations-list-container { background: #fafafa; border-radius: 4px; margin: 0 2rem 2rem; overflow: hidden; box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); border: 1px solid #e0e0e0; } .managerhelper-wrapper .operations-header { display: flex; justify-content: space-between; align-items: center; padding: 1.5rem; background: #2196f3; color: white; } .managerhelper-wrapper .operations-header h3 { margin: 0; display: flex; align-items: center; gap: 10px; } .managerhelper-wrapper .operations-actions { display: flex; gap: 10px; } .managerhelper-wrapper .operations-list { padding: 1rem; max-height: 400px; overflow-y: auto; } .managerhelper-wrapper .empty-operations { text-align: center; padding: 2rem; color: rgba(0, 0, 0, 0.54); } .managerhelper-wrapper .empty-operations i { font-size: 2rem; margin-bottom: 1rem; opacity: 0.5; } .managerhelper-wrapper .operation-item { background: white; border: 1px solid #e0e0e0; border-radius: 4px; padding: 1rem; margin-bottom: 0.5rem; display: flex; justify-content: space-between; align-items: flex-start; transition: all 0.3s ease; } .managerhelper-wrapper .operation-item:hover { box-shadow: 0 2px 4px rgba(0, 0, 0, 0.15); transform: translateY(-1px); } .managerhelper-wrapper .operation-info { flex: 1; } .managerhelper-wrapper .operation-title { font-weight: 500; color: rgba(0, 0, 0, 0.87); margin-bottom: 0.5rem; display: flex; align-items: center; gap: 8px; } .managerhelper-wrapper .operation-title i { color: #2196f3; } .managerhelper-wrapper .operation-details { font-size: 0.8rem; color: rgba(0, 0, 0, 0.54); } .managerhelper-wrapper .operation-actions { display: flex; gap: 8px; margin-left: 1rem; } .managerhelper-wrapper .btn-edit { background: #ffd54f; color: rgba(0, 0, 0, 0.87); border: none; } .managerhelper-wrapper .btn-edit:hover { background: #ffca28; } .managerhelper-wrapper .btn-remove { background: #f44336; color: white; border: none; } .managerhelper-wrapper .btn-remove:hover { background: #d32f2f; 
} /* Field Groups */ .managerhelper-wrapper .field-group { background: #fafafa; padding: 1.5rem; border-radius: 4px; margin-bottom: 1.5rem; border: 1px solid #e0e0e0; border-left: 3px solid #4caf50; } .managerhelper-wrapper .field-group h4 { color: rgba(0, 0, 0, 0.87); margin-bottom: 1rem; display: flex; align-items: center; gap: 8px; font-weight: 500; } .managerhelper-wrapper .field-group h4 i { color: #4caf50; } .managerhelper-wrapper .field-row { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 1rem; margin-bottom: 1rem; } .managerhelper-wrapper .field-item { display: flex; flex-direction: column; } .managerhelper-wrapper .field-item label { margin-bottom: 0.5rem; font-weight: 500; color: rgba(0, 0, 0, 0.87); } .managerhelper-wrapper .field-item input, .managerhelper-wrapper .field-item select, .managerhelper-wrapper .field-item textarea { padding: 10px 12px; border: 1px solid #bdbdbd; border-radius: 4px; font-size: 0.875rem; transition: all 0.3s ease; } .managerhelper-wrapper .field-item input:focus, .managerhelper-wrapper .field-item select:focus, .managerhelper-wrapper .field-item textarea:focus { outline: none; border-color: #2196f3; box-shadow: 0 0 0 2px rgba(33, 150, 243, 0.2); } .managerhelper-wrapper .field-help { font-size: 0.8rem; color: rgba(0, 0, 0, 0.54); margin-top: 0.25rem; } .managerhelper-wrapper .field-required { color: #f44336; } /* Form validation */ .managerhelper-wrapper .form-control.invalid { border-color: #f44336; background-color: #ffebee; } .managerhelper-wrapper .form-control.valid { border-color: #4caf50; background-color: #f1f8e9; } .managerhelper-wrapper .validation-message { font-size: 0.8rem; margin-top: 0.25rem; } .managerhelper-wrapper .validation-message.error { color: #f44336; } .managerhelper-wrapper .validation-message.success { color: #4caf50; } /* Operation Type Badges */ .managerhelper-wrapper .operation-badge { display: inline-block; padding: 0.25rem 0.5rem; font-size: 0.75rem; 
font-weight: 500; border-radius: 0.25rem; text-transform: uppercase; margin-right: 0.5rem; } .managerhelper-wrapper .badge-table { background-color: #e3f2fd; color: #1565c0; } .managerhelper-wrapper .badge-file { background-color: #fff3e0; color: #ef6c00; } /* Responsive Design */ @media (max-width: 768px) { .managerhelper-wrapper .field-row { grid-template-columns: 1fr; } .managerhelper-wrapper .operations-header { flex-direction: column; gap: 1rem; align-items: stretch; } .managerhelper-wrapper .operations-actions { justify-content: center; } .managerhelper-wrapper .operation-item { flex-direction: column; gap: 1rem; } .managerhelper-wrapper .operation-actions { margin-left: 0; justify-content: flex-end; } .managerhelper-wrapper .operations-list-container { margin: 0 1rem 1rem; } } ================================================ FILE: lakehouse_engine_usage/managerhelper/styles-mkdocs.css ================================================ /* MkDocs-scoped styles for Manager Helper with Material Design theme */ .managerhelper-wrapper { font-family: 'Roboto', 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; color: #333; margin: 0 -24px; padding: 0; } .managerhelper-wrapper * { box-sizing: border-box; } /* Header */ .managerhelper-wrapper .header { text-align: center; margin-bottom: 0; padding: 2rem 1rem; background: #2196f3; color: white; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); } .managerhelper-wrapper .logo { display: flex; align-items: center; justify-content: center; gap: 15px; margin-bottom: 10px; } .managerhelper-wrapper .logo i { font-size: 2.5rem; } .managerhelper-wrapper .header h1 { font-size: 2rem; font-weight: 500; margin: 0; } .managerhelper-wrapper .subtitle { font-size: 1rem; opacity: 0.9; margin-top: 10px; } /* Navigation Tabs */ .managerhelper-wrapper .tabs { display: flex; gap: 0; margin-bottom: 0; border-bottom: 2px solid #e0e0e0; overflow-x: auto; background: #fafafa; padding: 0 2rem; } .managerhelper-wrapper .tab-button { 
display: flex; align-items: center; gap: 8px; padding: 14px 24px; border: none; background: transparent; cursor: pointer; font-size: 0.875rem; color: rgba(0, 0, 0, 0.6); border-bottom: 2px solid transparent; transition: all 0.3s ease; white-space: nowrap; font-weight: 500; } .managerhelper-wrapper .tab-button:hover { color: rgba(0, 0, 0, 0.87); background: rgba(33, 150, 243, 0.08); } .managerhelper-wrapper .tab-button.active { color: #2196f3; border-bottom-color: #ffd54f; background: white; } .managerhelper-wrapper .tab-button i { font-size: 1rem; } /* Form Container */ .managerhelper-wrapper .operations-container { padding: 2rem; } .managerhelper-wrapper .tab-content { display: none; animation: fadeIn 0.3s ease-in; } .managerhelper-wrapper .tab-content.active { display: block; } @keyframes fadeIn { from { opacity: 0; transform: translateY(10px); } to { opacity: 1; transform: translateY(0); } } .managerhelper-wrapper .section { background: #fafafa; padding: 2rem; border-radius: 4px; box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12); border: 1px solid #e0e0e0; } .managerhelper-wrapper .section h2 { display: flex; align-items: center; gap: 10px; color: rgba(0, 0, 0, 0.87); margin-bottom: 1.5rem; font-size: 1.5rem; font-weight: 500; } .managerhelper-wrapper .section h2 i { color: #2196f3; } /* Form Groups */ .managerhelper-wrapper .form-group { margin-bottom: 1.5rem; } .managerhelper-wrapper .form-group label { display: block; margin-bottom: 0.5rem; font-weight: 500; color: rgba(0, 0, 0, 0.87); } .managerhelper-wrapper .form-control { width: 100%; padding: 12px 15px; border: 1px solid #bdbdbd; border-radius: 4px; font-size: 0.875rem; transition: all 0.3s ease; background: white; } .managerhelper-wrapper .form-control:focus { outline: none; border-color: #2196f3; box-shadow: 0 0 0 2px rgba(33, 150, 243, 0.2); } .managerhelper-wrapper textarea.form-control { resize: vertical; font-family: 'Roboto Mono', 'Fira Code', monospace; line-height: 1.5; } .managerhelper-wrapper 
.help-text { display: block; margin-top: 0.25rem; color: #6c757d; font-size: 0.8rem; } /* Dynamic Fields */ .managerhelper-wrapper .dynamic-fields { min-height: 200px; } .managerhelper-wrapper .no-operation-selected { text-align: center; padding: 3rem 2rem; color: #757575; } .managerhelper-wrapper .no-operation-selected i { font-size: 3rem; margin-bottom: 1rem; opacity: 0.5; } /* Actions */ .managerhelper-wrapper .actions { display: flex; gap: 15px; margin-bottom: 2rem; flex-wrap: wrap; justify-content: center; padding: 1rem 2rem; } .managerhelper-wrapper .btn { display: inline-flex; align-items: center; gap: 8px; padding: 12px 20px; border: none; border-radius: 4px; font-size: 0.875rem; cursor: pointer; transition: all 0.3s ease; text-decoration: none; font-weight: 500; } .managerhelper-wrapper .btn:disabled { opacity: 0.6; cursor: not-allowed; } .managerhelper-wrapper .btn-primary { background: #2196f3; color: white; } .managerhelper-wrapper .btn-primary:hover:not(:disabled) { background: #1976d2; transform: translateY(-1px); box-shadow: 0 2px 8px rgba(33, 150, 243, 0.4); } .managerhelper-wrapper .btn-secondary { background: #757575; color: white; } .managerhelper-wrapper .btn-secondary:hover:not(:disabled) { background: #616161; transform: translateY(-1px); } .managerhelper-wrapper .btn-outline { background: transparent; border: 2px solid #f44336; color: #f44336; } .managerhelper-wrapper .btn-outline:hover { background: #f44336; color: white; } .managerhelper-wrapper .btn-sm { padding: 6px 12px; font-size: 0.8rem; } /* Output Container */ .managerhelper-wrapper .output-container { background: #263238; border-radius: 4px; overflow: hidden; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2); margin: 0 2rem 2rem; } .managerhelper-wrapper .output-header { display: flex; justify-content: space-between; align-items: center; padding: 1rem 1.5rem; background: #37474f; color: white; border-bottom: 1px solid #455a64; } .managerhelper-wrapper .output-header h3 { display: flex; 
align-items: center; gap: 10px; margin: 0; } .managerhelper-wrapper .output-actions { display: flex; gap: 10px; } .managerhelper-wrapper .json-output { background: #263238; color: #eceff1; padding: 1.5rem; margin: 0; font-family: 'Roboto Mono', 'Fira Code', 'Courier New', monospace; font-size: 0.8rem; line-height: 1.5; overflow-x: auto; min-height: 200px; white-space: pre-wrap; } .managerhelper-wrapper .json-output:empty::before { content: 'Generated JSON configuration will appear here...'; color: #90a4ae; font-style: italic; } /* Validation Result */ .managerhelper-wrapper .validation-result { padding: 1rem 1.5rem; font-weight: 500; display: none; } .managerhelper-wrapper .validation-result.valid { background: #1b5e20; color: #81c784; display: block; } .managerhelper-wrapper .validation-result.invalid { background: #b71c1c; color: #ef5350; display: block; } /* Loading Spinner */ .managerhelper-wrapper .loading { position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0, 0, 0, 0.8); display: flex; flex-direction: column; justify-content: center; align-items: center; z-index: 2000; color: white; } .managerhelper-wrapper .spinner { width: 50px; height: 50px; border: 5px solid rgba(255, 255, 255, 0.3); border-top: 5px solid #ffd54f; border-radius: 50%; animation: spin 1s linear infinite; margin-bottom: 1rem; } @keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } } /* Toast Notification */ .managerhelper-wrapper .toast { position: fixed; top: 80px; right: 20px; background: #4caf50; color: white; padding: 1rem 1.5rem; border-radius: 4px; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.3); transform: translateX(400px); transition: transform 0.3s ease; z-index: 2001; } .managerhelper-wrapper .toast.show { transform: translateX(0); } .managerhelper-wrapper .toast.error { background: #f44336; } /* Responsive Design */ @media (max-width: 768px) { .managerhelper-wrapper { margin: 0 -16px; } .managerhelper-wrapper .header h1 { 
font-size: 1.5rem; } .managerhelper-wrapper .tabs { padding: 0 1rem; } .managerhelper-wrapper .operations-container { padding: 1rem; } .managerhelper-wrapper .section { padding: 1rem; } .managerhelper-wrapper .actions { flex-direction: column; padding: 1rem; } .managerhelper-wrapper .output-container { margin: 0 1rem 1rem; } } ================================================ FILE: lakehouse_engine_usage/reconciliator/__init__.py ================================================ """ .. include::reconciliator.md """ ================================================ FILE: lakehouse_engine_usage/reconciliator/reconciliator.md ================================================ # Reconciliator Checking if data reconciles, using this algorithm, is a matter of reading the **truth** data and the **current** data. You can use any input specification compatible with the lakehouse engine to read **truth** or **current** data. On top of that, you can pass a `truth_preprocess_query` and a `current_preprocess_query` so you can preprocess the data before it goes into the actual reconciliation process. The reconciliation process is focused on joining **truth** with **current** by all provided columns except the ones passed as `metrics`. In the table below, we present what a simple reconciliation would look like: | current_country | current_count | truth_country | truth_count | absolute_diff | perc_diff | yellow | red | recon_type | |-----------------|---------------|---------------|-------------|---------------|-----------|--------|-----|------------| | Sweden | 123 | Sweden | 120 | 3 | 0.025 | 0.1 | 0.2 | percentage | | Germany | 2946 | Germany | 2946 | 0 | 0 | 0.1 | 0.2 | percentage | | France | 2901 | France | 2901 | 0 | 0 | 0.1 | 0.2 | percentage | | Belgium | 426 | Belgium | 425 | 1 | 0.002 | 0.1 | 0.2 | percentage | The Reconciliator algorithm uses an ACON to configure its execution. 
You can find the meaning of each ACON property in [ReconciliatorSpec object](../../reference/packages/core/definitions.md#packages.core.definitions.ReconciliatorSpec). Below there is an example of usage of reconciliator. ```python from lakehouse_engine.engine import execute_reconciliation truth_query = """ SELECT shipping_city, sum(sales_order_qty) as qty, order_date_header FROM ( SELECT ROW_NUMBER() OVER ( PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city ORDER BY changed_on desc ) as rank1, sales_order_header, sales_order_item, sales_order_qty, order_date_header, shipping_city FROM truth -- truth is a locally accessible temp view created by the lakehouse engine WHERE order_date_header = '2021-10-01' ) a WHERE a.rank1 = 1 GROUP BY a.shipping_city, a.order_date_header """ current_query = """ SELECT shipping_city, sum(sales_order_qty) as qty, order_date_header FROM ( SELECT ROW_NUMBER() OVER ( PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city ORDER BY changed_on desc ) as rank1, sales_order_header, sales_order_item, sales_order_qty, order_date_header, shipping_city FROM current -- current is a locally accessible temp view created by the lakehouse engine WHERE order_date_header = '2021-10-01' ) a WHERE a.rank1 = 1 GROUP BY a.shipping_city, a.order_date_header """ acon = { "metrics": [{"metric": "qty", "type": "percentage", "aggregation": "avg", "yellow": 0.05, "red": 0.1}], "truth_input_spec": { "spec_id": "truth", "read_type": "batch", "data_format": "csv", "schema_path": "s3://my_data_product_bucket/artefacts/metadata/schemas/bronze/orders.json", "options": { "delimiter": "^", "dateFormat": "yyyyMMdd", }, "location": "s3://my_data_product_bucket/bronze/orders", }, "truth_preprocess_query": truth_query, "current_input_spec": { "spec_id": "current", "read_type": "batch", "data_format": "delta", "db_table": "my_database.orders", }, "current_preprocess_query": current_query, } 
execute_reconciliation(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensor/__init__.py ================================================ """ .. include::sensor.md """ ================================================ FILE: lakehouse_engine_usage/sensor/delta_table/__init__.py ================================================ """ .. include::delta_table.md """ ================================================ FILE: lakehouse_engine_usage/sensor/delta_table/delta_table.md ================================================ # Sensor from Delta Table This shows how to create a **Sensor to detect new data from a Delta Table**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. - **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. - **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. If you want to view some examples of usage you can visit the [delta upstream sensor table](../delta_upstream_sensor_table/delta_upstream_sensor_table.md) or the [jdbc sensor](../jdbc_table/jdbc_table.md). - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. 
[The `fail_on_empty_result=True` (the default and **SUGGESTED** behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Data will be consumed from a delta table in streaming mode, so if there is any new data it will give condition to proceed to the next task. ### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "streaming", "data_format": "delta", "db_table": "upstream_database.source_delta_table", "options": { "readChangeFeed": "true", # to read changes in upstream table }, }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensor/delta_upstream_sensor_table/__init__.py ================================================ """ .. include::delta_upstream_sensor_table.md """ ================================================ FILE: lakehouse_engine_usage/sensor/delta_upstream_sensor_table/delta_upstream_sensor_table.md ================================================ # Sensor from other Sensor Delta Table This shows how to create a **Sensor to detect new data from another Sensor Delta Table**. 
## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. - **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. - **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) It makes use of `generate_sensor_query` to generate the `preprocess_query`, different from [delta_table](../delta_table/delta_table.md). Data from other sensor delta table, in streaming mode, will be consumed. If there is any new data it will trigger the condition to proceed to the next task. 
### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor, generate_sensor_query acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "streaming", "data_format": "delta", "db_table": "upstream_database.lakehouse_engine_sensors", "options": { "readChangeFeed": "true", }, }, "preprocess_query": generate_sensor_query("UPSTREAM_SENSOR_ID"), "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensor/file/__init__.py ================================================ """ .. include::file.md """ ================================================ FILE: lakehouse_engine_usage/sensor/file/file.md ================================================ # Sensor from Files This shows how to create a **Sensor to detect new data from a File Location**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. - **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. - **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! 
note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Using these sensors and consuming the data in streaming mode, if any new file is added to the file location, it will automatically trigger the proceeding task. ### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "streaming", "data_format": "csv", # You can use any of the data formats supported by the lakehouse engine, e.g: "avro|json|parquet|csv|delta|cloudfiles" "location": "s3://my_data_product_bucket/path", }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. 
```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensor/jdbc_table/__init__.py ================================================ """ .. include::jdbc_table.md """ ================================================ FILE: lakehouse_engine_usage/sensor/jdbc_table/jdbc_table.md ================================================ # Sensor from JDBC This shows how to create a **Sensor to detect new data from a JDBC table**. ## Configuration required to have a Sensor - **jdbc_args**: Arguments of the JDBC upstream. - **generate_sensor_query**: Generates a Sensor query to consume data from the upstream, this function can be used on `preprocess_query` ACON option. - **sensor_id**: The unique identifier for the Sensor. - **filter_exp**: Expression to filter incoming new data. A placeholder `?upstream_key` and `?upstream_value` can be used, example: `?upstream_key > ?upstream_value` so that it can be replaced by the respective values from the sensor `control_db_table_name` for this specific sensor_id. - **control_db_table_name**: Sensor control table name. - **upstream_key**: the key of custom sensor information to control how to identify new data from the upstream (e.g., a time column in the upstream). - **upstream_value**: the **first** upstream value to identify new data from the upstream (e.g., the value of a time present in the upstream). ***Note:*** This parameter will have effect just in the first run to detect if the upstream have new data. If it's empty the default value applied is `-2147483647`. - **upstream_table_name**: Table name to consume the upstream value. If it's empty the default value applied is `sensor_new_data`. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). 
## Scenarios This covers the following scenarios of using the Sensor: 1. [Generic JDBC template with `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [Generic JDBC template with `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Data from JDBC, in batch mode, will be consumed. If there is new data based on the preprocess query from the source table, it will trigger the condition to proceed to the next task. ### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor, generate_sensor_query acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "JDBC_URL", "table": "JDBC_DB_TABLE", "properties": { "user": "JDBC_USERNAME", "password": "JDBC_PWD", "driver": "JDBC_DRIVER", }, }, "options": { "compress": True, }, }, "preprocess_query": generate_sensor_query( sensor_id="MY_SENSOR_ID", filter_exp="?upstream_key > '?upstream_value'", control_db_table_name="my_database.lakehouse_engine_sensors", upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA", ), "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensor/kafka/__init__.py ================================================ """ .. 
include::kafka.md """ ================================================ FILE: lakehouse_engine_usage/sensor/kafka/kafka.md ================================================ # Sensor from Kafka This shows how to create a **Sensor to detect new data from Kafka**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. - **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. - **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Data from Kafka, in streaming mode, will be consumed, so if there is any new data in the kafka topic it will give condition to proceed to the next task. 
### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "streaming", "data_format": "kafka", "options": { "kafka.bootstrap.servers": "KAFKA_SERVER", "subscribe": "KAFKA_TOPIC", "startingOffsets": "earliest", "kafka.security.protocol": "SSL", "kafka.ssl.truststore.location": "TRUSTSTORE_LOCATION", "kafka.ssl.truststore.password": "TRUSTSTORE_PWD", "kafka.ssl.keystore.location": "KEYSTORE_LOCATION", "kafka.ssl.keystore.password": "KEYSTORE_PWD", }, }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensor/sap_bw_b4/__init__.py ================================================ """ .. include::sap_bw_b4.md """ ================================================ FILE: lakehouse_engine_usage/sensor/sap_bw_b4/sap_bw_b4.md ================================================ # Sensor from SAP This shows how to create a **Sensor to detect new data from a SAP LOGCHAIN table**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. - **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. 
- **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. Specific configuration required to have a Sensor consuming a SAP BW/B4 upstream. The Lakehouse Engine provides two utility functions to make it easier to consume SAP as an upstream: `generate_sensor_sap_logchain_query` and `generate_sensor_query`. - **generate_sensor_sap_logchain_query**: This function aims to create a temporary table with timestamp from the SAP LOGCHAIN table, which is a process control table. !!! note This temporary table only lives during runtime, and it is related to the SAP process control table but has no relationship or effect on the sensor control table. - **chain_id**: SAP Chain ID process. - **dbtable**: SAP LOGCHAIN db table name, default: `my_database.RSPCLOGCHAIN`. - **status**: SAP Chain Status of your process, default: `G`. - **engine_table_name**: Name of the temporary table created from the upstream data, default: `sensor_new_data`. This temporary table will be used as source in the `query` option. - **generate_sensor_query**: Generates a Sensor query to consume data from the temporary table created in the `prepareQuery`. - **sensor_id**: The unique identifier for the Sensor. - **filter_exp**: Expression to filter incoming new data. 
A placeholder `?upstream_key` and `?upstream_value` can be used, example: `?upstream_key > ?upstream_value` so that it can be replaced by the respective values from the sensor `control_db_table_name` for this specific sensor_id. - **control_db_table_name**: Sensor control table name. - **upstream_key**: the key of custom sensor information to control how to identify new data from the upstream (e.g., a time column in the upstream). - **upstream_value**: the **first** upstream value to identify new data from the upstream (e.g., the value of a time present in the upstream). .. note:: This parameter will have effect just in the first run to detect if the upstream has new data. If it's empty the default value applied is `-2147483647`. - **upstream_table_name**: Table name to consume the upstream value. If it's empty the default value applied is `sensor_new_data`. .. note:: In case of using the `generate_sensor_sap_logchain_query` the default value for the temp table is `sensor_new_data`, so if passing a different value in the `engine_table_name` this parameter should have the same value. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Data from SAP, in batch mode, will be consumed, so if there is any new data in the SAP LOGCHAIN table it will trigger the condition to proceed to the next task. 
### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor, generate_sensor_query, generate_sensor_sap_logchain_query acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "batch", "data_format": "jdbc", "options": { "compress": True, "driver": "JDBC_DRIVER", "url": "JDBC_URL", "user": "JDBC_USERNAME", "password": "JDBC_PWD", "prepareQuery": generate_sensor_sap_logchain_query(chain_id="CHAIN_ID", dbtable="JDBC_DB_TABLE"), "query": generate_sensor_query( sensor_id="MY_SENSOR_ID", filter_exp="?upstream_key > '?upstream_value'", control_db_table_name="my_database.lakehouse_engine_sensors", upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA", ), }, }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensor/sensor.md ================================================ # Sensor ## What is it? The lakehouse engine sensors are an abstraction to otherwise complex spark code that can be executed in very small single-node clusters to check if an upstream system or data product contains new data since the last execution of our job. 
With this feature, we can trigger a job to run in more frequent intervals and if the upstream does not contain new data, then the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction, Transformation, and Loading). ## How do Sensor-based jobs work? image With the sensors capability, data products in the lakehouse can sense if another data product or an upstream system (source system) have new data since the last successful job. We accomplish this through the approach illustrated above, which can be interpreted as follows: 1. A Data Product can check if Kafka, JDBC or any other Lakehouse Engine Sensors supported sources, contains new data using the respective sensors; 2. The Sensor task may run in a very tiny single-node cluster to ensure cost efficiency ([check sensor cost efficiency](#are-sensor-based-jobs-cost-efficient)); 3. If the sensor has recognised that there is new data in the upstream, then you can start a different ETL Job Cluster to process all the ETL tasks (data processing tasks). 4. In the same way, a different Data Product can sense if an upstream Data Product has new data by using 1 of 2 options: 1. **(Preferred)** Sense the upstream Data Product sensor control delta table; 2. Sense the upstream Data Product data files in s3 (files sensor) or any of their delta tables (delta table sensor); ## The Structure and Relevance of the Data Product’s Sensors Control Table The concept of a lakehouse engine sensor is based on a special delta table stored inside the data product that chooses to opt in for a sensor-based job. That table is used to control the status of the various sensors implemented by that data product. 
You can refer to the below table to understand the sensor delta table structure: | Column Name | Type | Description | |-----------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **sensor_id** | STRING | A unique identifier of the sensor in a specific job. This unique identifier is really important because it is used by the engine to identify if there is new data in the upstream.
Each sensor in each job should have a different sensor_id.
If you attempt to create 2 sensors with the same sensor_id, the engine will fail. | | **assets** | ARRAY\<STRING\> | A list of assets (e.g., tables or dataset folder) that are considered as available to consume downstream after the sensor has status *PROCESSED_NEW_DATA*. | | **status** | STRING | Status of the sensor. Can either be:
  • *ACQUIRED_NEW_DATA* – when the sensor in a job has recognised that there is new data from the upstream but, the job where the sensor is, was still not successfully executed.
  • *PROCESSED_NEW_DATA* - when the job where the sensor is located has processed all the tasks in that job.
| | **status_change_timestamp** | STRING | Timestamp when the status has changed for the last time. | | **checkpoint_location** | STRING | Base location of the Spark streaming checkpoint location, when applicable (i.e., when the type of sensor uses Spark streaming checkpoints to identify if the upstream has new data). E.g. Spark streaming checkpoints are used for Kafka, Delta and File sensors. | | **upstream_key** | STRING | Upstream key (e.g., used to store an attribute name from the upstream so that new data can be detected automatically).
This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database). | | **upstream_value** | STRING | Upstream value (e.g., used to store the max attribute value from the upstream so that new data can be detected automatically). This is the value for upstream_key.
This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the value of a field in the JDBC upstream that contains the maximum value that was processed by the sensor, and therefore useful for recognizing that there is new data in the upstream (e.g., the value of a timestamp attribute in the upstream that tells us when the record was loaded into the database). | !!! note To make use of the sensors you will need to add this table to your data product. ## How is it different from scheduled jobs? Sensor-based jobs are still scheduled, but they can be scheduled with higher frequency, as they are more cost-efficient than ramping up a multi-node cluster supposed to do heavy ETL, only to figure out that the upstream does not have new data. ## Are sensor-based jobs cost-efficient? For the same schedule (e.g., 4 times a day), sensor-based jobs are more cost-efficient than scheduling a regular job, because with sensor-based jobs you can start a **very tiny single-node cluster**, and only if there is new data in the upstream the bigger ETL cluster is spun up. For this reason, they are considered more cost-efficient. Moreover, if you have very hard SLAs to comply with, you can also play with alternative architectures where you can have several sensors in a continuous (always running) cluster, which then keeps triggering the respective data processing jobs, whenever there is new data. ## Sensor Steps 1. Create your sensor task for the upstream source. Examples of available sources: - [Delta Table](delta_table/delta_table.md) - [Delta Upstream Sensor Table](delta_upstream_sensor_table/delta_upstream_sensor_table.md) - [File](file/file.md) - [JDBC](jdbc_table/jdbc_table.md) - [Kafka](kafka/kafka.md) - [SAP BW/B4](sap_bw_b4/sap_bw_b4.md) 2. Setup/Execute your ETL task based on the Sensor Condition 3. 
Update the Sensor Control table status with the [Update Sensor Status](update_sensor_status/update_sensor_status.md) ================================================ FILE: lakehouse_engine_usage/sensor/update_sensor_status/__init__.py ================================================ """ .. include::update_sensor_status.md """ ================================================ FILE: lakehouse_engine_usage/sensor/update_sensor_status/update_sensor_status.md ================================================ # Update Sensor control delta table after processing the data This shows how to **update the status of your Sensor after processing the new data**. Here is an example on how to update the status of your sensor in the Sensors Control Table: ```python from lakehouse_engine.engine import update_sensor_status update_sensor_status( sensor_id="MY_SENSOR_ID", control_db_table_name="my_database.lakehouse_engine_sensors", status="PROCESSED_NEW_DATA", assets=["MY_SENSOR_ASSETS"] ) ``` If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ================================================ FILE: lakehouse_engine_usage/sensors/__init__.py ================================================ """ .. include::sensors.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/__init__.py ================================================ """ .. include::heartbeat.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/delta_table/__init__.py ================================================ """ .. 
include::delta_table.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/delta_table/delta_table.md ================================================ # Heartbeat Sensor for Delta Table This shows how to create a Heartbeat Sensor Orchestrator to detect new data from a Delta Table and trigger Databricks Workflows related to them. ## Configuration required to create an orchestration task for the delta table source - **sensor_source**: Set to `delta_table` in the Heartbeat Control Table to identify this as a Delta table source. - **data_format**: Set to `delta` to specify the data format for reading Delta tables. - **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`). - **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`). - **options**: Configuration options for Delta table reading: - `readChangeFeed`: Set to `"true"` to enable change data feed reading. - **base_checkpoint_location**: `S3` path for storing checkpoint data (required if `sensor_read_type` is `streaming`). - **domain**: Databricks workflows domain for job triggering. - **token**: Databricks workflows token for authentication. ### Delta Table Data Feed CSV Configuration Entry To check how the entry for a Delta table source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#heartbeat-sensor-control-table-reference-records). ## Code sample of listener and trigger ```python from lakehouse_engine.engine import ( execute_sensor_heartbeat, trigger_heartbeat_sensor_jobs, ) # Create an ACON dictionary for all delta table source entries. # This ACON dictionary is useful for passing parameters to heartbeat sensors. 
heartbeat_sensor_config_acon = { "sensor_source": "delta_table", "data_format": "delta", "heartbeat_sensor_db_table": "my_database.heartbeat_sensor", "lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors", "options": { "readChangeFeed": "true", }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "domain": "DATABRICKS_WORKFLOWS_DOMAIN", "token": "DATABRICKS_WORKFLOWS_TOKEN", } # Execute Heartbeat sensor and trigger jobs which have acquired new data. execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon) trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/heartbeat.md ================================================ # Heartbeat Sensor ## What is it? The Heartbeat Sensor is a robust, configurable system designed to continuously monitor upstream systems for new data. It enhances the existing sensor infrastructure by addressing key limitations and providing significant improvements: **Previous Sensor Architecture Limitations:** - Required individual sensor configurations for each data source. - Limited scalability when monitoring multiple upstream systems. - Manual job triggering and dependency management. - No centralized control or monitoring of sensor status. - Difficult to manage complex multi-source dependencies. **Heartbeat Sensor Enhancements:** - **Centralized Management**: Single control table to manage all sensor sources and their dependencies. - **Automated Job Orchestration**: Automatically triggers downstream Databricks jobs when new data is detected. - **Multi-Source Support**: Handles diverse source types (SAP, Kafka, Delta Tables, Manual Uploads, Trigger Files) in one unified system. - **Dependency Management**: Built-in hard/soft dependency validation before triggering jobs. - **Scalable Architecture**: Efficiently processes multiple sensors in parallel. 
- **Status Tracking**: Comprehensive lifecycle tracking from detection to job completion. This provides a centralized, efficient, and automated mechanism to detect and trigger downstream workflows with minimal user intervention. ## How Does the Heartbeat Sensor Work? image The Heartbeat Sensor operates on a pull-based approach using a single-node cluster that continuously monitors upstream systems. Here's how the system works: ### Core Architecture Components **1. [Centralized Control Table](#control-table-schema)** - Tracks and manages all data sources and their configurations. - Dynamically populated by the [Heartbeat Data Feeder Job](heartbeat_sensor_data_feed/heartbeat_sensor_data_feed.md). - Provides structured monitoring across various upstream systems. **2. Persistent Heartbeat Job** - Runs continuously or on a user-defined schedule. - Supports both real-time and batch-style data monitoring. - Efficiently processes multiple sensors in parallel. **3. Sensor Integration Framework** - Leverages existing sensor mechanisms for event detection. - Creates appropriate Sensor ACONs based on source types. - Returns `NEW_EVENT_AVAILABLE` status when new data is detected. **4. Automated Job Orchestration** - Triggers Databricks jobs via Job Run API when conditions are met. - Validates dependencies before job execution. - Maintains comprehensive audit trail of all operations. ### Operational Flow 1. **Continuous Monitoring**: The heartbeat cluster continuously polls configured sensor sources. 2. **Event Detection**: Checks each source for `NEW_EVENT_AVAILABLE` status. 3. **Dependency Validation**: Evaluates hard/soft dependencies before triggering jobs. 4. **Automatic Triggering**: Launches Databricks jobs when all conditions are satisfied. 5. **Status Management**: Updates control table throughout the entire lifecycle. !!! warning "Pull-Based Architecture" The system is designed for a "pull" approach, same as the Sensor solution. 
Downstream data product sensor clusters actively check for new events from the upstream. Upstream sensor clusters do not require write permissions to the downstream data product system. Just read access is required for upstream from downstream system. ### Control Table Schema The Heartbeat Sensor Control Table is the central component that manages all sensor sources and their configurations. Below is the complete schema with detailed descriptions: | Column name | Data Type | Description | Produced/Maintained by | |------------------------------------|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------| | **sensor_source** | STRING | Upstream source system
  • `sap_b4` - SAP 4HANA
  • `sap_bw` - SAP BW
  • `delta_table`
  • `lmu_delta_table` - Lakehouse Manual Upload
  • `kafka`
  • `trigger_file`
| User/Developer | | **sensor_id** | STRING | Unique Upstream id or upstream reference.
  • **sap_bw** or **sap_b4** source:
    SAP Chain Id, example: `SAP_CHAIN_ID_SAP_TABLE`
  • **delta_table** source: Delta table name along with database name, examples: `my_database_1.my_table`; `my_database_2.my_table_2`
  • **lmu_delta_table** source: Lakehouse Manual Upload Delta table name along with database name, examples: `my_database.my_lmu_table`
  • **kafka** source: Product prefix followed by a colon and the Kafka topic name (`<prefix>: <topic_name>`), example: `data_product_name: my_product.my.topic`.
  • **trigger_file** source: Asset name/folder name under which trigger file will be kept, example: `my_trigger`
| User/Developer | | **sensor_read_type** | STRING | Sensor read type to fetch new event - can be batch or streaming. | User/Developer | | **asset_description** | STRING | Description of Upstream source (It can be upstream name). | User/Developer | | **upstream_key** | STRING | upstream key (an attribute name from the upstream so that new data can be detected automatically), example: `load_date`.
This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database).
**Note**: This attribute will be used in the `preprocess_query`, example: `SELECT * FROM sensor_new_data WHERE ?upstream_key >= current_date() - 7` will be rendered to `SELECT * FROM sensor_new_data WHERE load_date >= current_date() - 7` | User/Developer | | **preprocess_query** | STRING | Query to filter data returned by the upstream. **Note**: This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`.
Example: `SELECT * FROM sensor_new_data WHERE load_date >= current_date() - 7` | User/Developer | | **latest_event_fetched_timestamp** | TIMESTAMP | Latest event fetched timestamp for upstream source. It will be updated each time as soon as NEW EVENT is available. | lakehouse-engine | | **trigger_job_id** | STRING | Databricks Job Id of downstream application. Based on this, Job will get triggered by Heartbeat once new event is available. | User/Developer | | **trigger_job_name** | STRING | Databricks Job Name. | User/Developer | | **status** | STRING | Status of the orchestration.
  • `NEW_EVENT_AVAILABLE` once new event is found.
  • `IN_PROGRESS` - When job gets triggered
  • `COMPLETED` - once Job completed successfully
| lakehouse-engine | | **status_change_timestamp** | STRING | string containing the datetime when the status has changed. | lakehouse-engine | | **job_start_timestamp** | TIMESTAMP | Start timestamp of downstream Job. It will get updated as soon as Job went into `IN_PROGRESS` job_status. | lakehouse-engine | | **job_end_timestamp** | TIMESTAMP | End timestamp of downstream Job. It will get updated as soon as Job went into `COMPLETED` job_status. | lakehouse-engine | | **job_state** | STRING | Current status of Job in Control table. `PAUSED` or `UNPAUSED`. If `PAUSED`, Sensor will **not look** for NEW EVENTS or Trigger the dependent job. | User/Developer | | **dependency_flag** | STRING |
  • TRUE - For HARD dependency
  • FALSE - For SOFT dependency
For a HARD dependency, all dependent jobs need to complete successfully. Jobs marked as SOFT (FALSE) will be ignored. Default: must be TRUE in case of no dependency. | User/Developer | ### Control Table Reference Records The following table shows **example records** that demonstrate how different types of sensor sources are configured in the Heartbeat Sensor Control Table. These are **sample entries** that illustrate the structure and typical values for each column across various sensor source types (Kafka, Lakehouse Manual Upload Delta Table, SAP B4, Delta Table, and Trigger File). **Purpose of these examples:** - Show real-world configuration patterns for different sensor sources. - Demonstrate how different statuses (`NEW_EVENT_AVAILABLE`, `IN_PROGRESS`, `null`) appear in the table. - Illustrate the relationship between sensor sources and their corresponding Databricks jobs. - Provide reference values for fields like `sensor_id`, `trigger_job_id`, and status timestamps. !!! note These are illustrative examples - your actual table will contain records specific to your data sources and job configurations. 
| sensor_source | sensor_id | sensor_read_type | asset_description | upstream_key | preprocess_query | latest_event_fetched_timestamp | trigger_job_id | trigger_job_name | status | status_change_timestamp | job_start_timestamp | job_end_timestamp | job_state | dependency_flag | |-----------------|------------------------------|------------------|----------------------------------------|--------------|------------------|--------------------------------|----------------|------------------------------------------|---------------------|--------------------------|--------------------------|-------------------|-----------|-----------------| | kafka | my_product: my.topic | streaming | My product Kafka Topic | null | null | 2025-04-23T21:40:23.768Z | 111111111 | my-product-kafka_consumer_job | IN_PROGRESS | 2025-04-23T21:40:36.88Z | 2025-04-23T21:40:36.88Z | null | UNPAUSED | TRUE | | lmu_delta_table | my_database.my_lmu_table | batch | My Lakehouse Manual Upload Delta Table | date | null | 2025-04-23T21:46:07.495Z | 222222222 | my-product-lmu_table_consumer_job | IN_PROGRESS | 2025-04-23T21:46:19.4Z | 2025-04-23T21:46:19.4Z | null | UNPAUSED | TRUE | | sap_b4 | SAP_BW_CHAIN_ID_SAP_TABLE | batch | My SAP BW Chain Process | LOAD_DATE | null | 2025-04-23T21:35:10.643Z | 333333333 | my-product-sap_bw_consumer_job | IN_PROGRESS | 2025-04-23T21:35:29.248Z | 2025-04-23T21:35:29.248Z | null | UNPAUSED | TRUE | | delta_table | my_database_1.my_table | streaming | My Delta Table from My Database 1 | null | null | 2025-04-23T22:11:56.384Z | 444444444 | my-product-delta_and_sap_b4_consumer_job | NEW_EVENT_AVAILABLE | 2025-04-23T22:11:56.384Z | null | null | UNPAUSED | TRUE | | sap_b4 | SAP_4HANA_CHAIN_ID_SAP_TABLE | batch | My SAP 4HANA Chain Process | LOAD_DATE | null | null | 444444444 | my-product-delta_and_sap_b4_consumer_job | null | null | null | null | UNPAUSED | TRUE | | trigger_file | my_trigger | streaming | My Trigger File | null | null | 2025-04-23T22:07:28.668Z | 
555555555 | my-product-trigger_file_consumer_job | IN_PROGRESS | 2025-04-23T22:07:39.865Z | 2025-04-23T22:07:39.865Z | null | UNPAUSED | TRUE | ## How to Implement the Heartbeat Sensor This step-by-step guide aims to help you through setting up, configuring, and operating the Heartbeat Sensor system from initial setup to ongoing monitoring and troubleshooting. ### Phase 1: Initial Setup and Configuration #### Step 1: Define Your Data Source Configurations Create a CSV file containing your data source configurations with the following required columns: - `sensor_source`: Type of [sensor source](#control-table-schema). - `sensor_id`: Unique upstream identifier or reference. - `sensor_read_type`: How to read the sensor (batch or streaming). - `asset_description`: Description of the upstream source. - `upstream_key`: Attribute name for detecting new data automatically. - `preprocess_query`: Optional query to filter upstream data. - `trigger_job_id`: Databricks Job ID to trigger when new data is available. - `trigger_job_name`: Databricks Job Name. - `job_state`: Job control state (`UNPAUSED` or `PAUSED`). - `dependency_flag`: Dependency type (`TRUE` for hard, `FALSE` for soft). 
**Example CSV Configuration:** ```csv sensor_source,sensor_id,sensor_read_type,asset_description,upstream_key,preprocess_query,trigger_job_id,trigger_job_name,job_state,dependency_flag kafka,"my_product: my.topic",streaming,"My product Kafka Topic",,,"111111111","my-product-kafka_consumer_job",UNPAUSED,TRUE delta_table,"my_database_1.my_table",streaming,"My Delta Table from My Database 1",,,"444444444","my-product-delta_and_sap_b4_consumer_job",UNPAUSED,TRUE sap_b4,"SAP_4HANA_CHAIN_ID_SAP_TABLE",batch,"My SAP 4HANA Chain Process",LOAD_DATE,,"444444444","my-product-delta_and_sap_b4_consumer_job",UNPAUSED,TRUE ``` #### Step 2: Populate the Heartbeat Control Table Use the [Heartbeat Sensor Control Table Data Feeder](heartbeat_sensor_data_feed/heartbeat_sensor_data_feed.md) to: - Read your CSV configuration file. - Validate the configuration entries. - Ingest the data into the Heartbeat Control Table. - Establish the foundation for monitoring and orchestration. ### Phase 2: Heartbeat Sensor Operation Workflow #### Step 3: Continuous Monitoring and Event Detection The Heartbeat sensor cluster (running on a single node) performs the following operations: **3.1 Control Table Scanning** - Scans the Heartbeat Control Table for eligible records. - Filters records based on: - Supported sensor sources: `Delta Table`, `Kafka`, `SAP BW/4HANA`, `Lakehouse Manual Upload`, `Trigger file`. - Job state: `job_state = 'UNPAUSED'`. - Status conditions: `status IS NULL` or `status = 'COMPLETED'`. !!! important "Orchestration job recommendation" We recommend running multiple tasks for each sensor source type in the same Heartbeat Sensor Orchestrator and just create specific source related jobs when it's really needed, example: real time processing jobs or some complex jobs that need to be triggered as soon as the trigger condition is satisfied (all hard dependencies has `NEW_EVENT_AVAILABLE`). !!! 
note "First-Time Execution" For new sensor sources and IDs, the initial `status` will be `NULL`. This ensures that failed or paused jobs are not automatically triggered. **3.2 Source-Specific Event Detection** For each eligible record, the Heartbeat system: - Creates the appropriate Sensor ACON (configuration) based on the `sensor_source` type. - Passes the configuration to the respective Sensor Algorithm. - The sensor algorithm checks for `NEW_EVENT_AVAILABLE` status for the specific `sensor_id`. **Supported Source Types and Their Configuration:** - **[Delta Table Sources](delta_table/delta_table.md)**: Monitor delta tables for new data. - **[Kafka Sources](kafka/kafka.md)**: Monitor Kafka topics for new messages. - **[Manual Table Sources](manual_table/manual_table.md)**: Monitor manually uploaded delta tables. - **[SAP BW/B4 Sources](sap_bw_b4/sap_bw_b4.md)**: Monitor SAP systems for new process chains. - **[Trigger File Sources](trigger_file/trigger_file.md)**: Monitor file systems for trigger files. #### Step 4: Event Processing and Status Updates **4.1 New Event Detection** When a sensor detects new data: - Updates the traditional sensor table (`lakehouse_engine_sensor`) with detection details. - Returns `NEW_EVENT_AVAILABLE` status to the Heartbeat module. **4.2 Heartbeat Control Table Updates** The Heartbeat system updates the control table with: - `status` → `NEW_EVENT_AVAILABLE`. - `status_change_timestamp` → current timestamp. - `latest_event_fetched_timestamp` → timestamp when event detection started. #### Step 5: Dependency Validation and Job Triggering **5.1 Dependency Evaluation Process** Before triggering any jobs, the system evaluates dependencies: 1. **Filter Eligible Records**: Select records with `status = 'NEW_EVENT_AVAILABLE'`. 2. **Group by Job ID**: Group records by `trigger_job_id` to identify job dependencies. 3. **Evaluate Dependency Flags**: - **TRUE (Hard Dependency)**: Job must have `NEW_EVENT_AVAILABLE` status. 
- **FALSE (Soft Dependency)**: Job status is optional and doesn't block triggering. 4. **Aggregate and Validate**: Ensure all hard dependencies are satisfied before triggering. **5.2 Triggering Logic Examples** Consider Job 3 that depends on Job 1 and Job 2: - **Scenario A**: Job 1 (HARD) + Job 2 (HARD) → Both must have `NEW_EVENT_AVAILABLE`. - **Scenario B**: Job 1 (HARD) + Job 2 (SOFT) → Only Job 1 needs `NEW_EVENT_AVAILABLE`. image **5.3 Job Triggering via Databricks API** For jobs that pass dependency validation: - Trigger the corresponding `trigger_job_id` via Databricks Job Run API. - Immediately update the control table: - `status` → `IN_PROGRESS`. - `job_start_timestamp` → current timestamp. - `status_change_timestamp` → current timestamp. ### Phase 3: Job Execution and Completion #### Step 6: Databricks Job Execution Each triggered Databricks job must include: - Your primary ETL/processing tasks. - **Final Task**: [Update Heartbeat Sensor Status](update_heartbeat_sensor_status/update_heartbeat_sensor_status.md) task. #### Step 7: Job Completion Handling Upon successful job completion, the update status task: - Sets `status` → `COMPLETED`. - Updates `status_change_timestamp` → current timestamp. - Sets `job_end_timestamp` → job completion timestamp. ### Phase 4: Error Handling and Recovery #### Step 8: Job Failure Recovery Process If a Databricks job fails, follow this recovery process: 1. **Identify the Issue**: Analyze job logs and error messages. 2. **Fix the Problem**: Address the underlying cause of the failure. 3. **Manual Recovery**: Execute at least one successful manual run of the job. 4. **Automatic Resumption**: Heartbeat will resume monitoring and triggering after successful completion. !!! warning "Important Recovery Note" The Heartbeat sensor will **not** resume checking failed jobs for new events until at least one successful completion occurs. This prevents repeated triggering of failing jobs. 
#### Step 9: Monitoring and Maintenance **9.1 Regular Monitoring Tasks** - Monitor the Heartbeat Control Table for job statuses. - Check for jobs stuck in `IN_PROGRESS` status. - Verify dependency relationships are working correctly. - Review `latest_event_fetched_timestamp` for regular updates. **9.2 Control and Management** - **Pause Jobs**: Set `job_state` to `PAUSED` to temporarily stop monitoring. - **Resume Jobs**: Set `job_state` to `UNPAUSED` to resume monitoring. - **Modify Dependencies**: Update `dependency_flag` to change dependency relationships. ### Phase 5: Advanced Configuration and Optimization #### Step 10: Advanced Configuration Options **10.1 Preprocess Queries** Use `preprocess_query` to filter upstream data: ```sql -- Example: Filter only recent records SELECT * FROM sensor_new_data WHERE load_date >= current_date() - 7 ``` **10.2 Parallel Processing** The Heartbeat sensor automatically handles parallel processing of multiple sources, improving efficiency and scalability. **10.3 Pull-Based Architecture Benefits** - Downstream systems only need read access to upstream systems. - No write permissions required from upstream to downstream. - Improved security and access control. ### Troubleshooting Common Issues | Issue | Symptoms | Solution | |-----------------------------|-----------------------------------------------|--------------------------------------------------------------------| | Jobs not triggering | Status remains `NEW_EVENT_AVAILABLE` | Check dependency flags and ensure all hard dependencies are met. | | Jobs stuck in `IN_PROGRESS` | No completion status updates | Verify that jobs include the update status task as the final step. | | Failed job recovery | Jobs not resuming after fixes | Manually run the job successfully at least once. | | Missing events | `latest_event_fetched_timestamp` not updating | Check sensor source connectivity and configuration. 
| This workflow ensures reliable, automated data pipeline orchestration with robust error handling and dependency management. !!! note Also have a look at the [Sensor documentation](../sensors.md) to have a better understanding of the underlying sensor mechanisms that power the Heartbeat Sensor system. ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/heartbeat_sensor_data_feed/__init__.py ================================================ """ .. include::heartbeat_sensor_data_feed.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/heartbeat_sensor_data_feed/heartbeat_sensor_data_feed.md ================================================ # Heartbeat Sensor Control Table Data Feeder ## What is it? It's a foundational component of the Heartbeat Sensor architecture. The primary purpose is to populate and maintain the Control Table, which drives the entire heartbeat monitoring process. The Data Feeder Job is responsible for creating and updating entries in the Control Table. Each entry in the control table represents a sensor_source (e.g., SAP, Kafka, Delta) for a unique combination of `sensor_id` and `trigger_job_id`. ## Configuration required to execute heartbeat sensor data feed - **heartbeat_sensor_data_feed_path**: S3 path to the CSV file containing the heartbeat sensor control table data (e.g., `"s3://my_data_product_bucket/local_data/heartbeat_sensor/heartbeat_sensor_control_table_data.csv"`). - **heartbeat_sensor_control_table**: Database table name for the [Heartbeat sensor control table](../heartbeat.md#control-table-schema) (e.g., `"my_database.heartbeat_sensor"`). ## How it works 1. A Heartbeat Sensor data feed job in each data product needs to be created to facilitate any addition, update and deletion of entries. 2. 
Entries need to be added in CSV file format, as described in the [Heartbeat Sensor Control Table schema section](../heartbeat.md#control-table-schema). Other fields in the control table will be filled automatically at different stages of the sensor process. 3. After adding/updating/deleting any entries in CSV, the Data feeder job needs to run again to reflect the changes in the table. ## Code sample ```python from lakehouse_engine.engine import execute_heartbeat_sensor_data_feed execute_heartbeat_sensor_data_feed( heartbeat_sensor_data_feed_path="s3://my_data_product_bucket/local_data/heartbeat_sensor/heartbeat_sensor_control_table_data.csv" , heartbeat_sensor_control_table="my_database.heartbeat_sensor" ) ``` ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/kafka/__init__.py ================================================ """ .. include::kafka.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/kafka/kafka.md ================================================ # Heartbeat Sensor for Kafka This shows how to create a Heartbeat Sensor Orchestrator to detect new data from Kafka and trigger Databricks Workflows related to them. ## Configuration required to create an orchestration task for the kafka source - **sensor_source**: Set to `kafka` in the Heartbeat Control Table to identify this as a Kafka source. - **data_format**: Set to `kafka` to specify the data format for reading Kafka streams. - **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`). - **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`). - **options**: Configuration options for Kafka reading: - `readChangeFeed`: Set to `"true"` to enable change data feed reading. 
- **kafka_configs**: Kafka connection and security configurations: - `kafka_bootstrap_servers_list`: Kafka server endpoints. - `kafka_ssl_truststore_location`: Path to SSL truststore. - `truststore_pwd_secret_key`: Secret key for truststore password. - `kafka_ssl_keystore_location`: Path to SSL keystore. - `keystore_pwd_secret_key`: Secret key for keystore password. - **kafka_secret_scope**: Databricks secret scope for Kafka credentials. - **base_checkpoint_location**: S3 path for storing checkpoint data (required if `sensor_read_type` is `streaming`). - **domain**: Databricks workflows domain for job triggering. - **token**: Databricks workflows token for authentication. ### Kafka Data Feed CSV Configuration Entry To check how the entry for a Kafka source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#control-table-reference-records). **Additional Requirements for Kafka**: The `sensor_id` follows a specific naming convention because you can have multiple data products using the same configuration file with different Kafka configuration values: - The value for the `sensor_id` will be the Kafka Topic name starting with `<product_name>:` or any other prefix, example: `my_product: my.topic`. - How does it work? → Heartbeat receives a dictionary containing all kafka configurations by product, which is passed as `kafka_configs` in the ACON. Then it segregates the config based on `sensor_id` value present in the heartbeat control table. Heartbeat will split the `sensor_id` based on colon (:) and the first part of it will be considered as product name (in our case, `my_product`) and the second part of the split string will be the Kafka topic name (in our case, `my.topic`). Finally, **it will make use of the product related kafka config from the `kafka_configs`**. 
## Code sample of listener and trigger ```python from lakehouse_engine.engine import ( execute_sensor_heartbeat, trigger_heartbeat_sensor_jobs, ) # Kafka configurations for the product, we strongly recommend to read these values from a external configuration file. kafka_configs = { "my_product": { "kafka_bootstrap_servers_list": "KAFKA_SERVER", "kafka_ssl_truststore_location": "TRUSTSTORE_LOCATION", "truststore_pwd_secret_key": "TRUSTSTORE_PWD", "kafka_ssl_keystore_location": "KEYSTORE_LOCATION", "keystore_pwd_secret_key": "KEYSTORE_PWD" } } # Create an ACON dictionary for all kafka source entries. # This ACON dictionary is useful for passing parameters to heartbeat sensors. heartbeat_sensor_config_acon = { "sensor_source": "kafka", "data_format": "kafka", "heartbeat_sensor_db_table": "my_database.heartbeat_sensor", "lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors", "options": { "readChangeFeed": "true", }, "kafka_configs": kafka_configs, "kafka_secret_scope": "DB_SECRET_SCOPE", "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "domain": "DATABRICKS_WORKFLOWS_DOMAIN", "token": "DATABRICKS_WORKFLOWS_TOKEN", } # Execute Heartbeat sensor and trigger jobs which have acquired new data. execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon) trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/manual_table/__init__.py ================================================ """ .. include::manual_table.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/manual_table/manual_table.md ================================================ # Heartbeat Sensor for Manual Table This shows how to create a Heartbeat Sensor Orchestrator to detect new data from a Manual Table and trigger Databricks Workflows related to them. 
**Manual Tables (Lakehouse Manual Upload)** are different from regular Delta tables because: - **Data Upload Pattern**: Instead of continuous streaming or scheduled batch loads, data is manually uploaded by users at irregular intervals. - **Detection Challenge**: Unlike regular Delta tables with change data feeds or append operations, manual tables are typically overwritten completely, making it harder to detect new data using standard mechanisms. - **Custom Detection Logic**: Requires a special `upstream_key` (usually a timestamp column) to track when the table was last updated, since the table structure and most content may remain the same between uploads. - **Sensor Source Type**: Uses `lmu_delta_table` instead of `delta_table` to indicate this special handling requirement. ## Configuration required to create an orchestration task for the manual table source - **sensor_source**: Set to `lmu_delta_table` in the Heartbeat Control Table to identify this as a Lakehouse Manual Upload Delta table source. - **data_format**: Set to `delta` to specify the data format for reading Delta tables. - **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`). - **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`). - **domain**: Databricks workflows domain for job triggering. - **token**: Databricks workflows token for authentication. ### Manual Tables Data Feed CSV Configuration Entry To check how the entry for a manual table source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#heartbeat-sensor-control-table-reference-records). **Additional Requirements for Manual Tables**: - **sensor_id**: Needs to be filled with the Lakehouse Manual Upload Delta table name along with database, e.g., `my_database.my_manual_table`. 
- **upstream_key**: Must specify the table date/timestamp column (typically named `date`) which indicates when the Lakehouse Manual Upload table was last overwritten. This is crucial for detecting new manual uploads. **Setup Requirements**: - A column named **`date`** must be added to your Lakehouse Manual Upload source Delta table. - This column should contain a timestamp value in **YYYYMMDDHHMMSS** format. - The value should be updated to `current_timestamp()` whenever new data is uploaded. - This timestamp serves as the "fingerprint" that the sensor uses to detect new uploads. !!! note **`date` column requirement (any other column name serving the same purpose can be used, as long as it is defined in the `upstream_key` CSV configuration entry)**: Since manual tables are typically overwritten entirely during each upload, standard Delta table change detection mechanisms won't work. The Heartbeat sensor needs a reliable way to determine if new data has been uploaded since the last check. ## Code sample of listener and trigger ```python from lakehouse_engine.engine import ( execute_sensor_heartbeat, trigger_heartbeat_sensor_jobs, ) # Create an ACON dictionary for all manual table source entries. # This ACON dictionary is useful for passing parameters to heartbeat sensors. heartbeat_sensor_config_acon = { "sensor_source": "lmu_delta_table", "data_format": "delta", "heartbeat_sensor_db_table": "my_database.heartbeat_sensor", "lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors", "domain": "DATABRICKS_WORKFLOWS_DOMAIN", "token": "DATABRICKS_WORKFLOWS_TOKEN", } # Execute Heartbeat sensor and trigger jobs which have acquired new data. execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon) trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/sap_bw_b4/__init__.py ================================================ """ .. 
include::sap_bw_b4.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/sap_bw_b4/sap_bw_b4.md ================================================ # Heartbeat Sensor for SAP BW/B4 This shows how to create a Heartbeat Sensor Orchestrator to detect new data from SAP BW/B4 and trigger Databricks Workflows related to them. ## Configuration required to create an orchestration task for the SAP BW/B4 source - **sensor_source**: Set to `sap_b4` or `sap_bw` in the Heartbeat Control Table to identify this as a SAP source. - **data_format**: Set to `jdbc` to specify the data format for reading from SAP via JDBC connection. - **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`). - **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`). - **options**: JDBC connection configuration: - `compress`: Set to `true` to enable compression. - `driver`: JDBC driver class name. - `url`: JDBC connection URL. - `user`: JDBC username for authentication. - `password`: JDBC password for authentication. - **jdbc_db_table**: SAP logchain table name to query for process chain status. - **domain**: Databricks workflows domain for job triggering. - **token**: Databricks workflows token for authentication. ### SAP BW/B4 Data Feed CSV Configuration Entry To check how the entry for a SAP BW/B4 source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#heartbeat-sensor-control-table-reference-records). **Additional Requirements for SAP BW/4HANA**: - The `sensor_id` needs to be filled with the Process Chain Name of the SAP object. - `sensor_read_type` needs to be `batch` for SAP. 
## Code sample of listener and trigger ```python from lakehouse_engine.engine import ( execute_sensor_heartbeat, trigger_heartbeat_sensor_jobs, ) # Create an ACON dictionary for all SAP BW/B4 source entries. # This ACON dictionary is useful for passing parameters to heartbeat sensors. heartbeat_sensor_config_acon = { "sensor_source": "sap_b4|sap_bw", # use sap_b4 or sap_bw, depending on the source you are reading from "data_format": "jdbc", "heartbeat_sensor_db_table": "my_database.heartbeat_sensor", "lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors", "options": { "compress": True, "driver": "JDBC_DRIVER", "url": "JDBC_URL", "user": "JDBC_USERNAME", "password": "JDBC_PSWD", }, "jdbc_db_table": "SAP_LOGCHAIN_TABLE", "domain": "DATABRICKS_WORKFLOWS_DOMAIN", "token": "DATABRICKS_WORKFLOWS_TOKEN", } # Execute Heartbeat sensor and trigger jobs which have acquired new data. execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon) trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/trigger_file/__init__.py ================================================ """ .. include::trigger_file.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/trigger_file/trigger_file.md ================================================ # Heartbeat Sensor for Trigger Files This shows how to create a Heartbeat Sensor Orchestrator to detect new data from Trigger Files and trigger Databricks Workflows related to them. ## Generating the trigger file A task must be created in the upstream pipeline to generate a trigger file, indicating that the upstream source has completed and the dependent job can be triggered. The `sensor_id` used to generate the file must match the `sensor_id` specified in the heartbeat control table. 
Check here the [code example](#creation-of-the-trigger-file-following-the-sensorid-standard-code-example) of how to generate the trigger file. #### Creation of the trigger file following the `sensor_id` standard code example: ```python import datetime sensor_id = "my_trigger" file_root_path = "s3://my_data_product_bucket/triggers" file_name = f"{sensor_id}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.txt" file_path = "/".join([file_root_path, sensor_id, file_name]) ### Write Trigger File to S3 location using dbutils output = dbutils.fs.put(file_path, "Success") ``` ## Configuration required to create an orchestration task for the trigger file source - **sensor_source**: Set to `trigger_file` in the Heartbeat Control Table to identify this as a trigger file source. - **data_format**: Set to `cloudfiles` to enable Spark Auto Loader functionality for monitoring trigger files. This format allows the system to automatically detect when new trigger files are available at the specified location and trigger the [corresponding `trigger_job_id`](../heartbeat.md#control-table-schema). - **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`). - **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`). - **options**: Cloud files configuration: - `cloudFiles.format`: Set to `"csv"` to specify the file format. - **schema_dict**: Schema definition for the trigger files: - Defines the structure with fields like `file_name` (string) and `file_modification_time` (timestamp). - **base_checkpoint_location**: S3 path for storing checkpoint data (required if `sensor_read_type` is `streaming`). - **base_trigger_file_location**: S3 path where trigger files are located. - **domain**: Databricks workflows domain for job triggering. - **token**: Databricks workflows token for authentication. 
### Trigger File Data Feed CSV Configuration Entry To check how the entry for a trigger file source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#heartbeat-sensor-control-table-reference-records). **Additional Requirements for Trigger File**: - The `sensor_id` will match the name used to create the trigger file. For example, if the trigger file is named `my_trigger_YYYYMMDDHHMMSS.txt`, then the sensor_id will be `my_trigger`. ## Code sample of listener and trigger ```python from lakehouse_engine.engine import ( execute_sensor_heartbeat, trigger_heartbeat_sensor_jobs, ) # Create an ACON dictionary for all trigger file source entries. # This ACON dictionary is useful for passing parameters to heartbeat sensors. heartbeat_sensor_config_acon = { "sensor_source": "trigger_file", "data_format": "cloudfiles", "heartbeat_sensor_db_table": "my_database.heartbeat_sensor", "lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors", "options": { "cloudFiles.format": "csv", }, "schema_dict": { "type": "struct", "fields": [ { "name": "file_name", "type": "string", }, { "name": "file_modification_time", "type": "timestamp", }, ], }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "base_trigger_file_location": "s3://my_data_product_bucket/triggers", "domain": "DATABRICKS_WORKFLOWS_DOMAIN", "token": "DATABRICKS_WORKFLOWS_TOKEN", } # Execute Heartbeat sensor and trigger jobs which have acquired new data. execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon) trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/update_heartbeat_sensor_status/__init__.py ================================================ """ .. 
include::update_heartbeat_sensor_status.md """ ================================================ FILE: lakehouse_engine_usage/sensors/heartbeat/update_heartbeat_sensor_status/update_heartbeat_sensor_status.md ================================================ # Update Heartbeat Sensor control delta table after processing the data This shows how to update the status of your Heartbeat Sensor after executing the pipeline. The `update_heartbeat_sensor_status` function is **critical for the Heartbeat Sensor lifecycle** because: - **Completes the monitoring cycle**: When a Heartbeat sensor triggers a job, it sets the status to `IN_PROGRESS`. Without this update, the sensor would never know the job completed successfully. - **Enables continuous monitoring**: Only after a job is marked as `COMPLETED` will the Heartbeat sensor resume monitoring that source for new events. - **Prevents stuck jobs**: Without proper status updates, failed jobs remain in `IN_PROGRESS` status indefinitely, blocking future job triggers. - **Supports recovery process**: This is essential for the [Job Failure Recovery Process](../heartbeat.md#heartbeat-sensor-workflow-explanation) described in the main Heartbeat documentation, where at least one successful run must be completed before the sensor resumes monitoring. !!! note **When to use**: This function must be called as the **final task** in every Databricks job that is orchestrated by the Heartbeat Sensor to properly update the `status` to `COMPLETED` and record the job completion timestamp. ## Configuration required to update heartbeat sensor status - **job_id**: The unique identifier of the Databricks job that was triggered by the Heartbeat sensor (e.g., `"MY_JOB_ID"`). - **heartbeat_sensor_control_table**: Database table name for the Heartbeat sensor control table (e.g., `"my_database.heartbeat_sensor"`). - **sensor_table**: Database table name for the lakehouse engine sensors table (e.g., `"my_database.lakehouse_engine_sensors"`). 
## Code sample Code sample on how to update the status of your sensor in the Heartbeat Sensors Control Table: ```python from lakehouse_engine.engine import update_heartbeat_sensor_status update_heartbeat_sensor_status( job_id="MY_JOB_ID", heartbeat_sensor_control_table="my_database.heartbeat_sensor", sensor_table="my_database.lakehouse_engine_sensors", ) ``` If you want to know more please visit the definition of the class [here](../../../../reference/packages/core/definitions.md#packages.core.definitions.HeartbeatConfigSpec). ================================================ FILE: lakehouse_engine_usage/sensors/sensor/__init__.py ================================================ """ .. include::sensor.md """ ================================================ FILE: lakehouse_engine_usage/sensors/sensor/delta_table/__init__.py ================================================ """ .. include::delta_table.md """ ================================================ FILE: lakehouse_engine_usage/sensors/sensor/delta_table/delta_table.md ================================================ # Sensor from Delta Table This shows how to create a **Sensor to detect new data from a Delta Table**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. - **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. - **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. 
If you want to view some examples of usage you can visit the [delta upstream sensor table](../delta_upstream_sensor_table/delta_upstream_sensor_table.md) or the [jdbc sensor](../jdbc_table/jdbc_table.md). - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and **SUGGESTED** behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Data will be consumed from a delta table in streaming mode, so if there is any new data it will give condition to proceed to the next task. ### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "streaming", "data_format": "delta", "db_table": "upstream_database.source_delta_table", "options": { "readChangeFeed": "true", # to read changes in upstream table }, }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. 
```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/sensor/delta_upstream_sensor_table/__init__.py ================================================ """ .. include::delta_upstream_sensor_table.md """ ================================================ FILE: lakehouse_engine_usage/sensors/sensor/delta_upstream_sensor_table/delta_upstream_sensor_table.md ================================================ # Sensor from other Sensor Delta Table This shows how to create a **Sensor to detect new data from another Sensor Delta Table**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. - **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. - **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. 
[The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) It makes use of `generate_sensor_query` to generate the `preprocess_query`, different from [delta_table](../delta_table/delta_table.md). Data from other sensor delta table, in streaming mode, will be consumed. If there is any new data it will trigger the condition to proceed to the next task. ### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor, generate_sensor_query acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "streaming", "data_format": "delta", "db_table": "upstream_database.lakehouse_engine_sensors", "options": { "readChangeFeed": "true", }, }, "preprocess_query": generate_sensor_query("UPSTREAM_SENSOR_ID"), "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/sensor/file/__init__.py ================================================ """ .. include::file.md """ ================================================ FILE: lakehouse_engine_usage/sensors/sensor/file/file.md ================================================ # Sensor from Files This shows how to create a **Sensor to detect new data from a File Location**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. 
- **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. - **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Using these sensors and consuming the data in streaming mode, if any new file is added to the file location, it will automatically trigger the proceeding task. 
### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "streaming", "data_format": "csv", # You can use any of the data formats supported by the lakehouse engine, e.g: "avro|json|parquet|csv|delta|cloudfiles" "location": "s3://my_data_product_bucket/path", }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/sensor/jdbc_table/__init__.py ================================================ """ .. include::jdbc_table.md """ ================================================ FILE: lakehouse_engine_usage/sensors/sensor/jdbc_table/jdbc_table.md ================================================ # Sensor from JDBC This shows how to create a **Sensor to detect new data from a JDBC table**. ## Configuration required to have a Sensor - **jdbc_args**: Arguments of the JDBC upstream. - **generate_sensor_query**: Generates a Sensor query to consume data from the upstream, this function can be used on `preprocess_query` ACON option. - **sensor_id**: The unique identifier for the Sensor. - **filter_exp**: Expression to filter incoming new data. 
A placeholder `?upstream_key` and `?upstream_value` can be used, example: `?upstream_key > ?upstream_value` so that it can be replaced by the respective values from the sensor `control_db_table_name` for this specific sensor_id. - **control_db_table_name**: Sensor control table name. - **upstream_key**: the key of custom sensor information to control how to identify new data from the upstream (e.g., a time column in the upstream). - **upstream_value**: the **first** upstream value to identify new data from the upstream (e.g., the value of a time present in the upstream). ***Note:*** This parameter will have effect just in the first run to detect if the upstream have new data. If it's empty the default value applied is `-2147483647`. - **upstream_table_name**: Table name to consume the upstream value. If it's empty the default value applied is `sensor_new_data`. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [Generic JDBC template with `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [Generic JDBC template with `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Data from JDBC, in batch mode, will be consumed. If there is new data based in the preprocess query from the source table, it will trigger the condition to proceed to the next task. 
### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor, generate_sensor_query acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "JDBC_URL", "table": "JDBC_DB_TABLE", "properties": { "user": "JDBC_USERNAME", "password": "JDBC_PWD", "driver": "JDBC_DRIVER", }, }, "options": { "compress": True, }, }, "preprocess_query": generate_sensor_query( sensor_id="MY_SENSOR_ID", filter_exp="?upstream_key > '?upstream_value'", control_db_table_name="my_database.lakehouse_engine_sensors", upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA", ), "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/sensor/kafka/__init__.py ================================================ """ .. include::kafka.md """ ================================================ FILE: lakehouse_engine_usage/sensors/sensor/kafka/kafka.md ================================================ # Sensor from Kafka This shows how to create a **Sensor to detect new data from Kafka**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. 
- **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. - **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Data from Kafka, in streaming mode, will be consumed, so if there is any new data in the kafka topic it will give condition to proceed to the next task. 
### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "streaming", "data_format": "kafka", "options": { "kafka.bootstrap.servers": "KAFKA_SERVER", "subscribe": "KAFKA_TOPIC", "startingOffsets": "earliest", "kafka.security.protocol": "SSL", "kafka.ssl.truststore.location": "TRUSTSTORE_LOCATION", "kafka.ssl.truststore.password": "TRUSTSTORE_PWD", "kafka.ssl.keystore.location": "KEYSTORE_LOCATION", "kafka.ssl.keystore.password": "KEYSTORE_PWD", }, }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/sensor/sap_bw_b4/__init__.py ================================================ """ .. include::sap_bw_b4.md """ ================================================ FILE: lakehouse_engine_usage/sensors/sensor/sap_bw_b4/sap_bw_b4.md ================================================ # Sensor from SAP This shows how to create a **Sensor to detect new data from a SAP LOGCHAIN table**. ## Configuration required to have a Sensor - **sensor_id**: A unique identifier of the sensor in a specific job. - **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`. 
- **control_db_table_name**: Name of the sensor control table. - **input_spec**: Input spec with the upstream source. - **preprocess_query**: Query to filter data returned by the upstream. !!! note This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. - **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data. - **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when there is no new data detected from upstream. Specific configuration required to have a Sensor consuming a SAP BW/B4 upstream. The Lakehouse Engine provides two utility functions to make easier to consume SAP as upstream: `generate_sensor_sap_logchain_query` and `generate_sensor_query`. - **generate_sensor_sap_logchain_query**: This function aims to create a temporary table with timestamp from the SAP LOGCHAIN table, which is a process control table. !!! note this temporary table only lives during runtime, and it is related with the sap process control table but has no relationship or effect on the sensor control table. - **chain_id**: SAP Chain ID process. - **dbtable**: SAP LOGCHAIN db table name, default: `my_database.RSPCLOGCHAIN`. - **status**: SAP Chain Status of your process, default: `G`. - **engine_table_name**: Name of the temporary table created from the upstream data, default: `sensor_new_data`. This temporary table will be used as source in the `query` option. - **generate_sensor_query**: Generates a Sensor query to consume data from the temporary table created in the `prepareQuery`. - **sensor_id**: The unique identifier for the Sensor. - **filter_exp**: Expression to filter incoming new data. 
A placeholder `?upstream_key` and `?upstream_value` can be used, example: `?upstream_key > ?upstream_value` so that it can be replaced by the respective values from the sensor `control_db_table_name` for this specific sensor_id. - **control_db_table_name**: Sensor control table name. - **upstream_key**: the key of custom sensor information to control how to identify new data from the upstream (e.g., a time column in the upstream). - **upstream_value**: the **first** upstream value to identify new data from the upstream (e.g., the value of a time present in the upstream). .. note:: This parameter will have effect just in the first run to detect if the upstream has new data. If it's empty the default value applied is `-2147483647`. - **upstream_table_name**: Table name to consume the upstream value. If it's empty the default value applied is `sensor_new_data`. .. note:: In case of using the `generate_sensor_sap_logchain_query` the default value for the temp table is `sensor_new_data`, so if passing a different value in the `engine_table_name` this parameter should have the same value. If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). ## Scenarios This covers the following scenarios of using the Sensor: 1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested) 2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false) Data from SAP, in batch mode, will be consumed, so if there is any new data in the SAP LOGCHAIN table it will give condition to proceed to the next task. 
### `fail_on_empty_result` as True (default and SUGGESTED) ```python from lakehouse_engine.engine import execute_sensor, generate_sensor_query, generate_sensor_sap_logchain_query acon = { "sensor_id": "MY_SENSOR_ID", "assets": ["MY_SENSOR_ASSETS"], "control_db_table_name": "my_database.lakehouse_engine_sensors", "input_spec": { "spec_id": "sensor_upstream", "read_type": "batch", "data_format": "jdbc", "options": { "compress": True, "driver": "JDBC_DRIVER", "url": "JDBC_URL", "user": "JDBC_USERNAME", "password": "JDBC_PWD", "prepareQuery": generate_sensor_sap_logchain_query(chain_id="CHAIN_ID", dbtable="JDBC_DB_TABLE"), "query": generate_sensor_query( sensor_id="MY_SENSOR_ID", filter_exp="?upstream_key > '?upstream_value'", control_db_table_name="my_database.lakehouse_engine_sensors", upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA", ), }, }, "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints", "fail_on_empty_result": True, } execute_sensor(acon=acon) ``` ### `fail_on_empty_result` as False Using `fail_on_empty_result=False`, in which the `execute_sensor` function returns a `boolean` representing if it has acquired new data. This value can be used to execute or not the next steps. ```python from lakehouse_engine.engine import execute_sensor acon = { [...], "fail_on_empty_result": False } acquired_data = execute_sensor(acon=acon) ``` ================================================ FILE: lakehouse_engine_usage/sensors/sensor/sensor.md ================================================ # Sensor ## What is it? The lakehouse engine sensors are an abstraction to otherwise complex spark code that can be executed in very small single-node clusters to check if an upstream system or data product contains new data since the last execution of our job. 
With this feature, we can trigger a job to run in more frequent intervals and if the upstream does not contain new data, then the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction, Transformation, and Loading). ## How do Sensor-based jobs work? image With the sensors capability, data products in the lakehouse can sense if another data product or an upstream system (source system) have new data since the last successful job. We accomplish this through the approach illustrated above, which can be interpreted as follows: 1. A Data Product can check if Kafka, JDBC or any other Lakehouse Engine Sensors supported sources, contains new data using the respective sensors; 2. The Sensor task may run in a very tiny single-node cluster to ensure cost efficiency ([check sensor cost efficiency](#are-sensor-based-jobs-cost-efficient)); 3. If the sensor has recognised that there is new data in the upstream, then you can start a different ETL Job Cluster to process all the ETL tasks (data processing tasks). 4. In the same way, a different Data Product can sense if an upstream Data Product has new data by using 1 of 2 options: 1. **(Preferred)** Sense the upstream Data Product sensor control delta table; 2. Sense the upstream Data Product data files in s3 (files sensor) or any of their delta tables (delta table sensor); ## The Structure and Relevance of the Sensors Control Table The concept of the lakehouse-engine sensor is based on a special delta table stored inside the data product that chooses to opt in for a sensor-based job. That table is used to control the status of the various sensors implemented by that data product. 
You can refer to the below table to understand the sensor delta table structure: | Column Name | Type | Description | |-----------------------------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **sensor_id** | STRING | A unique identifier of the sensor in a specific job. This unique identifier is really important because it is used by the engine to identify if there is new data in the upstream.
Each sensor in each job should have a different sensor_id.
If you attempt to create 2 sensors with the same sensor_id, the engine will fail. | | **assets** | ARRAY | A list of assets (e.g., tables or dataset folder) that are considered as available to consume downstream after the sensor has status *PROCESSED_NEW_DATA*. | | **status** | STRING | Status of the sensor. Can either be:
  • *ACQUIRED_NEW_DATA* – when the sensor in a job has recognised that there is new data from the upstream, but the job where the sensor is located has not yet been successfully executed.
  • *PROCESSED_NEW_DATA* - when the job where the sensor is located has processed all the tasks in that job.
| | **status_change_timestamp** | STRING | Timestamp when the status has changed for the last time. | | **checkpoint_location** | STRING | Base location of the Spark streaming checkpoint location, when applicable (i.e., when the type of sensor uses Spark streaming checkpoints to identify if the upstream has new data). E.g. Spark streaming checkpoints are used for Kafka, Delta and File sensors. | | **upstream_key** | STRING | Upstream key (e.g., used to store an attribute name from the upstream so that new data can be detected automatically).
This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database). | | **upstream_value** | STRING | Upstream value (e.g., used to store the max attribute value from the upstream so that new data can be detected automatically). This is the value for upstream_key.
This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the value of a field in the JDBC upstream that contains the maximum value that was processed by the sensor, and is therefore useful for recognizing that there is new data in the upstream (e.g., the value of a timestamp attribute in the upstream that tells us when the record was loaded into the database). | !!! note "Control Table Requirements" **Sensors**: You need to add this control table to your data product to use sensors. **Heartbeat Sensor**: Uses the Sensor control table and a different heartbeat control table structure. For Heartbeat Sensor implementation, refer to the [Heartbeat Sensor Control Table structure](heartbeat/heartbeat.md#heartbeat-sensor-control-table-reference-records). ## How is it Different from Scheduled Jobs? Both sensor-based jobs and Heartbeat Sensor jobs are still scheduled, but they can be scheduled with higher frequency because they are more cost-efficient than spinning up multi-node clusters for heavy ETL operations, only to discover that the upstream doesn't have new data. Each job includes a sensor task that checks for new data before proceeding with ETL tasks. If no new data is found, the job exits early without consuming additional resources. ## Are Sensor-based Jobs Cost-Efficient? Yes, for the same schedule (e.g., 4 times a day), sensor-based jobs are significantly more cost-efficient than scheduling regular jobs because: 1. **Minimal Resource Usage**: Sensor tasks run on very small single-node clusters 2. **Conditional Processing**: Larger ETL clusters are only spun up when new data is available 3. **Early Exit Strategy**: Jobs exit early if no new data is detected, saving compute costs 4. 
**Optimized Scheduling**: You can schedule sensor checks more frequently without proportional cost increases For demanding SLAs, you can implement alternative architectures with continuous (always-running) sensor clusters that trigger respective data processing jobs whenever new data becomes available. ## Sensor Steps 1. Create your sensor task for the upstream source. Examples of available sources: - [Delta Table](delta_table/delta_table.md) - [Delta Upstream Sensor Table](delta_upstream_sensor_table/delta_upstream_sensor_table.md) - [File](file/file.md) - [JDBC](jdbc_table/jdbc_table.md) - [Kafka](kafka/kafka.md) - [SAP BW/B4](sap_bw_b4/sap_bw_b4.md) 2. Setup/Execute your ETL task based in the Sensor Condition 3. Update the Sensor Control table status with the [Update Sensor Status](update_sensor_status/update_sensor_status.md) ================================================ FILE: lakehouse_engine_usage/sensors/sensor/update_sensor_status/__init__.py ================================================ """ .. include::update_sensor_status.md """ ================================================ FILE: lakehouse_engine_usage/sensors/sensor/update_sensor_status/update_sensor_status.md ================================================ # Update Sensor control delta table after processing the data This shows how to **update the status of your Sensor after processing the new data**. Here is an example on how to update the status of your sensor in the Sensors Control Table: ```python from lakehouse_engine.engine import update_sensor_status update_sensor_status( sensor_id="MY_SENSOR_ID", control_db_table_name="my_database.lakehouse_engine_sensors", status="PROCESSED_NEW_DATA", assets=["MY_SENSOR_ASSETS"] ) ``` If you want to know more please visit the definition of the class [here](../../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec). 
================================================ FILE: lakehouse_engine_usage/sensors/sensors.md ================================================ # Sensors ## What is it? The lakehouse engine provides two complementary sensor solutions for monitoring upstream systems and detecting new data: ### 1. Sensor Traditional lakehouse engine sensors are abstractions that simplify complex Spark code, allowing you to check if an upstream system or data product contains new data since the last job execution. These sensors run in very small single-node clusters to ensure cost efficiency. If the upstream contains new data, the sensor triggers the rest of the job; otherwise, the job exits without spinning up larger clusters for intensive ETL operations. **Key Characteristics:** - Individual sensor configuration for each data source within jobs. - Manual job execution after sensor detection or the need of adding a Sensor task in the beginning of the pipeline. - Single-source monitoring capability per task, coupling it directly with the source and not with a source type. - Built-in cost optimization through minimal cluster usage. ### 2. Heartbeat Sensor The Heartbeat Sensor is a robust, centralized orchestration system that enhances the Sensor infrastructure. It provides automated event detection, efficient multiple sources parallelism detection, and seamless integration with downstream workflows. Unlike Sensors that require individual configuration for each data source, the Heartbeat Sensor manages multiple sources through a single control table and automatically triggers Databricks jobs when new data is detected. 
**Key Characteristics:** - Centralized control table for managing all Sensor sources - Automatic Databricks job triggering via Job Run API - Multi-source support with dependency management - Built-in hard/soft dependency validation - Comprehensive status tracking and lifecycle management ## When to Use Each Solution | Aspect | Sensor | Heartbeat Sensor | |---------------------------|----------------------------------------------------------|------------------------------------------------------------| | **Use Case** | Simple, single-source monitoring within individual jobs. | Complex, multi-source orchestration with job dependencies. | | **Configuration** | Individual sensor setup per job. | Centralized control table configuration. | | **Job Triggering** | Manual job execution after sensor detection. | Automatic Databricks job triggering via Job API. | | **Dependency Management** | Not supported. | Built-in hard/soft dependency validation. | | **Scalability** | Limited to individual sensors. | Highly scalable with parallel source type processing. | | **Management Overhead** | Higher (individual configurations). | Lower (centralized management). | | **Best For** | Single data product monitoring. | Enterprise-level orchestration. | ### Decision Guide **Choose Sensors when:** - You need simple monitoring for a single data source. - Your workflow involves manual job execution or you are willing to update your pipeline to have a Sensor task at the beginning. - You have straightforward ETL pipelines without complex dependencies. - You prefer embedded sensor logic within individual jobs. - Your data pipeline is relatively straightforward. **Choose Heartbeat Sensor when:** - You need to orchestrate multiple data sources and dependencies. - You want automated job triggering without manual intervention. - You require centralized monitoring and management. - You need to handle complex multi-source workflows at enterprise scale. 
- You require enterprise-level orchestration capabilities. - You need centralized monitoring and status management. Both solutions can coexist in the same environment, allowing you to choose the appropriate sensor type based on specific use case requirements. ## How do Sensor-based Jobs Work? With sensors, data products in the lakehouse can detect if another data product or upstream system contains new data since the last successful job execution. The workflow is as follows: 1. **Data Detection**: A data product checks if Kafka, JDBC, or any other supported Sensor source contains new data using the respective sensors. 2. **Cost-Efficient Execution**: The Sensor task runs in a very small single-node cluster to ensure cost efficiency. 3. **Conditional Processing**: If the Sensor detects new data in the upstream, you can start a different ETL job cluster to process all ETL tasks (data processing tasks). 4. **Cross-Product Sensing**: Different data products can sense if upstream data products have new data using: - **(Preferred)** Sensing the upstream data product's Sensor control delta table. - Sensing the upstream data product's data files in S3 (files sensor) or delta tables (delta table sensor). For detailed information about Sensor implementation, configuration, and usage, see the [Sensor documentation](sensor/sensor.md). ## How do Heartbeat Sensor Jobs Work? The Heartbeat Sensor approach uses a centralized sensor cluster running on a single node that continuously checks for new events from different sensor sources mentioned in the Heartbeat sensor control table. When a new event is available from a sensor source, it automatically triggers the corresponding job via the Databricks Job Run API using a pull-based approach. **Workflow Process:** 1. **Continuous Monitoring**: The heartbeat cluster continuously polls various sensor sources. 2. **Event Detection**: Checks for `NEW_EVENT_AVAILABLE` status from configured sources. 3. 
**Dependency Validation**: Validates hard/soft dependencies before job triggering. 4. **Automatic Triggering**: Automatically triggers dependent Databricks jobs. 5. **Status Management**: Updates job status throughout the lifecycle. **Key Advantages:** - **Centralized Control**: Single control table manages all sensor sources and dependencies. - **Automated Orchestration**: No manual intervention required for job triggering. - **Multi-Source Support**: Handles diverse source types (SAP, Kafka, Delta Tables, Manual Uploads, Trigger Files) in one unified system. - **Dependency Management**: Built-in validation prevents premature job execution. - **Status Tracking**: Comprehensive lifecycle tracking from detection to job completion. For detailed information about Heartbeat Sensor implementation, configuration, and usage, see the [Heartbeat Sensor documentation](heartbeat/heartbeat.md). ================================================ FILE: pyproject.toml ================================================ [build-system] requires = [ "setuptools==74.*" ] build-backend = "setuptools.build_meta" [project] name = "lakehouse-engine" requires-python = ">=3.12" readme = "README.md" license = {file = "LICENSE.txt"} version = "2.0.0" authors = [{name = "Adidas Lakehouse Foundations Team", email = "software.engineering@adidas.com"}] description = "A configuration-driven Spark framework serving as the engine for several lakehouse algorithms and data flows." 
keywords = ["framework", "big-data", "spark", "databricks", "data-quality", "data-engineering", "great-expectations", "lakehouse", "delta-lake", "configuration-driver"] classifiers = [ "Development Status :: 5 - Production/Stable", "Programming Language :: Python :: 3", "Intended Audience :: Developers", "Intended Audience :: Science/Research", "Intended Audience :: Other Audience", "Operating System :: OS Independent", "Topic :: Scientific/Engineering", "Topic :: Software Development", "License :: OSI Approved :: Apache Software License" ] dynamic = ["dependencies", "optional-dependencies"] [project.urls] Repository = "https://github.com/adidas/lakehouse-engine" Documentation = "https://adidas.github.io/lakehouse-engine-docs/index.html" Issues = "https://github.com/adidas/lakehouse-engine/issues" Releases = "https://github.com/adidas/lakehouse-engine/releases" [tool.setuptools.dynamic] dependencies = { file = ["cicd/requirements.lock"] } optional-dependencies.os = { file = ["cicd/requirements_os.lock"] } optional-dependencies.azure = { file = ["cicd/requirements_azure.lock"] } optional-dependencies.dq = { file = ["cicd/requirements_dq.lock"] } optional-dependencies.sftp = { file = ["cicd/requirements_sftp.lock"] } optional-dependencies.sharepoint = { file = ["cicd/requirements_sharepoint.lock"] } [tool.setuptools.packages.find] exclude = ["tests*", "lakehouse_engine_usage*"] namespaces = false [tool.setuptools.package-data] lakehouse_engine = ["configs/engine.yaml"] [tool.isort] profile = "black" [tool.mypy] warn_return_any = true warn_unused_configs = true ignore_missing_imports = false strict_optional = false disallow_untyped_defs = true [[tool.mypy.overrides]] module = [ "delta.*", "pyspark.*", "py4j.*", "great_expectations.*", "pandas.*", "IPython.*", "nest_asyncio.*", "msgraph.*", "importlib.*", "yaml.*", "ruamel.*", "msal.*", "dbruntime.databricks_repl_context.*" ] ignore_missing_imports = true [tool.pytest.ini_options] testpaths = [ "tests" ] filterwarnings 
= [ # coming from GX and also on their pyproject ignores "ignore: Jupyter is migrating its paths to use standard platformdirs:DeprecationWarning", #1 warning # We are defining result_format at the Checkpoint level (which is the right one), but GX is wrongly # triggering the warning, because it is also considering the defaults of the expectations for triggering the warning. # Only place where we are not defining at Checkpoint level is for custom expectation local test, as we don't # need checkpoint for the test. "ignore:`result_format` configured at the Validator-level will not be persisted:UserWarning", # 12 warnings "ignore:`result_format` configured at the Expectation-level will not be persisted:UserWarning", # 12 warnings "ignore: jsonschema.RefResolver is deprecated as of v4.18.0:DeprecationWarning", #1985 warnings come from this one "ignore: The default dtype for empty Series will be 'object' instead of 'float64' in a future version.:DeprecationWarning", "ignore: The default dtype for empty Series will be:FutureWarning", # Warning about host keys on local ftp tests with paramiko "ignore: Unknown ssh-rsa host key for : UserWarning", # GX library is using fields.Number from marshmallow, which is deprecated and will be removed in Marshmallow 4.0 "ignore: `Number` field should not be instantiated. Use `Integer`, `Float`, or `Decimal` instead.:DeprecationWarning" ] ================================================ FILE: samples/cricket_dq_tutorial.py ================================================ # This sample tutorial is based on the dataset available here: https://www.kaggle.com/datasets/vikramrn/icc-mens-cricket-odi-world-cup-wc-2023-bowling. # The goal of the tutorial is to demonstrate how you can use the Lakehouse Engine to load data into a target location while assessing its data quality. 
# You can install the Lakehouse Engine framework with below command just like any other python library, # or you can also install it as a cluster-scoped library pip install lakehouse-engine # The ACON (algorithm configuration) is the way how you can interact with the Lakehouse Engine. # Note: don't forget to change locations, buckets and databases to match your environment. acon = { "input_specs": [ { "spec_id": "cricket_world_cup_bronze", "read_type": "batch", "data_format": "csv", "options": { "header": True, "delimiter": ",", }, "location": "s3://your_bucket_file_location/icc_wc_23_bowl.csv", } ], "dq_specs": [ { "spec_id": "cricket_world_cup_data_quality", "input_id": "cricket_world_cup_bronze", "dq_type": "validator", "store_backend": "s3", "bucket": "your_bucket", "result_sink_location": "s3://your_bucket/dq_result_sink/gx_blog/", "result_sink_db_table": "your_database.gx_blog_result_sink", "tag_source_data": True, "unexpected_rows_pk": ["player", "match_id"], "fail_on_error": False, "critical_functions": [ { "function": "expect_column_values_to_be_in_set", "args": { "column": "team", "value_set": [ "Sri Lanka", "Netherlands", "Australia", "England", "Bangladesh", "New Zealand", "India", "Afghanistan", "South Africa", "Pakistan", ], }, }, { "function": "expect_column_values_to_be_in_set", "args": { "column": "opponent", "value_set": [ "Sri Lanka", "Netherlands", "Australia", "England", "Bangladesh", "New Zealand", "India", "Afghanistan", "South Africa", "Pakistan", ], }, }, ], "dq_functions": [ { "function": "expect_column_values_to_not_be_null", "args": {"column": "player"}, }, { "function": "expect_column_values_to_be_between", "args": {"column": "match_id", "min_value": 0, "max_value": 47}, }, { "function": "expect_column_values_to_be_in_set", "args": {"column": "maidens", "value_set": [0, 1]}, }, ], }, ], "output_specs": [ { "spec_id": "cricket_world_cup_silver", "input_id": "cricket_world_cup_data_quality", "write_type": "overwrite", "db_table": 
"your_database.gx_blog_cricket", "location": "s3://your_bucket/rest_of_path/gx_blog_cricket/", "data_format": "delta", } ], } # You need to import the Load Data algorithm from the Lakehouse Engine, so that you can perform Data Loads. from lakehouse_engine.engine import load_data # Finally, you just need to run the Load Data algorithm with the ACON that you have just defined. load_data(acon=acon) ================================================ FILE: samples/tpch_load_and_analysis_tutorial.py ================================================ # Databricks notebook source # MAGIC %md # MAGIC ### How to use the Lakehouse Engine to load and analyse Data # MAGIC This sample is composed of two main sections and goals: # MAGIC 1. **Data Load (integrate data into the Lakehouse)** # MAGIC - load 2 data sources # MAGIC - join both sources and enhance the dataset with more information # MAGIC - write the output into a target table # MAGIC 2. **Data Analysis (analyse the data ingested in the previous step)** # MAGIC - read the ingested data # MAGIC - assess the quality of that data # MAGIC - output this data as a DataFrame to enable further processing # MAGIC - analyse the data with sample Databricks Notebook Dashboards # MAGIC # MAGIC The base dataset used, on this sample, is the TPCH Dataset from Databricks Datasets (https://docs.databricks.com/en/discover/databricks-datasets.html). # MAGIC Moreover, Databricks Notebook Dashboards are also used. This is why this example consists of a Databricks python Notebook, instead of simple raw python. # COMMAND ---------- # You can install the Lakehouse Engine framework with below command just like any other python library, # or you can also install it as a cluster-scoped library %pip install lakehouse-engine # COMMAND ---------- # MAGIC %md # MAGIC #### 1. 
Data Load # MAGIC On this section an example is provided in order to accomplish the following: # MAGIC - loading `orders` and `customers` TPCH data # MAGIC - add current date, join both data sources and identify Super VIPs # MAGIC - write data into the final table # MAGIC # MAGIC **Note:** as it can be seen in the following code, the Lakehouse Engine cannot offer transformers for everything one might want to do on the data, as there may be very specific use cases. This is why the Lakehouse Engine provides full flexibility with Custom Transformations (`custom_transformation`), which can be used to pass any custom function, as the `is_a_super_vip` function used on this example. # COMMAND ---------- from pyspark.sql.functions import col from pyspark.sql import DataFrame def is_a_super_vip(df: DataFrame) -> DataFrame: """Example of custom transformation. It checks if the totalprice for a particular order is within the 10% higher and if the order priority is URGENT. If both criterias are met, the customer is considered a super vip. Args: df: DataFrame passed as input. Returns: DataFrame: the transformed DataFrame. 
""" percentile_90 = df.approxQuantile("o_totalprice", [0.9], 0)[0] df = df.withColumn( "is_a_super_vip", (col("o_totalprice") >= percentile_90) & (col("o_orderpriority") == "1-URGENT") ) return df # COMMAND ---------- acon = { "input_specs": [ # Batch (streaming is also supported) read tpch orders delta files from Databricks datasets location { "spec_id": "tpch_orders", "read_type": "batch", "data_format": "delta", "location": "/databricks-datasets/tpch/delta-001/orders", }, # Batch read tpch customers from a samples delta table in Databricks { "spec_id": "tpch_customer", "read_type": "batch", "data_format": "delta", "db_table": "samples.tpch.customer", } ], "transform_specs": [ { "spec_id": "tpch_orders_transformed", "input_id": "tpch_orders", "transformers": [ # Add current date to easily track when a particular row was added { "function": "add_current_date", "args": { "output_col": "lak_load_date" } }, # Join orders with customers to get the customer name. # Having customer name in the table will make analysis easier { "function": "join", "args": { "join_with": "tpch_customer", "join_type": "left outer", "join_condition": "a.o_custkey = b.c_custkey", "select_cols": ["a.*", "b.c_name as customer_name"] } }, # Custom transformation to assess if a customer should be considered Super VIP. { "function": "custom_transformation", "args": {"custom_transformer": is_a_super_vip}, } ], }, ], "output_specs": [ # Overwrite data into an external table on top of the specified location, using delta data format. # Note: other write types are supported, such as append and merge, but overwrite is used for simplicity on this demo. 
{ "spec_id": "tpch_orders_output", "input_id": "tpch_orders_transformed", "write_type": "overwrite", "db_table": "your_database.tpch_orders", "location": "s3://your_s3_bucket/silver/tpch_orders/", "data_format": "delta", } ], } from lakehouse_engine.engine import load_data tpch_df = load_data(acon=acon) # COMMAND ---------- # As soon as the algorithm is finished, the dataframe output of the framework can be directly checked in order to analyse the data that have been just produced display(tpch_df["tpch_orders_output"]) # COMMAND ---------- # MAGIC %md # MAGIC #### 2. Data Analysis # MAGIC On this section an example is provided in order to accomplish the following: # MAGIC - reading the data loaded on the previous step, using a SQL query # MAGIC - assess the quality of the data, by applying Data Quality functions/expectations # MAGIC - output the data as a DataFrame for further processing # MAGIC - analyse the data with sample Databricks Notebook Dashboards # COMMAND ---------- acon = { "input_specs": [ # Batch read a custom SQL query from the table we have just inserted data into { "spec_id": "tpch_orders", "read_type": "batch", "data_format": "sql", "query": """ SELECT o_orderkey, customer_name, o_totalprice, is_a_super_vip FROM your_database.tpch_orders """, }, ], "dq_specs": [ # Assess the quality of data, by ensuring that the specified 3 columns have no nulls. { "spec_id": "tpch_orders_dq", "input_id": "tpch_orders", "dq_type": "validator", "bucket": "your_s3_bucket", "dq_functions": [ {"function": "expect_column_values_to_not_be_null", "args": {"column": "o_orderkey"}}, {"function": "expect_column_values_to_not_be_null", "args": {"column": "customer_name"}}, {"function": "expect_column_values_to_not_be_null", "args": {"column": "o_totalprice"}} ] }, ], "output_specs": [ # As the data is being analysed, there is no need to write it into any table or location. # Thus, the data output is just a Dataframe that can be used for further debug or processing. 
{ "spec_id": "validated_tpch_orders", "input_id": "tpch_orders_dq", "data_format": "dataframe", } ], } from lakehouse_engine.engine import load_data validated_tpch_df = load_data(acon=acon) # COMMAND ---------- # Create a Temporary View to make it easier to interact with the Data using SQL validated_tpch_df["validated_tpch_orders"].createOrReplaceTempView("tpch_order_analysis") # COMMAND ---------- # MAGIC %sql # MAGIC -- the data that came from the previous load_data algorithm execution can now be queried # MAGIC -- to analyse the customers and orders classified as SUPER VIP # MAGIC SELECT customer_name, o_totalprice, is_a_super_vip # MAGIC FROM tpch_order_analysis # MAGIC GROUP BY customer_name, o_totalprice, is_a_super_vip # MAGIC ORDER BY o_totalprice desc # COMMAND ---------- # MAGIC %sql # MAGIC SELECT customer_name, o_totalprice # MAGIC FROM tpch_order_analysis # MAGIC WHERE is_a_super_vip is True # MAGIC GROUP BY customer_name, o_totalprice # MAGIC ORDER BY o_totalprice desc # MAGIC LIMIT 10 # COMMAND ---------- ================================================ FILE: tests/__init__.py ================================================ """Tests package.""" ================================================ FILE: tests/configs/__init__.py ================================================ """This module has the engine test configurations.""" ================================================ FILE: tests/configs/engine.yaml ================================================ dq_bucket: /app/tests/lakehouse/out/feature dq_dev_bucket: /app/tests/lakehouse/out/feature notif_disallowed_email_servers: - smtp.test.com engine_usage_path: file:///app/tests/lakehouse/logs/lakehouse-engine-logs engine_dev_usage_path: file:///app/tests/lakehouse/logs/lakehouse-engine-logs collect_engine_usage: disabled dq_functions_column_list: - dq_rule_id - execution_point - filters - schema - table - column - dimension dq_result_sink_columns_to_delete: - partial_unexpected_list - 
partial_unexpected_counts - partial_unexpected_index_list - unexpected_list sharepoint_authority: https://login.microsoftonline.com sharepoint_api_domain: https://graph.microsoft.com sharepoint_company_domain: company_name.sharepoint.com prod_catalog: sample_catalog ================================================ FILE: tests/conftest.py ================================================ """Module to configure the test environment.""" from typing import Any, Generator from unittest.mock import patch import pytest from lakehouse_engine.core.exec_env import ExecEnv from tests.utils.exec_env_helpers import ExecEnvHelpers from tests.utils.local_storage import LocalStorage RESOURCES = "/app/tests/resources/" FEATURE_RESOURCES = RESOURCES + "feature" UNIT_RESOURCES = RESOURCES + "unit" LAKEHOUSE = "/app/tests/lakehouse/" LAKEHOUSE_FEATURE_IN = LAKEHOUSE + "in/feature" LAKEHOUSE_FEATURE_CONTROL = LAKEHOUSE + "control/feature" LAKEHOUSE_FEATURE_OUT = LAKEHOUSE + "out/feature" LAKEHOUSE_FEATURE_LOGS = LAKEHOUSE + "logs/lakehouse-engine-logs" @pytest.fixture(scope="session", autouse=True) def patch_databricks_utils_job_info() -> Generator: """Patch DatabricksUtils.get_databricks_job_information to return local values.""" with patch( "lakehouse_engine.utils.databricks_utils." 
"DatabricksUtils.get_databricks_job_information", return_value=("local", "local"), ): yield def pytest_addoption(parser: Any) -> Any: """Setting extra options for pytest command.""" parser.addoption( "--spark_driver_memory", action="store", help="memory limit for the spark driver (default 2g)", ) @pytest.fixture(scope="session", autouse=True) def spark_driver_memory(request: Any) -> Any: """Fetching the value of spark_driver_memory parameter.""" return request.config.getoption(name="--spark_driver_memory") @pytest.fixture(scope="session", autouse=True) def prepare_exec_env(spark_driver_memory: str) -> None: """Prepare the execution environment before any test is executed.""" # remove previous test lakehouse data LocalStorage.clean_folder(LAKEHOUSE) ExecEnv.set_default_engine_config("tests.configs") ExecEnvHelpers.prepare_exec_env(spark_driver_memory) ExecEnv.SESSION.sql(f"CREATE DATABASE IF NOT EXISTS test_db LOCATION '{LAKEHOUSE}'") @pytest.fixture(autouse=True) def before_each_test() -> Generator: """Reset default spark session configs.""" yield ExecEnvHelpers.reset_default_spark_session_configs() @pytest.fixture(scope="session", autouse=True) def test_session_closure(request: Any) -> None: """Finalizing resources.""" def finalizer() -> None: """Close spark session.""" ExecEnv.SESSION.stop() request.addfinalizer(finalizer) ================================================ FILE: tests/feature/__init__.py ================================================ """Feature tests focusing on algorithm execution with different acon functionalities.""" ================================================ FILE: tests/feature/custom_expectations/__init__.py ================================================ """Tests related to the custom expectation's implementation.""" ================================================ FILE: tests/feature/custom_expectations/test_custom_expectations.py ================================================ """Test custom expectation validations.""" from json 
# Each scenario pairs one custom expectation (and its arguments) with a read type.
# All scenarios listed here expect the validation to succeed.
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "expectation_name": "expect_column_pair_a_to_be_smaller_or_equal_than_b",
            "arguments": {
                "column_A": "salesorder",
                "column_B": "amount",
                "margin": 9.78,
            },
            "read_type": "batch",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_pair_a_to_be_smaller_or_equal_than_b",
            "arguments": {"column_A": "salesorder", "column_B": "amount"},
            "read_type": "streaming",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_multicolumn_column_a_must_equal_b_or_c",
            "arguments": {
                "column_list": ["item", "itemcode", "amount"],
            },
            "read_type": "batch",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_multicolumn_column_a_must_equal_b_or_c",
            "arguments": {
                "column_list": ["item", "itemcode", "amount"],
            },
            "read_type": "streaming",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_queried_column_agg_value_to_be",
            "arguments": {
                "template_dict": {
                    "column": "amount",
                    "group_column_list": "year, month, day",
                    "agg_type": "max",
                    "condition": "lesser",
                    "max_value": 10000,
                },
            },
            "read_type": "batch",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_queried_column_agg_value_to_be",
            "arguments": {
                "template_dict": {
                    "column": "amount",
                    "group_column_list": "year,month,day",
                    "agg_type": "count",
                    "condition": "greater",
                    "min_value": 0,
                },
            },
            "read_type": "streaming",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_values_to_be_date_not_older_than",
            "arguments": {
                "column": "date",
                "timeframe": {"years": 100},
            },
            "read_type": "streaming",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_values_to_be_date_not_older_than",
            "arguments": {
                "column": "date",
                "timeframe": {"years": 100},
            },
            "read_type": "batch",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b",  # noqa: E501
            "arguments": {"column_A": "EDATU", "column_B": "ERDAT"},
            "read_type": "streaming",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b",  # noqa: E501
            "arguments": {"column_A": "MBDAT", "column_B": "ERDATA"},
            "read_type": "batch",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_pair_a_to_be_not_equal_to_b",
            "arguments": {
                "column_A": "group_article",
                "column_B": "article_number",
            },
            "read_type": "streaming",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_pair_a_to_be_not_equal_to_b",
            "arguments": {
                "column_A": "group_article",
                "column_B": "article_number",
            },
            "read_type": "batch",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_values_to_not_be_null_or_empty_string",
            "arguments": {
                "column": "number",
            },
            "read_type": "streaming",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
        {
            "expectation_name": "expect_column_values_to_not_be_null_or_empty_string",
            "arguments": {
                "column": "number",
            },
            "read_type": "batch",
            "input_type": "dataframe_reader",
            "custom_expectation_result": "success",
        },
    ],
)
def test_custom_expectation(scenario: dict, caplog: Any) -> None:
    """Test the implementation of the custom expectations.

    The test cleans the scenario folders and tables, builds a dataframe input
    spec for the scenario's read type, runs the DQ validation and compares the
    stored results with the control data of the scenario.

    Args:
        scenario: scenario to test. This function reads the keys
            "expectation_name", "read_type" and "custom_expectation_result";
            the whole dict is also handed to the acon generator.
        caplog: captured log (not referenced in the test body).
    """
    # start from a clean state for this expectation
    _clean_folders(scenario["expectation_name"])

    # the input is passed as an already created dataframe ("dataframe" format)
    input_spec = {
        "spec_id": "sales_source",
        "read_type": scenario["read_type"],
        "data_format": "dataframe",
        "df_name": _generate_dataframe(
            scenario["read_type"], scenario["expectation_name"]
        ),
    }

    acon = _generate_acon(input_spec, scenario, "validator")

    # make the control files of the expectation available in the lakehouse
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario['expectation_name']}/data/control/*",
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario['expectation_name']}/data/",
    )

    execute_dq_validation(acon=acon)

    # the control file is selected by the expected result of the scenario
    dq_result_df, dq_control_df = _get_result_and_control_dfs(
        "test_db.sales_order",
        f'dq_control_{scenario["custom_expectation_result"]}',
        True,
        scenario["expectation_name"],
    )

    # only the identifying columns and the overall success flag are compared
    assert not DataframeHelpers.has_diff(
        dq_result_df.select("spec_id", "input_id", "success"),
        dq_control_df.fillna("").select("spec_id", "input_id", "success"),
    )

    # every stored validation result must expose its outcome and its config
    for key in dq_result_df.collect():
        for result in loads(key.validation_results):
            assert {
                "success",
                "expectation_config",
            }.issubset(result.keys())
def _generate_dataframe(load_type: str, expectation_name: str) -> DataFrame:
    """Build the csv based input dataframe of an expectation scenario.

    Args:
        load_type: "batch" reads a single source file with the batch reader;
            any other value reads all source files with the streaming reader.
        expectation_name: name of the expectation to test, used to locate the
            schema and the source data in the test resources.

    Returns:
        the generated dataframe.
    """
    expectation_resources = f"{TEST_RESOURCES}/{expectation_name}"

    if load_type == "batch":
        base_reader = ExecEnv.SESSION.read
        source_path = f"{expectation_resources}/data/source/part-01.csv"
    else:
        base_reader = ExecEnv.SESSION.readStream
        source_path = f"{expectation_resources}/data/source/*"

    # Both modes share the same csv options and the same explicit schema.
    csv_reader = (
        base_reader.format("csv").option("header", True).option("delimiter", "|")
    )
    sales_schema = SchemaUtils.from_file(
        f"file://{expectation_resources}/dq_sales_schema.json"
    )

    return csv_reader.schema(sales_schema).load(source_path)
infer_schema: whether to infer the schema or not. expectation_name: expectation name. Returns: the result and control dataframes. """ dq_result_df = DataframeHelpers.read_from_table(table) dq_control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{expectation_name}/data/{file_name}.csv", file_format="csv", options={"header": True, "delimiter": "|", "inferSchema": infer_schema}, ) return dq_result_df, dq_control_df ================================================ FILE: tests/feature/custom_expectations/test_expectation_validity.py ================================================ """Module with the validation code for the custom expectations.""" import copy import importlib import re import pytest from lakehouse_engine.core.definitions import DQDefaults """This value '✔' is used to filter the output from the GX diagnostics""" CHECKMARK = "\u2714" DIAGNOSTICS_VALIDATIONS = [ " ✔ Has a docstring, including a one-line short description", " ✔ Has at least one positive and negative example case, and all test cases pass", " ✔ Has core logic and passes tests on at least one Execution Engine", " ✔ All [0-9]+ tests for spark are passing", " ✔ Has core logic that passes tests for all applicable Execution Engines and SQL" " dialects", " ✔ All [0-9]+ tests for spark are passing", ] METRIC_NAME_TYPES = [ "column_values", "multicolumn_values", "column_pair_values", "table_rows", "table_columns", ] MAP_METRICS = [] @pytest.mark.parametrize("expectation", DQDefaults.CUSTOM_EXPECTATION_LIST.value) def test_expectation_validity(expectation: str) -> None: """Validates the custom expectations defined in the project. Based on the diagnostics of the custom expectations this test validates if all the best practices are being followed. """ result, metric_name = _run_diagnostics(expectation) _process_diagnostics_output(result) if metric_name: assert _validate_metric_name_structure(metric_name), ( f"Metric name {metric_name} has the incorrect format. 
def _validate_metric_name_structure(metric_name: str) -> bool:
    """Validate the structure of a custom expectation's metric name.

    The metric name must have exactly two parts separated by a '.', and the
    first part must be one of the expectation types in METRIC_NAME_TYPES.

    Args:
        metric_name: custom expectation's metric name.

    Returns:
        True when the metric name follows the expected structure, False otherwise.
    """
    parts = metric_name.split(".")
    # The length check comes first so malformed names never reach the type lookup.
    return len(parts) == 2 and parts[0] in METRIC_NAME_TYPES
@pytest.mark.parametrize("scenario", ["calculate_kpi"])
def test_calculate_kpi_and_merge(scenario: str) -> None:
    """Check a load that applies a custom transformation function.

    Schemas and source files are staged into the lakehouse input area, the
    load configured by get_test_acon() is executed and the written delta data
    is compared against the control data.

    Args:
        scenario: scenario to test.
    """
    scenario_resources = f"{TEST_RESOURCES}/{scenario}"
    scenario_input = f"{TEST_LAKEHOUSE_IN}/{scenario}"
    control_data = f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data"

    # Stage the schemas first and the source data second, as the load expects both.
    staging_steps = (
        (f"{scenario_resources}/*_schema.json", f"{scenario_input}/"),
        (f"{scenario_resources}/data/source/*.csv", f"{scenario_input}/data/"),
    )
    for origin, destination in staging_steps:
        LocalStorage.copy_file(origin, destination)

    load_data(acon=get_test_acon())

    LocalStorage.copy_file(
        f"{scenario_resources}/data/control/*.csv",
        f"{control_data}/",
    )

    actual_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/data",
        file_format=InputFormat.DELTAFILES.value,
    )
    control_schema = SchemaUtils.from_file_to_dict(
        f"file://{scenario_input}/control_schema.json"
    )
    expected_df = DataframeHelpers.read_from_file(control_data, schema=control_schema)

    assert not DataframeHelpers.has_diff(actual_df, expected_df)
""" multiplied_df = df.withColumn("amount", col("amount") * 100) return multiplied_df def get_test_acon() -> dict: """Creates a test ACON with the desired logic for the algorithm. Returns: dict: the ACON for the algorithm configuration. """ return { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": {"header": True, "delimiter": "|"}, "location": "file:///app/tests/lakehouse/in/feature/" "data_loader_custom_transformer/delta_load/data", } ], "transform_specs": [ { "spec_id": "transformed_sales_source", "input_id": "sales_source", "transformers": [ { "function": "custom_transformation", "args": {"custom_transformer": multiply_by_100}, }, { "function": "condense_record_mode_cdc", "args": { "business_key": ["salesorder", "item"], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record", ], "record_mode_col": "recordmode", "valid_record_modes": ["", "N", "R", "D", "X"], }, }, ], } ], "dq_specs": [ { "spec_id": "checked_transformed_sales_source", "input_id": "transformed_sales_source", "dq_type": "validator", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/" "data_loader_custom_transformer/dq", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "dq_functions": [ { "function": "expect_column_values_to_not_be_null", "args": {"column": "article"}, } ], }, ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "checked_transformed_sales_source", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/" "data_loader_custom_transformer/delta_load/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/" "data_loader_custom_transformer/delta_load/checkpoint" }, "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder " "and current.item = new.item " "and current.date <=> new.date", "update_predicate": "new.actrequest_timestamp > " 
@pytest.mark.parametrize("scenario", ["delta_load"])
def test_delta_load(scenario: str) -> None:
    """Check a file by file load that uses a custom transformation function.

    The target table is created, the source files are delivered one at a time
    (each followed by a load with the ACON from get_test_acon()) and the final
    delta data is compared against the control data.

    Args:
        scenario: scenario to test.
    """
    source_dir = f"{TEST_RESOURCES}/{scenario}/data/source"
    input_dir = f"{TEST_LAKEHOUSE_IN}/{scenario}/data/"
    output_dir = f"{TEST_LAKEHOUSE_OUT}/{scenario}/data"
    control_dir = f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data"

    _create_table(scenario, output_dir)

    # Delivery order matters and is kept as is: part-03 arrives before part-02.
    for part in ("part-01", "part-03", "part-02", "part-04"):
        LocalStorage.copy_file(f"{source_dir}/{part}.csv", input_dir)
        load_data(acon=get_test_acon())

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario}/data/control/part-01.csv",
        f"{control_dir}/",
    )

    actual_df = DataframeHelpers.read_from_file(
        output_dir,
        file_format=InputFormat.DELTAFILES.value,
    )
    expected_df = DataframeHelpers.read_from_file(control_dir)

    assert not DataframeHelpers.has_diff(actual_df, expected_df)
def get_test_acon() -> dict:
    """Assemble the ACON used by the sql transformation test.

    The ACON reads the csv sales data in batch mode, exposes it through the
    "sales_sql" temp view, applies the module level SQL statement as a
    sql_transformation and overwrites the delta output location.

    Returns:
        dict: the ACON for the algorithm configuration.
    """
    scenario_path = "data_loader_custom_transformer/sql_transformation"
    input_root = f"file:///app/tests/lakehouse/in/feature/{scenario_path}"
    output_root = f"file:///app/tests/lakehouse/out/feature/{scenario_path}"

    sales_source = {
        "spec_id": "sales_source",
        "read_type": "batch",
        "data_format": "csv",
        "options": {"mode": "FAILFAST", "header": True, "delimiter": "|"},
        "schema_path": f"{input_root}/source_schema.json",
        "location": f"{input_root}/data",
        "temp_view": "sales_sql",
    }

    calculated_kpi = {
        "spec_id": "calculated_kpi",
        "input_id": "sales_source",
        "transformers": [
            {
                "function": "sql_transformation",
                "args": {"sql": SQL},
            }
        ],
    }

    sales_bronze = {
        "spec_id": "sales_bronze",
        "input_id": "calculated_kpi",
        "write_type": "overwrite",
        "data_format": "delta",
        "location": f"{output_root}/data",
    }

    return {
        "input_specs": [sales_source],
        "transform_specs": [calculated_kpi],
        "output_specs": [sales_bronze],
    }
""" LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/*_schema.json", f"{TEST_LAKEHOUSE_IN}/{scenario}/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/*.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) load_data(acon=get_test_acon()) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/control/*.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/", ) result_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario}/data", file_format=InputFormat.DELTAFILES.value, ) control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data", schema=SchemaUtils.from_file_to_dict( f"file://{TEST_LAKEHOUSE_IN}/{scenario}/control_schema.json" ), ) assert not DataframeHelpers.has_diff(result_df, control_df) ================================================ FILE: tests/feature/delta_load/__init__.py ================================================ """Delta load feature tests.""" ================================================ FILE: tests/feature/delta_load/test_delta_load_group_and_rank.py ================================================ """Test delta loads with group and rank.""" from typing import List import pytest from lakehouse_engine.core.definitions import InputFormat from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.engine import load_data from lakehouse_engine.utils.configs.config_utils import ConfigUtils from lakehouse_engine.utils.schema_utils import SchemaUtils from tests.conftest import ( FEATURE_RESOURCES, LAKEHOUSE_FEATURE_CONTROL, LAKEHOUSE_FEATURE_IN, LAKEHOUSE_FEATURE_OUT, ) from tests.utils.dataframe_helpers import DataframeHelpers from tests.utils.local_storage import LocalStorage TEST_PATH = "delta_load/group_and_rank" TEST_RESOURCES = f"{FEATURE_RESOURCES}/{TEST_PATH}" TEST_LAKEHOUSE_IN = f"{LAKEHOUSE_FEATURE_IN}/{TEST_PATH}" TEST_LAKEHOUSE_CONTROL = f"{LAKEHOUSE_FEATURE_CONTROL}/{TEST_PATH}" TEST_LAKEHOUSE_OUT = f"{LAKEHOUSE_FEATURE_OUT}/{TEST_PATH}" 
@pytest.mark.parametrize(
    "scenario",
    [
        ["with_duplicates_in_same_file", "batch"],
        ["with_duplicates_in_same_file", "streaming"],
        ["fail_with_duplicates_in_same_file", "batch"],
        ["fail_with_duplicates_in_same_file", "streaming"],
    ],
)
def test_delta_load_group_and_rank(scenario: List[str]) -> None:
    """Check delta loads that rely on group and rank, in batch and streaming.

    Args:
        scenario: scenario to test, as a [scenario name, load mode] pair.
            with_duplicates_in_same_file - the source produces duplicated rows
            within the same file (e.g., an order is cancelled and created
            within the same file).
            fail_with_duplicates_in_same_file - the load is expected to end up
            different from the control data, because sales order 7 item 1 has
            the cancelled status before the created one in the second source
            data file.
    """
    scenario_name, load_mode = scenario
    control_dir = f"{TEST_LAKEHOUSE_CONTROL}/{scenario_name}/{load_mode}/data"

    _create_table(scenario)
    execute_loads(scenario, 1)
    if load_mode == "streaming":
        # simulate a scenario where the same data is loaded twice in streaming mode
        execute_loads(scenario, 2)

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario_name}/data/control/{load_mode}.csv",
        f"{control_dir}/",
    )

    actual_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario_name}/{load_mode}/data",
        file_format=InputFormat.DELTAFILES.value,
    )
    control_schema = SchemaUtils.from_file_to_dict(
        f"file://{TEST_LAKEHOUSE_IN}/{scenario_name}/{load_mode}/"
        f"control_{load_mode}_schema.json"
    )
    expected_df = DataframeHelpers.read_from_file(
        f"{control_dir}/{load_mode}.csv",
        schema=control_schema,
    )

    if scenario_name != "fail_with_duplicates_in_same_file":
        assert not DataframeHelpers.has_diff(actual_df, expected_df)
    else:
        # sales order 7 item 1 in second file has event cancelled before created
        assert DataframeHelpers.has_diff(actual_df, expected_df)
def _create_table(scenario: List[str]) -> None:
    """Create the delta test table of a scenario, if it does not exist yet.

    The table is named test_db.<scenario name>_<load mode> and is located in
    the lakehouse output folder of that scenario and load mode.

    Args:
        scenario: scenario being tested, as a [scenario name, load mode] pair.
    """
    # The trailing columns depend on the load mode:
    #   streaming -> extraction_date, lhe_batch_id, lhe_row_id
    #   otherwise -> lhe_row_id, extraction_date (no lhe_batch_id column)
    ExecEnv.SESSION.sql(
        f"""
        CREATE TABLE IF NOT EXISTS test_db.{scenario[0]}_{scenario[1]} (
            salesorder int,
            item int,
            event string,
            changed_on int,
            date int,
            customer string,
            article string,
            amount int,
            {"extraction_date string," if scenario[1] == "streaming" else "lhe_row_id int,"}
            {"lhe_batch_id int," if scenario[1] == "streaming" else ""}
            {"lhe_row_id int" if scenario[1] == "streaming" else "extraction_date string"}
        )
        USING delta
        LOCATION '{TEST_LAKEHOUSE_OUT}/{scenario[0]}/{scenario[1]}/data'
        """
    )
def execute_loads(scenario: str) -> None:
    """Execute the init load followed by the delta load of a scenario.

    The schema files are staged once. Then each source file is copied into the
    lakehouse input folder and loaded with its matching ACON, in order: the
    14:00 file with batch_init.json, then the 15:00 file with batch_delta.json.

    Args:
        scenario: name of the scenario to test; it is used as a single path
            segment below the test resources and the lakehouse folders.
    """
    # NOTE(review): the schemas are taken from the root of the test resources,
    # not from the scenario folder - presumably shared by all scenarios; confirm.
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/*schema.json",
        f"{TEST_LAKEHOUSE_IN}/{scenario}/",
    )

    load_steps = (
        ("WE_SO_SCL_202108111400000000.csv", "batch_init.json"),
        ("WE_SO_SCL_202108111500000000.csv", "batch_delta.json"),
    )
    for source_file, acon_file in load_steps:
        # A file must be in place before the load that is meant to pick it up.
        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/{scenario}/data/source/{source_file}",
            f"{TEST_LAKEHOUSE_IN}/{scenario}/data/{source_file}",
        )
        acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/{acon_file}")
        load_data(acon=acon)
""" _create_table(f"{scenario[0]}", f"{TEST_LAKEHOUSE_OUT}/{scenario[0]}/data") LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/source/part-01.{scenario[1]}", f"{TEST_LAKEHOUSE_IN}/{scenario[0]}/data/", ) acon = ConfigUtils.get_acon( f"file://{TEST_RESOURCES}/{scenario[0]}/batch_init.json" ) load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/source/part-0[2,3,4].{scenario[1]}", f"{TEST_LAKEHOUSE_IN}/{scenario[0]}/data/", ) acon = ConfigUtils.get_acon( f"file://{TEST_RESOURCES}/{scenario[0]}/batch_delta.json" ) load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/control/part-01.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario[0]}/data/", ) result_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario[0]}/data", file_format=InputFormat.DELTAFILES.value, ) control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario[0]}/data" ) assert not DataframeHelpers.has_diff(result_df, control_df) @pytest.mark.parametrize( "scenario", [ ["late_arriving_changes", "batch"], ["out_of_order_changes", "batch"], ["late_arriving_changes", "streaming"], ["out_of_order_changes", "streaming"], ], ) def test_file_by_file(scenario: str) -> None: """Test delta loads in batch mode. Args: scenario: scenario to test. late_arriving_changes - This test checks if if changes arrive late (certain changes on part-02 are incomplete and only arrive in part-03), the data stays consistent. out_of_order_changes - This test checks if by loading the data out of order (part-03 is loaded before part-02) the delta table stays consistent. 
""" _create_table( f"{scenario[0]}_{scenario[1]}", f"{TEST_LAKEHOUSE_OUT}/{scenario[0]}/{scenario[1]}/data", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{scenario[0]}/{scenario[1]}/data/", ) load_data( f"file://{TEST_RESOURCES}/{scenario[0]}/" f"{scenario[1] + ('_init' if scenario[1] == 'batch' else '_delta')}.json" ) if scenario[0] == "out_of_order_changes": second_file = "part-03" third_file = "part-02" else: second_file = "part-02" third_file = "part-03" LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/source/{second_file}.csv", f"{TEST_LAKEHOUSE_IN}/{scenario[0]}/{scenario[1]}/data/", ) acon = ConfigUtils.get_acon( f"file://{TEST_RESOURCES}/{scenario[0]}/{scenario[1]}_delta.json" ) load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/source/{third_file}.csv", f"{TEST_LAKEHOUSE_IN}/{scenario[0]}/{scenario[1]}/data/", ) load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/source/part-04.csv", f"{TEST_LAKEHOUSE_IN}/{scenario[0]}/{scenario[1]}/data/", ) load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/control/part-01.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario[0]}/{scenario[1]}/data/", ) result_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario[0]}/{scenario[1]}/data", file_format=InputFormat.DELTAFILES.value, ) control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario[0]}/{scenario[1]}/data" ) assert not DataframeHelpers.has_diff(result_df, control_df) @pytest.mark.parametrize("scenario", ["backfill"]) def test_backfill(scenario: str) -> None: """Test backfill process of a delta load based table. Args: scenario: scenario to test. This test performs a regular delta load and, after that, backfills from the source where we simulate that all data contained in part-2, part-3 and part-04 has changed to be amount * 10. 
""" _create_table(f"{scenario}", f"{TEST_LAKEHOUSE_OUT}/{scenario}/data") LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/batch_init.json") load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-0[2,3,4].csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/batch_delta.json") load_data(acon=acon) LocalStorage.delete_file(f"{TEST_LAKEHOUSE_IN}/{scenario}/data/part-0[2,3,4].csv") LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-05.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) acon = ConfigUtils.get_acon( f"file://{TEST_RESOURCES}/{scenario}/batch_backfill.json" ) load_data(acon=acon) LocalStorage.delete_file(f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/part-01.csv") LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/control/part-01.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/", ) result_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario}/data", file_format=InputFormat.DELTAFILES.value, ) control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data" ) assert not DataframeHelpers.has_diff(result_df, control_df) @pytest.mark.parametrize("scenario", ["direct_silver_load"]) def test_direct_silver_load(scenario: str) -> None: """Test a delta load based process that loads to bronze and silver in the same run. We get data from the source, load it to bronze and then into silver, without needing to run two separate algorithms. Args: scenario: scenario to test. 
""" _create_table(f"{scenario}_bronze", f"{TEST_LAKEHOUSE_OUT}/{scenario}/bronze/data") _create_table(f"{scenario}_silver", f"{TEST_LAKEHOUSE_OUT}/{scenario}/silver/data") scenario = "direct_silver_load" LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/batch_init.json") load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-0[2,3,4].csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/batch_delta.json") load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/control/part-01.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/bronze/data/", ) bronze_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario}/bronze/data", file_format=InputFormat.DELTAFILES.value, ) control_bronze_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/bronze/data", file_format="csv" ) assert not DataframeHelpers.has_diff(bronze_df, control_bronze_df) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/control/part-02.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/silver/data/", ) silver_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario}/silver/data", file_format=InputFormat.DELTAFILES.value, ) control_silver_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/silver/data", file_format="csv" ) assert not DataframeHelpers.has_diff(silver_df, control_silver_df) def _create_table(table_name: str, location: str) -> None: """Create test table. Args: table_name: name of the table. location: location of the table. 
""" ExecEnv.SESSION.sql( f""" CREATE TABLE IF NOT EXISTS test_db.{table_name} ( extraction_timestamp string, actrequest_timestamp string, request string, datapakid int, partno int, record int, salesorder int, item int, recordmode string, date int, customer string, article string, amount int ) USING delta LOCATION '{location}' """ ) ================================================ FILE: tests/feature/test_append_load.py ================================================ """Test append loads.""" from typing import Any import pytest from py4j.protocol import Py4JJavaError from lakehouse_engine.engine import load_data from lakehouse_engine.utils.configs.config_utils import ConfigUtils from tests.conftest import ( FEATURE_RESOURCES, LAKEHOUSE_FEATURE_CONTROL, LAKEHOUSE_FEATURE_IN, LAKEHOUSE_FEATURE_OUT, ) from tests.utils.dataframe_helpers import DataframeHelpers from tests.utils.local_storage import LocalStorage TEST_NAME = "append_load" TEST_RESOURCES = f"{FEATURE_RESOURCES}/{TEST_NAME}" TEST_LAKEHOUSE_IN = f"{LAKEHOUSE_FEATURE_IN}/{TEST_NAME}" TEST_LAKEHOUSE_CONTROL = f"{LAKEHOUSE_FEATURE_CONTROL}/{TEST_NAME}" TEST_LAKEHOUSE_OUT = f"{LAKEHOUSE_FEATURE_OUT}/{TEST_NAME}" @pytest.mark.parametrize("scenario", ["jdbc_permissive"]) def test_permissive_jdbc_append_load(scenario: str) -> None: """Test append loads from jdbc source with permissive read mode. Args: scenario: scenario to test. 
""" LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) _append_data_into_source(scenario) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/batch_init.json") load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-02.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) _append_data_into_source(scenario) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/batch.json") load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-03.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) _append_data_into_source(scenario) load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/control/part-01.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/", ) result_df = DataframeHelpers.read_from_table(f"test_db.{scenario}_table") control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data" ) assert not DataframeHelpers.has_diff(result_df, control_df) @pytest.mark.parametrize("scenario", ["failfast"]) def test_failfast_append_load(scenario: str) -> None: """Test append loads with failfast read mode. Args: scenario: scenario to test. """ LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/batch_init.json") load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-0[2,3].csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) with pytest.raises(Py4JJavaError) as e: # should raise malformed records due to failfast, as amount column was # renamed to amount2 and there is one more column in the pat-03.csv file. 
acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/batch.json") load_data(acon=acon) assert "Malformed CSV record" in str(e.value) @pytest.mark.parametrize("scenario", ["streaming_dropmalformed"]) def test_streaming_dropmalformed(scenario: str) -> None: """Test append loads, in streaming mode, with dropmalformed read mode. The same streaming acon is executed after each of the three source files is added, and the resulting table is compared with the control data, which is read with the input schema defined in that acon. Args: scenario: scenario to test. """ LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/streaming.json") load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-02.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-03.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/control/part-01.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/", ) result_df = DataframeHelpers.read_from_table(f"test_db.{scenario}_table") control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data", schema=ConfigUtils.read_json_acon( f"file://{TEST_RESOURCES}/{scenario}/streaming.json" )["input_specs"][0]["schema"], ) assert not DataframeHelpers.has_diff(result_df, control_df) @pytest.mark.parametrize("scenario", ["streaming_with_terminators"]) def test_streaming_with_terminators(scenario: str, caplog: Any) -> None: """Test append loads, in streaming mode, with terminator functions. Besides comparing the resulting table with the control data, it is asserted that the optimize, vacuum and compute statistics operations were logged for the target table. Args: scenario: scenario to test. caplog: captured log. 
""" LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/data/", ) acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}/streaming.json") load_data(acon=acon) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario}/data/control/part-01.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/", ) result_df = DataframeHelpers.read_from_table(f"test_db.{scenario}_table") control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data", schema=ConfigUtils.read_json_acon( f"file://{TEST_RESOURCES}/{scenario}/streaming.json" )["input_specs"][0]["schema"], ) assert not DataframeHelpers.has_diff(result_df, control_df) assert ( "sql command: OPTIMIZE test_db.streaming_with_terminators_table" in caplog.text ) assert "Vacuuming table: test_db.streaming_with_terminators_table" in caplog.text assert ( "sql command: ANALYZE TABLE test_db.streaming_with_terminators_table " "COMPUTE STATISTICS" in caplog.text ) def _append_data_into_source(scenario: str) -> None: """Append data into jdbc sql lite table used as source for append load tests. Args: scenario: scenario being tested. 
""" source_df = DataframeHelpers.read_from_file(f"{TEST_LAKEHOUSE_IN}/{scenario}/data") DataframeHelpers.write_into_jdbc_table( source_df, f"jdbc:sqlite:{TEST_LAKEHOUSE_IN}/{scenario}/tests.db", f"{scenario}" ) ================================================ FILE: tests/feature/test_data_quality.py ================================================ """Test data quality process in different types of data loads.""" from json import loads from typing import Any import pytest from pyspark.sql import DataFrame from pyspark.sql.functions import array_sort, col, regexp_replace, transform from pyspark.sql.types import IntegerType, StringType, StructField, StructType from lakehouse_engine.core.definitions import ( DQExecutionPoint, DQFunctionSpec, DQSpec, DQType, ) from lakehouse_engine.dq_processors.dq_factory import DQFactory from lakehouse_engine.dq_processors.exceptions import DQValidationsFailedException from lakehouse_engine.engine import load_data from lakehouse_engine.utils.dq_utils import PrismaUtils from lakehouse_engine.utils.schema_utils import SchemaUtils from tests.conftest import ( FEATURE_RESOURCES, LAKEHOUSE_FEATURE_CONTROL, LAKEHOUSE_FEATURE_IN, LAKEHOUSE_FEATURE_OUT, ) from tests.utils.dataframe_helpers import DataframeHelpers from tests.utils.dq_rules_table_utils import _create_dq_functions_source_table from tests.utils.local_storage import LocalStorage TEST_PATH = "data_quality" TEST_RESOURCES = f"{FEATURE_RESOURCES}/{TEST_PATH}" TEST_LAKEHOUSE_IN = f"{LAKEHOUSE_FEATURE_IN}/{TEST_PATH}" TEST_LAKEHOUSE_CONTROL = f"{LAKEHOUSE_FEATURE_CONTROL}/{TEST_PATH}" TEST_LAKEHOUSE_OUT = f"{LAKEHOUSE_FEATURE_OUT}/{TEST_PATH}" @pytest.mark.parametrize( "scenario", [ { "name": "delta_with_duplicates", "read_type": "streaming", "results_exploded": True, "tag_source_data": False, }, { "name": "delta_with_duplicates_tag", "read_type": "streaming", "results_exploded": True, "tag_source_data": True, }, { "name": "delta_with_dupl_tag_gen_fail", "read_type": "streaming", 
"results_exploded": True, "tag_source_data": True, }, { "name": "no_transformers", "read_type": "streaming", "results_exploded": False, "tag_source_data": False, }, { "name": "full_overwrite", "read_type": "batch", "results_exploded": True, "tag_source_data": False, }, { "name": "full_overwrite_tag", "read_type": "batch", "results_exploded": True, "tag_source_data": True, }, ], ) def test_load_with_dq_validator(scenario: dict) -> None: """Test the data quality validator process as part of the load_data algorithm. Description of the test scenarios: - delta_with_duplicates - test the DQ process for a streaming init and delta load with duplicates and merge strategy scenario. It's generated a DQ result_sink where some columns are exploded to make easier the analysis. - delta_with_duplicates_tag - similar to delta_with_duplicates but using DQ Row Tagging. The scenarios with tagging, test not only the loads and the result DQ sink, but also the resulting data to assert the "dq_validations" column that gets added into the source data used. This scenario covers different kinds of expectations (table, column aggregated, column, multi-column, column pair) with successes and failures. - delta_with_dupl_tag_gen_fail - similar to delta_with_duplicates_tag, but tests DQ success on init and then only general failures (not row level). - no_transformers - test the DQ process for a streaming init and delta without transformers or micro batch transformers. It's generated a DQ result_sink in a raw format. - full_overwrite - test the DQ process for a batch full overwrite scenario. It's generated a DQ result_sink where some columns are exploded to make easier the analysis, in which includes some extra columns set by the user to be included (using parameter result_sink_extra_columns). - full_overwrite_tag - similar to full_overwrite but using DQ Row Tagging. This scenario covers different kinds of expectations, all succeeded. Args: scenario: scenario to test. name - name of the scenario. 
read_type - type of read, namely batch or streaming. results_exploded - flag to generate a DQ result_sink in a raw format (False) or an exploded format easier for analysis (True). tag_source_data - whether the test scenario tests tagging the source data with the DQ results or not. """ test_name = "load_with_dq_validator" LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{test_name}/{scenario['name']}/data/", ) load_data( f"file://{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"{scenario['read_type']}_init.json" ) if "full_overwrite" in scenario["name"]: LocalStorage.clean_folder( f"{TEST_LAKEHOUSE_IN}/{test_name}/{scenario['name']}/data", ) result_sink_df = DataframeHelpers.read_from_table( f"test_db.validator_{scenario['name']}" ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"data/source/part-0[2,3,4].csv", f"{TEST_LAKEHOUSE_IN}/{test_name}/{scenario['name']}/data/", ) load_data( f"file://{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"{scenario['read_type']}_new.json" ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"data/control/data_validator.json", f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/validator/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/data/control/sales.json", f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"data/control/*_schema.json", f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/validator/", ) result_sink_df = DataframeHelpers.read_from_table( f"test_db.validator_{scenario['name']}" ) control_sink_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/validator/data/", file_format="json", schema=SchemaUtils.from_file_to_dict( f"file://{TEST_LAKEHOUSE_CONTROL}/{test_name}/" 
f"{scenario['name']}/validator/data_validator_schema.json" ), ) # drop columns for which the values vary from run to run (ex: depending on date) cols_to_drop = [ "checkpoint_config", "run_name", "run_time", "run_results", "validation_results", "validation_result_identifier", "exception_info", "batch_id", "run_time_year", "run_time_month", "run_time_day", "kwargs", "processed_keys", ] assert ( result_sink_df.columns == control_sink_df.select(*result_sink_df.columns).columns ) assert not DataframeHelpers.has_diff( result_sink_df.drop(*cols_to_drop), control_sink_df.drop(*cols_to_drop), ) if scenario["tag_source_data"]: result_data_df = _prepare_validation_df( DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{test_name}/{scenario['name']}/data", file_format="delta", ) ) control_data_df = _prepare_validation_df( DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/data/", file_format="json", schema=SchemaUtils.from_file_to_dict( f"file://{TEST_LAKEHOUSE_CONTROL}/{test_name}/" f"{scenario['name']}/validator/sales_schema.json" ), ) ) assert not DataframeHelpers.has_diff(result_data_df, control_data_df) @pytest.mark.parametrize( "scenario", [ { "name": "delta_with_duplicates_tag", "read_type": "streaming", "results_exploded": True, }, { "name": "delta_with_dupl_tag_gen_fail", "read_type": "streaming", "results_exploded": True, }, { "name": "full_overwrite_tag", "read_type": "batch", "results_exploded": True, }, ], ) def test_load_with_dq_validator_table(scenario: dict) -> None: """Test the data quality validator process as part of the load_data algorithm. Description of the test scenarios: - delta_with_duplicates_tag - test the DQ process for a streaming init and delta load with duplicates and merge strategy scenario. It's generated a DQ result_sink where some columns are exploded to make easier the analysis using DQ Row Tagging. 
The scenarios with tagging test not only the loads and the result DQ sink, but also the resulting data to assert the "dq_validations" column that gets added into the source data used. This scenario covers different kinds of expectations (table, column aggregated, column, multi-column, column pair) with successes and failures. - delta_with_dupl_tag_gen_fail - similar to delta_with_duplicates_tag, but tests DQ success on init and then only general failures (not row level). - full_overwrite_tag - test the DQ process for a batch full overwrite scenario. A DQ result_sink is generated where some columns are exploded to make the analysis easier, which includes some extra columns set by the user to be included (using parameter result_sink_extra_columns). This scenario covers different kinds of expectations, all succeeded. Args: scenario: scenario to test. name - name of the scenario. read_type - type of read, namely batch or streaming. results_exploded - flag to generate a DQ result_sink in a raw format (False) or an exploded format easier for analysis (True). Unlike test_load_with_dq_validator, the scenarios of this test have no tag_source_data flag: the tagged resulting data is always compared with the control data. 
""" test_name = "load_with_dq_table" LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{test_name}/{scenario['name']}/data/", ) _create_dq_functions_source_table( test_resources_path=TEST_RESOURCES, lakehouse_in_path=TEST_LAKEHOUSE_IN, lakehouse_out_path=TEST_LAKEHOUSE_OUT, test_name=f"{test_name}/{scenario['name']}", scenario=scenario["name"], table_name=f"test_db.dq_functions_source_{test_name}_{scenario['name']}_init", ) load_data( f"file://{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"{scenario['read_type']}_init.json" ) if "full_overwrite" in scenario["name"]: LocalStorage.clean_folder( f"{TEST_LAKEHOUSE_IN}/{test_name}/{scenario['name']}/data", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"data/source/part-0[2,3,4].csv", f"{TEST_LAKEHOUSE_IN}/{test_name}/{scenario['name']}/data/", ) _create_dq_functions_source_table( test_resources_path=TEST_RESOURCES, lakehouse_in_path=TEST_LAKEHOUSE_IN, lakehouse_out_path=TEST_LAKEHOUSE_OUT, test_name=f"{test_name}/{scenario['name']}", scenario=scenario["name"], table_name=f"test_db.dq_functions_source_{test_name}_{scenario['name']}_new", ) load_data( f"file://{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"{scenario['read_type']}_new.json" ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"data/control/data_validator.json", f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/validator/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/data/control/sales.json", f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{test_name}/{scenario['name']}/" f"data/control/*_schema.json", f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/validator/", ) result_sink_df = DataframeHelpers.read_from_file( location=f"{LAKEHOUSE_FEATURE_OUT}/{scenario['name']}/result_sink/", file_format="delta", ) 
control_sink_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/validator/data/", file_format="json", schema=SchemaUtils.from_file_to_dict( f"file://{TEST_LAKEHOUSE_CONTROL}/{test_name}/" f"{scenario['name']}/validator/data_validator_schema.json" ), ) # drop columns for which the values vary from run to run (ex: depending on date) cols_to_drop = [ "checkpoint_config", "run_name", "run_time", "run_results", "validation_results", "validation_result_identifier", "exception_info", "batch_id", "run_time_year", "run_time_month", "run_time_day", "kwargs", "meta", ] assert ( result_sink_df.columns == control_sink_df.select(*result_sink_df.columns).columns ) assert not DataframeHelpers.has_diff( result_sink_df.drop(*cols_to_drop), control_sink_df.drop(*cols_to_drop), ) result_data_df = _prepare_validation_df( DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{test_name}/{scenario['name']}/data", file_format="delta", ) ) control_data_df = _prepare_validation_df( DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{test_name}/{scenario['name']}/data/", file_format="json", schema=SchemaUtils.from_file_to_dict( f"file://{TEST_LAKEHOUSE_CONTROL}/{test_name}/" f"{scenario['name']}/validator/sales_schema.json" ), ) ) assert not DataframeHelpers.has_diff(result_data_df, control_data_df) @pytest.mark.parametrize( "scenario", [ { "spec_id": "dq_success", "dq_type": "validator", "dq_functions": [ DQFunctionSpec("expect_column_to_exist", {"column": "article"}), DQFunctionSpec( "expect_table_row_count_to_be_between", {"min_value": 0, "max_value": 50}, ), ], "fail_on_error": True, "critical_functions": None, "max_percentage_failure": None, }, { "spec_id": "dq_failure", "dq_type": "validator", "dq_functions": [ DQFunctionSpec("expect_column_to_exist", {"column": "article"}), DQFunctionSpec( "expect_table_row_count_to_be_between", {"min_value": 0, "max_value": 1}, ), ], "fail_on_error": True, "critical_functions": None, 
"max_percentage_failure": None, }, { "spec_id": "dq_failure_error_disabled", "dq_type": "validator", "dq_functions": [ DQFunctionSpec("expect_column_to_exist", {"column": "article"}), DQFunctionSpec( "expect_table_row_count_to_be_between", {"min_value": 0, "max_value": 1}, ), ], "fail_on_error": False, "critical_functions": None, "max_percentage_failure": None, }, { "spec_id": "dq_failure_critical_functions", "dq_type": "validator", "dq_functions": [ DQFunctionSpec("expect_column_to_exist", {"column": "article"}), ], "fail_on_error": False, "critical_functions": [ DQFunctionSpec( "expect_table_row_count_to_be_between", { "min_value": 0, "max_value": 1, }, ), ], "max_percentage_failure": None, }, { "spec_id": "dq_failure_max_percentage", "dq_type": "validator", "dq_functions": [ DQFunctionSpec("expect_column_to_exist", {"column": "article"}), ], "fail_on_error": False, "critical_functions": [ DQFunctionSpec( "expect_table_row_count_to_be_between", { "min_value": 0, "max_value": 1, }, ), ], "max_percentage_failure": 0.2, }, { "spec_id": "dq_success", "dq_type": "prisma", "dq_db_table": "test_db.dq_functions_source_dq_success", "dq_table_table_filter": "dummy_sales", "data_product_name": "dq_success", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], }, { "spec_id": "dq_failure_error_disabled", "dq_type": "prisma", "fail_on_error": False, "dq_db_table": None, "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args": { "min_value": 0, "max_value": 1, "meta": { "dq_rule_id": "rule_1", "execution_point": "in_motion", "schema": "test_db", "table": "dummy_sales", "column": "", "dimension": "", "filters": "", }, }, }, { "function": "expect_table_column_count_to_be_between", "args": { "min_value": 0, "max_value": 50, "meta": { "dq_rule_id": "rule_2", "execution_point": "in_motion", "schema": "test_db", "table": "dummy_sales", "column": "", "dimension": "", "filters": "", }, }, }, ], "critical_functions": [], "data_product_name": 
"dq_failure_error_disabled", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "max_percentage_failure": None, }, ], ) def test_validator_dq_spec(scenario: dict, caplog: Any) -> None: """Test the data quality process using DQSpec. Data Quality Functions tested using validator: - dq_success: it tests two expectations and both are succeeded. - dq_failure: it tests two expectations and one of them fails, raising an exception in the DQ process. - dq_failure_error_disabled: it tests one expectation and it fails, but no exception is raised, because the fail_on_error is set to false. - dq_failure_critical_functions: it tests two expectations where one fails, since the one that fails is part of the "critical_functions" an exception is raised. - dq_failure_max_percentage: it tests two expectations where one fails, since the "max_percentage_failure" variable is not respected, an exception is thrown. - dq_success: it tests two expectations defined using prisma and both succeed. - dq_failure_error_disabled: it tests one expectation defined in prisma, by manually defining the functions in the acon, and it fails, but no exception is raised, because the fail_on_error is set to false. Args: scenario: scenario to test. caplog: captured log. 
""" LocalStorage.copy_file( f"{TEST_RESOURCES}/validator/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/{scenario['dq_type']}/{scenario['spec_id']}/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/validator/data/control/data_validator.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario['dq_type']}/{scenario['spec_id']}/data/", ) input_data = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_IN}/{scenario['dq_type']}/{scenario['spec_id']}/data", file_format="csv", options={"header": True, "delimiter": "|", "inferSchema": True}, ) location = TEST_LAKEHOUSE_OUT.replace("file://", "") if scenario["dq_type"] == DQType.PRISMA.value: if scenario["dq_db_table"]: _create_dq_functions_source_table( test_resources_path=TEST_RESOURCES, lakehouse_in_path=TEST_LAKEHOUSE_IN, lakehouse_out_path=TEST_LAKEHOUSE_OUT, test_name="validator", scenario=scenario["spec_id"], table_name=scenario["dq_db_table"], ) dq_functions = PrismaUtils.build_prisma_dq_spec( scenario, DQExecutionPoint.AT_REST.value, )["dq_functions"] else: dq_functions = scenario["dq_functions"] dq_spec = DQSpec( spec_id=scenario["spec_id"], input_id="sales_orders", dq_type=scenario["dq_type"], dq_db_table=scenario["dq_db_table"], store_backend="file_system", local_fs_root_dir=f"{location}/{scenario['dq_type']}/" f"{scenario['spec_id']}/", result_sink_format="json", result_sink_explode=False, processed_keys_location=f"{TEST_LAKEHOUSE_OUT}/{scenario['dq_type']}/" f"{scenario['spec_id']}/processed_keys", dq_functions=[ DQFunctionSpec( function=dq_function["function"], args=dq_function["args"] ) for dq_function in dq_functions ], unexpected_rows_pk=scenario["unexpected_rows_pk"], result_sink_location=f"{TEST_LAKEHOUSE_OUT}/{scenario['dq_type']}/" f"{scenario['spec_id']}/data", fail_on_error=scenario["fail_on_error"], max_percentage_failure=scenario["max_percentage_failure"], ) else: dq_spec = DQSpec( spec_id=scenario["spec_id"], input_id="sales_orders", dq_type=scenario["dq_type"], store_backend="file_system", 
local_fs_root_dir=f"{location}/{scenario['dq_type']}/" f"{scenario['spec_id']}/", result_sink_format="json", result_sink_explode=False, unexpected_rows_pk=[ "salesorder", "item", "date", "customer", ], dq_functions=scenario["dq_functions"], result_sink_location=f"{TEST_LAKEHOUSE_OUT}/{scenario['dq_type']}/" f"{scenario['spec_id']}/data", fail_on_error=scenario["fail_on_error"], critical_functions=scenario["critical_functions"], max_percentage_failure=scenario["max_percentage_failure"], ) if scenario["spec_id"] == "dq_failure": with pytest.raises(DQValidationsFailedException) as ex: DQFactory.run_dq_process(dq_spec, input_data) assert "Data Quality Validations Failed!" in str(ex.value) elif scenario["spec_id"] == "dq_failure_critical_functions": if scenario["dq_type"] != DQType.PRISMA.value: with pytest.raises(DQValidationsFailedException) as ex: DQFactory.run_dq_process(dq_spec, input_data) assert ( "Data Quality Validations Failed, the following critical expectations " "failed: ['expect_table_row_count_to_be_between']." in str(ex.value) ) else: DQFactory.run_dq_process(dq_spec, input_data) elif scenario["spec_id"] == "dq_failure_max_percentage": with pytest.raises(DQValidationsFailedException) as ex: DQFactory.run_dq_process(dq_spec, input_data) assert "Max error threshold is being surpassed!" in str(ex.value) else: DQFactory.run_dq_process(dq_spec, input_data) result_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario['dq_type']}/" f"{scenario['spec_id']}/data", file_format="json", ) if scenario["spec_id"] == "dq_failure_error_disabled": assert ( "1 out of 2 Data Quality Expectation(s) have failed! 
def _chunked_sink_length_rule(column: str, max_value: int, rule_id: str) -> dict:
    """Build one `expect_column_value_lengths_to_be_between` dq function.

    Args:
        column: column whose value lengths are validated.
        max_value: maximum accepted length (the minimum is always 0).
        rule_id: value stored in the `dq_rule_id` meta attribute.

    Returns:
        the dq function definition, including the prisma meta attributes.
    """
    return {
        "function": "expect_column_value_lengths_to_be_between",
        "args": {
            "column": column,
            "min_value": 0,
            "max_value": max_value,
            "meta": {
                "dq_rule_id": rule_id,
                "execution_point": "in_motion",
                "schema": "test_db",
                "table": "dummy_data",
                "column": "",
                "dimension": "",
                "filters": "",
            },
        },
    }


@pytest.mark.parametrize(
    "scenario",
    [
        {
            "result": "success",
            "tag_source_data": False,
            "num_chunks": 2,
            "num_rows": 10,
            "dq_functions": [
                _chunked_sink_length_rule("id", 5, "rule_2"),
                _chunked_sink_length_rule("static_column", 5, "rule_3"),
            ],
        },
        {
            "result": "failure",
            "tag_source_data": False,
            "num_chunks": 20,
            "num_rows": 15,
            "dq_functions": [
                _chunked_sink_length_rule("id", 1, "rule_2"),
                _chunked_sink_length_rule("static_column", 1, "rule_3"),
            ],
        },
        {
            "result": "success",
            "tag_source_data": True,
            "num_chunks": 6,
            "num_rows": 15,
            "dq_functions": [
                _chunked_sink_length_rule("id", 1, "rule_2"),
                _chunked_sink_length_rule("static_column", 20, "rule_3"),
            ],
        },
    ],
)
def test_chunked_result_sink(scenario: dict, caplog: Any) -> None:
    """Check that the DQ result sink is written in chunks.

    Scenario 0: two expectations, both successful.
    Scenario 1: two expectations, both failing.
    Scenario 2: two expectations, one failing and one passing, with source
        data tagging enabled while multiple chunks exist.

    Args:
        scenario: scenario to test.
        caplog: captured log.
    """
    data_product_out = f"{LAKEHOUSE_FEATURE_OUT}/test_dp"
    LocalStorage.clean_folder(f"{data_product_out}/")

    # One row per id; the second column carries the same value on every row.
    rows = [(row_id, True) for row_id in range(scenario["num_rows"])]
    source_df = DataframeHelpers.create_dataframe(
        data=rows,
        schema=StructType(
            [
                StructField("id", IntegerType(), False),
                StructField("static_column", StringType(), False),
            ]
        ),
    )

    input_specs = [
        {
            "spec_id": "test_in",
            "read_type": "batch",
            "data_format": "dataframe",
            "df_name": source_df,
        },
    ]
    dq_specs = [
        {
            "spec_id": "test_dq",
            "input_id": "test_in",
            "dq_type": DQType.PRISMA.value,
            "store_backend": "file_system",
            "local_fs_root_dir": f"{TEST_LAKEHOUSE_OUT}/chunked_result_sink/",
            "result_sink_format": "json",
            "data_product_name": "test_dp",
            "unexpected_rows_pk": ["id", "static_column"],
            "result_sink_chunk_size": 1,
            "dq_functions": scenario["dq_functions"],
            "tag_source_data": scenario["tag_source_data"],
        }
    ]
    output_specs = [
        {
            "spec_id": "test_out",
            "input_id": "test_dq",
            "data_format": "dataframe",
            "write_type": "overwrite",
        }
    ]
    loaded = load_data(
        acon={
            "input_specs": input_specs,
            "dq_specs": dq_specs,
            "output_specs": output_specs,
        }
    )
    result_df = loaded["test_out"]

    sink_df = DataframeHelpers.read_from_file(
        location=f"{data_product_out}/result_sink/", file_format="json"
    )
    assert sink_df.count() == scenario["num_chunks"]

    keys_df = DataframeHelpers.read_from_file(
        location=f"{data_product_out}/dq_processed_keys/",
        file_format="json",
    )
    assert keys_df.count() == scenario["num_rows"]

    if scenario["result"] == "failure":
        assert (
            "2 out of 2 Data Quality Expectation(s) have failed! Failed Expectations"
            in caplog.text
        )

    if scenario["tag_source_data"]:
        per_validation = result_df.groupBy("dq_validations").count()
        assert per_validation.count() == 2
        for group in per_validation.collect():
            # Rows with failure details are the failing ones (5); the rest pass (10).
            expected = 5 if group.dq_validations.dq_failure_details else 10
            assert group["count"] == expected
def _prepare_validation_df(df: DataFrame) -> DataFrame:
    """Normalise the `dq_validations` column so dataframes can be compared.

    Digits are stripped from `run_name`, the `"batch_id":...,` fragment is
    removed from the `kwargs` of every `dq_failure_details` entry, and the
    resulting details array is sorted.

    Args:
        df: dataframe to transform.

    Returns:
        the transformed dataframe.
    """

    def _drop_batch_id(detail):  # type: ignore[no-untyped-def]
        """Remove the batch id fragment from one failure detail entry."""
        return detail.withField(
            "kwargs",
            regexp_replace(
                detail.kwargs,
                '"batch_id":.*?,',
                "",
            ),
        )

    run_name_without_digits = regexp_replace(
        col("dq_validations.run_name"), "[0-9]", ""
    )
    sorted_failure_details = array_sort(
        transform("dq_validations.dq_failure_details", _drop_batch_id),
    )

    normalised = col("dq_validations").withField("run_name", run_name_without_digits)
    normalised = normalised.withField("dq_failure_details", sorted_failure_details)
    return df.withColumn("dq_validations", normalised)
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "spec_id": "spec_without_duplicate",
            "name": "table_batch_dq_rule",
            "dq_type": "prisma",
            "read_type": "batch",
            "input_type": "file_reader",
            "dq_table_table_filter": "dummy_sales",
            "dq_validator_result": "success",
            "restore_prev_version": False,
            "fail_on_error": False,
            "critical_functions": None,
            "dq_db_table": "test_db.dq_table_rule_id_success",
            "max_percentage_failure": None,
        },
        {
            "spec_id": "spec_with_duplicate",
            "name": "table_batch_dq_rule",
            "dq_type": "prisma",
            "read_type": "batch",
            "input_type": "file_reader",
            "dq_table_table_filter": "dummy_sales",
            "dq_validator_result": "failed",
            "restore_prev_version": False,
            "fail_on_error": False,
            "critical_functions": None,
            "dq_db_table": "test_db.dq_table_rule_id_failure",
            "max_percentage_failure": None,
        },
        {
            "spec_id": "streaming_spec_without_duplicate",
            "name": "table_streaming_dq_rule",
            "dq_type": "prisma",
            "read_type": "streaming",
            "input_type": "file_reader",
            "dq_table_table_filter": "dummy_sales",
            "dq_validator_result": "success",
            "restore_prev_version": False,
            "fail_on_error": False,
            "critical_functions": None,
            "dq_db_table": "test_db.dq_table_rule_id_success",
            "max_percentage_failure": None,
        },
        {
            "spec_id": "streaming_spec_with_duplicate",
            "name": "table_streaming_dq_rule",
            "dq_type": "prisma",
            "read_type": "streaming",
            "input_type": "file_reader",
            "dq_table_table_filter": "dummy_sales",
            "dq_validator_result": "failed",
            "restore_prev_version": False,
            "fail_on_error": False,
            "critical_functions": None,
            "dq_db_table": "test_db.dq_table_rule_id_failure",
            "max_percentage_failure": None,
        },
    ],
)
def test_dq_rule_id_uniqueness(scenario: dict, caplog: Any) -> None:
    """Test the detection of duplicated dq_rule_id values.

    Dq_rule_id scenarios:
    - scenario 1: using the file reader in batch mode to check the
    dq_db_table for duplicated dq_rule_id values. This scenario does not
    have duplicates.
    - scenario 2: using the file reader in batch mode to check the
    dq_db_table for duplicated dq_rule_id values. In this scenario,
    duplicates are found in rule_3 and rule_4.
    - scenario 3: using the file reader in streaming mode to check the
    dq_db_table for duplicated dq_rule_id values. This scenario does not
    have duplicates.
    - scenario 4: using the file reader in streaming mode to check the
    dq_db_table for duplicated dq_rule_id values. In this scenario,
    duplicates are found in rule_3 and rule_5.

    Args:
        scenario: scenario to test.
        caplog: captured log.
    """
    _clean_folders()
    _create_table("dq_sales")
    _execute_load(scenario["read_type"])

    input_spec = {
        "spec_id": "sales_source",
        "data_format": "delta",
        "read_type": scenario["read_type"],
        "location": f"{TEST_LAKEHOUSE_OUT}/data/",
    }

    _create_dq_functions_source_table(
        test_resources_path=TEST_RESOURCES,
        lakehouse_in_path=TEST_LAKEHOUSE_IN,
        lakehouse_out_path=TEST_LAKEHOUSE_OUT,
        test_name=scenario["name"],
        scenario=scenario["read_type"],
        table_name=scenario["dq_db_table"],
    )

    acon = _generate_acon(
        input_spec, scenario, scenario.get("dq_type", DQType.VALIDATOR.value)
    )

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/*",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )

    # Rule ids that must be reported as duplicated, when a failure is expected.
    expected_duplicates = None
    if scenario["dq_validator_result"] == "failed":
        if "batch" in scenario["name"]:
            expected_duplicates = ("rule_3", "rule_4")
        elif "streaming" in scenario["name"]:
            expected_duplicates = ("rule_3", "rule_5")

    if expected_duplicates:
        with pytest.raises(DQDuplicateRuleIdException) as error:
            execute_dq_validation(acon=acon)

        message = error.value.args[0]
        # `"rule_3" and "rule_4" in message` would only test the last operand,
        # so each expected rule id is asserted on its own.
        for rule_id in expected_duplicates:
            assert rule_id in message, f"{rule_id} not reported in: {message}"
        _LOGGER.critical(message)
    else:
        execute_dq_validation(acon=acon)
        assert "A duplicate dq_rule_id was found!!!" not in caplog.text
"fail_on_error": True, "critical_functions": [ { "function": "expect_table_row_count_to_be_between", "args": { "min_value": 3, "max_value": 11, "meta": {"notes": "Test notes"}, }, } ], "max_percentage_failure": None, }, { "name": "streaming_failure_critical_markdown", "read_type": "streaming", "input_type": "file_reader", "dq_validator_result": "failure", "restore_prev_version": True, "fail_on_error": True, "critical_functions": [ { "function": "expect_table_row_count_to_be_between", "args": { "min_value": 3, "max_value": 11, "meta": { "notes": {"format": "markdown", "content": "**Test Notes**"} }, }, } ], "max_percentage_failure": None, }, { "name": "streaming_failure_percentage", "read_type": "streaming", "input_type": "file_reader", "dq_validator_result": "failure", "restore_prev_version": True, "fail_on_error": True, "critical_functions": None, "max_percentage_failure": 0.2, }, { "name": "table_batch_success", "dq_type": "prisma", "read_type": "batch", "input_type": "file_reader", "dq_validator_result": "success_explode", "restore_prev_version": False, "fail_on_error": False, "critical_functions": None, "dq_db_table": "test_db.dq_functions_source_table_success", "max_percentage_failure": None, }, { "name": "table_batch_failure_disabled", "dq_type": "prisma", "read_type": "batch", "input_type": "file_reader", "dq_validator_result": "success_explode_disabled", "restore_prev_version": False, "fail_on_error": False, "critical_functions": None, "dq_db_table": "test_db.dq_functions_source_table_failure", "max_percentage_failure": None, }, { "name": "table_streaming_success", "dq_type": "prisma", "read_type": "streaming", "input_type": "file_reader", "dq_validator_result": "success_explode", "restore_prev_version": False, "fail_on_error": False, "critical_functions": None, "dq_db_table": "test_db.dq_functions_source_table_success", "max_percentage_failure": None, }, { "name": "table_streaming_failure_disabled", "dq_type": "prisma", "read_type": "streaming", 
"input_type": "file_reader", "dq_validator_result": "success_explode_disabled", "restore_prev_version": False, "fail_on_error": False, "critical_functions": None, "dq_db_table": "test_db.dq_functions_source_table_failure", "max_percentage_failure": None, }, { "name": "table_batch_dataframe_success", "dq_type": "prisma", "read_type": "batch", "input_type": "dataframe_reader", "dq_validator_result": "success_explode", "restore_prev_version": False, "fail_on_error": False, "critical_functions": None, "dq_db_table": "test_db.dq_functions_source_table_success", "max_percentage_failure": None, }, { "name": "table_batch_dataframe_failure_disabled", "dq_type": "prisma", "read_type": "streaming", "input_type": "dataframe_reader", "dq_validator_result": "success_explode_disabled", "restore_prev_version": False, "fail_on_error": False, "critical_functions": None, "dq_db_table": "test_db.dq_functions_source_table_failure", "max_percentage_failure": None, }, ], ) def test_dq_validator(scenario: dict, caplog: Any) -> None: """Test the Data Quality Validator algorithm with DQ Type Validator. Data Quality Validator scenarios: - scenario 1: test DQ Validator having a generated dataframe as input that passes all the expectations defined. - scenario 2: test DQ Validator, reading a generated dataframe as stream that fails one of the expectations defined. - scenario 3: test DQ Validator, reading as streaming a delta table, failing one of the expectations but not failing the complete DQ process as fail_on_error is disabled. - scenario 4: test DQ Validator, reading a delta table (batch), that fails one of the expectations defined and a previous version of the delta table is restored. - scenario 5: test DQ Validator, reading as streaming a set of files in a specific location, that fail one of the expectations defined and a previous version of the delta table is restored. 
- scenario 6: test DQ Validator, reading as streaming a set of files in a specific location, that fails one of the expectations that is defined as critical. - scenario 7: test DQ Validator, reading as streaming a set of files in a specific location, that fails one of the expectations that is defined as critical and notes in default format. - scenario 8: test DQ Validator, reading as streaming a set of files in a specific location, that fails one of the expectations that is defined as critical and notes with markdown. - scenario 9: test DQ Validator, reading as streaming a set of files in a specific location, that fails the whole expectation suite because the maximum percentage threshold is surpassed. Args: scenario: scenario to test. caplog: captured log. """ _clean_folders() if "dataframe" in scenario["input_type"]: input_spec = { "spec_id": "sales_source", "read_type": scenario["read_type"], "data_format": "dataframe", "df_name": _generate_dataframe(scenario["read_type"]), } else: _create_table("dq_sales") _execute_load(scenario["read_type"]) if "table" in scenario["input_type"]: input_spec = { "spec_id": "sales_source", "read_type": scenario["read_type"], "db_table": "test_db.dq_sales", } else: input_spec = { "spec_id": "sales_source", "data_format": "delta", "read_type": scenario["read_type"], "location": f"{TEST_LAKEHOUSE_OUT}/data/", } if "dq_db_table" in scenario.keys(): _create_dq_functions_source_table( test_resources_path=TEST_RESOURCES, lakehouse_in_path=TEST_LAKEHOUSE_IN, lakehouse_out_path=TEST_LAKEHOUSE_OUT, test_name=scenario["name"], scenario=scenario["read_type"], table_name=scenario["dq_db_table"], ) acon = _generate_acon( input_spec, scenario, scenario.get("dq_type", DQType.VALIDATOR.value) ) LocalStorage.copy_file( f"{TEST_RESOURCES}/data/control/*", f"{TEST_LAKEHOUSE_CONTROL}/data/", ) if scenario["dq_validator_result"] == "failure": with pytest.raises( (DQValidationsFailedException, StreamingQueryException), match=".*Data Quality Validations 
Failed!.*", ): execute_dq_validation(acon=acon) else: execute_dq_validation(acon=acon) if scenario["restore_prev_version"] is True: data_result_df, data_control_df = _get_result_and_control_dfs( "test_db.dq_sales", "data_restore_control", False ) assert not DataframeHelpers.has_diff(data_result_df, data_control_df) assert "Data Quality Expectation(s) have failed!" in caplog.text if scenario["dq_validator_result"] == "failure_disabled": assert ( "1 out of 3 Data Quality Expectation(s) have failed! " "Failed Expectations" in caplog.text ) dq_result_df, dq_control_df = _get_result_and_control_dfs( result=f"{LAKEHOUSE_FEATURE_OUT}/{scenario['name']}/result_sink/", control=f'dq_control_{scenario["dq_validator_result"]}', infer_schema=True, result_is_table=False, ) assert not DataframeHelpers.has_diff( dq_result_df.select("spec_id", "input_id", "success"), dq_control_df.fillna("").select("spec_id", "input_id", "success"), ) for key in dq_result_df.collect(): validation_results = loads(key.validation_results) result = ( validation_results[0] if isinstance(validation_results, list) else validation_results ) assert { "success", "expectation_config", }.issubset(result.keys()) @pytest.mark.parametrize( "scenario", [ { "name": "streaming_dataframe_two_runs", "dq_type": "prisma", "read_type": "streaming", "input_type": "dataframe_reader", "dq_validator_result": "success_explode", "dq_db_table_first_run": "test_db.dq_functions_streaming_dataframe_two_runs_first_run", # noqa: E501 "dq_db_table_second_run": "test_db.dq_functions_streaming_dataframe_two_runs_second_run", # noqa: E501 "fail_on_error": False, "critical_functions": None, "max_percentage_failure": None, "restore_prev_version": False, }, ], ) def test_dq_validator_two_runs(scenario: dict, caplog: Any) -> None: """Test the integrity of the result sink after two runs. This tests performs two runs of the Data Quality Validator with the same scenario but different dq functions source tables. 
The goal is to ensure that the result sink does not have void types and that it is able to be read without issues. This is a regression test for the case when the Data Quality Validator was writing a column with void types to the result sink, which caused issues when reading the result sink. Data Quality Validator scenarios: - scenario 1: test result sink structure by having two runs writing to the same result sink without creating an issue with void types. Args: scenario: scenario to test. caplog: captured log. """ _clean_folders() input_spec = { "spec_id": "sales_source", "read_type": scenario["read_type"], "data_format": "dataframe", "df_name": _generate_dataframe(scenario["read_type"]), } _create_dq_functions_source_table( test_resources_path=TEST_RESOURCES, lakehouse_in_path=TEST_LAKEHOUSE_IN, lakehouse_out_path=TEST_LAKEHOUSE_OUT, test_name=scenario["name"], scenario=scenario["read_type"], table_name=scenario["dq_db_table_first_run"], ) _create_dq_functions_source_table( test_resources_path=TEST_RESOURCES, lakehouse_in_path=TEST_LAKEHOUSE_IN, lakehouse_out_path=TEST_LAKEHOUSE_OUT, test_name=scenario["name"], scenario=scenario["read_type"], table_name=scenario["dq_db_table_second_run"], ) scenario["dq_db_table"] = scenario["dq_db_table_first_run"] first_acon = _generate_acon( input_spec, scenario, scenario.get("dq_type", DQType.PRISMA.value) ) scenario["dq_db_table"] = scenario["dq_db_table_second_run"] second_acon = _generate_acon( input_spec, scenario, scenario.get("dq_type", DQType.PRISMA.value) ) LocalStorage.copy_file( f"{TEST_RESOURCES}/data/control/*", f"{TEST_LAKEHOUSE_CONTROL}/data/", ) execute_dq_validation(acon=first_acon) execute_dq_validation(acon=second_acon) result_sink_path = f"{LAKEHOUSE_FEATURE_OUT}/{scenario['name']}/result_sink/" df = ExecEnv.SESSION.sql( f"""select * from delta.`{result_sink_path}`""" # nosec B608 ) try: df.show() except py4j.protocol.Py4JJavaError: pytest.fail("Failed to write to result sink due to void type in the 
def _clean_folders() -> None:
    """Clean test folders and drop the tables used by the tests."""
    for folder in (
        f"{TEST_LAKEHOUSE_IN}/data",
        f"{TEST_LAKEHOUSE_OUT}/data",
        f"{TEST_LAKEHOUSE_OUT}/checkpoint",
        f"{TEST_LAKEHOUSE_OUT}/dq",
        f"{TEST_LAKEHOUSE_OUT}/profiling",
    ):
        LocalStorage.clean_folder(folder)
    ExecEnv.SESSION.sql("DROP TABLE IF EXISTS test_db.dq_sales")
    ExecEnv.SESSION.sql("DROP TABLE IF EXISTS test_db.dq_validator")


def _create_table(table_name: str) -> None:
    """Create the delta test table in `test_db`.

    Args:
        table_name: name of the test table.
    """
    ExecEnv.SESSION.sql(
        f"""
        CREATE TABLE IF NOT EXISTS test_db.{table_name} (
            salesorder string,
            item string,
            date string,
            customer string,
            article string,
            amount string
        )
        USING delta
        LOCATION '{TEST_LAKEHOUSE_OUT}/data'
        TBLPROPERTIES(
          'lakehouse.primary_key'='salesorder, `item`, date ,`customer`',
          'delta.enableChangeDataFeed'='false'
        )
        """
    )


def _execute_load(load_type: str) -> None:
    """Load both source files, one load per file, for the scenario tests.

    Args:
        load_type: batch or streaming.
    """
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/source/part-01.csv",
        f"{TEST_LAKEHOUSE_IN}/data/",
    )
    acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{load_type}.json")
    load_data(acon=acon)

    # The second file is loaded with the same acon, producing a second write.
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/source/part-02.csv",
        f"{TEST_LAKEHOUSE_IN}/data/",
    )
    load_data(acon=acon)


def _generate_acon(
    input_spec: dict,
    scenario: dict,
    dq_type: str,
) -> dict:
    """Generate acon according to test scenario.

    Args:
        input_spec: input specification.
        scenario: the scenario being tested.
        dq_type: the type of data quality process.

    Returns:
        A dict corresponding to the generated acon.
    """
    if "dataframe" in scenario["input_type"]:
        unexpected_rows_pk: Dict[str, Union[str, List[str]]] = {
            "unexpected_rows_pk": ["salesorder", "item", "date", "customer"]
        }
    else:
        unexpected_rows_pk = {"tbl_to_derive_pk": "test_db.dq_sales"}

    # Start from an empty mapping so that a dq_type other than validator/prisma
    # still yields an acon instead of failing on an unbound local variable.
    dq_spec_add_options: Dict[str, Any] = {}
    if dq_type in (DQType.VALIDATOR.value, DQType.PRISMA.value):
        dq_spec_add_options = {
            "result_sink_location": f"{LAKEHOUSE_FEATURE_OUT}/"
            f"{scenario['name']}/result_sink/",
            "dq_db_table": scenario.get("dq_db_table"),
            "dq_table_table_filter": "dummy_sales",
            "result_sink_format": "delta",
            "fail_on_error": scenario["fail_on_error"],
            "critical_functions": scenario["critical_functions"],
            "max_percentage_failure": scenario["max_percentage_failure"],
            "result_sink_explode": False,
            "data_product_name": scenario["name"],
            "dq_functions": [
                {"function": "expect_column_to_exist", "args": {"column": "article"}},
                {
                    "function": "expect_table_row_count_to_be_between",
                    "args": {"min_value": 3, "max_value": 11},
                },
                {
                    "function": "expect_column_pair_a_to_be_smaller_or_equal_than_b",
                    "args": {"column_A": "salesorder", "column_B": "amount"},
                },
            ],
        }

    dq_spec_add_options.update(unexpected_rows_pk)

    return {
        "input_spec": input_spec,
        "dq_spec": {
            "spec_id": "dq_sales",
            "input_id": "sales_source",
            "dq_type": dq_type,
            "store_backend": "file_system",
            "local_fs_root_dir": f"{TEST_LAKEHOUSE_OUT}/dq",
            **dq_spec_add_options,
        },
        "restore_prev_version": scenario.get("restore_prev_version", False),
    }
def _get_result_and_control_dfs(
    result: str, control: str, infer_schema: bool, result_is_table: bool = True
) -> Tuple[DataFrame, DataFrame]:
    """Read the result and the control dataframes of a scenario.

    Args:
        result: table name or delta location holding the result.
        control: name (without extension) of the control csv file.
        infer_schema: whether to infer the schema of the control file or not.
        result_is_table: whether `result` is a table or a file location.

    Returns:
        the result and control dataframes.
    """
    result_df = (
        DataframeHelpers.read_from_table(result)
        if result_is_table
        else DataframeHelpers.read_from_file(
            location=result,
            file_format="delta",
        )
    )

    control_reader_options = {
        "header": True,
        "delimiter": "|",
        "inferSchema": infer_schema,
    }
    control_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/{control}.csv",
        file_format="csv",
        options=control_reader_options,
    )

    return result_df, control_df
TEST_RESOURCES = f"{FEATURE_RESOURCES}/{TEST_NAME}"
TEST_LAKEHOUSE_IN = f"{LAKEHOUSE_FEATURE_IN}/{TEST_NAME}"
TEST_LAKEHOUSE_CONTROL = f"{LAKEHOUSE_FEATURE_CONTROL}/{TEST_NAME}"
TEST_LAKEHOUSE_OUT = f"{LAKEHOUSE_FEATURE_OUT}/{TEST_NAME}"
# Captured once at import time; used to locate the year/month log folder.
TIMESTAMP = datetime.now()
YEAR = TIMESTAMP.year
MONTH = TIMESTAMP.month


def custom_transformation(df: DataFrame) -> DataFrame:
    """Sample custom transformation referenced by the test ACON.

    Args:
        df: DataFrame passed as input.

    Returns:
        DataFrame: the input with an extra literal column `new_column`.
    """
    literal_column = lit("literal")
    return df.withColumn("new_column", literal_column)


def _get_test_acon(scenario_name: str) -> dict:
    """Build the data loader ACON for the given scenario.

    Scenarios whose name contains `simple_acon` read the csv files directly;
    every other scenario reads from a dataframe and adds a custom transformer.

    Args:
        scenario_name: name of the test scenario running.

    Returns:
        dict: the ACON for the algorithm configuration.
    """
    source_location = f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/"
    # Read up front for every scenario, exactly as the dataframe input needs it.
    source_df = ExecEnv.SESSION.read.options(
        header="True", inferSchema="True", delimiter="|"
    ).csv(source_location)

    common_input: dict = {
        "spec_id": "sales_source",
        "read_type": "batch",
    }
    transformers = [
        {
            "function": "rename",
            "args": {"cols": {"salesorder": "salesorder1"}},
        }
    ]

    if "simple_acon" in scenario_name:
        input_spec = {
            **common_input,
            "data_format": "csv",
            "options": {
                "mode": "FAILFAST",
                "header": True,
                "delimiter": "|",
                "password": "dummy_password",
            },
            "location": source_location,
        }
    else:
        transformers.append(
            {
                "function": "custom_transformation",
                "args": {"custom_transformer": custom_transformation},
            }
        )
        input_spec = {**common_input, "data_format": "dataframe", "df_name": source_df}

    transform_spec = {
        "spec_id": "renamed_kpi",
        "input_id": "sales_source",
        "transformers": transformers,
    }
    output_spec = {
        "spec_id": "sales_bronze",
        "input_id": "renamed_kpi",
        "write_type": "overwrite",
        "data_format": "delta",
        "location": f"{TEST_LAKEHOUSE_OUT}/{scenario_name}/data/",
    }
    return {
        "input_specs": [input_spec],
        "transform_specs": [transform_spec],
        "output_specs": [output_spec],
        "exec_env": {"dp_name": scenario_name},
    }


@pytest.mark.parametrize("scenario", ["load_simple_acon", "load_custom_transf_and_df"])
def test_load_data(scenario: str) -> None:
    """Test the engine usage logging of the Data Loader.

    Scenarios:
        load_simple_acon: ACON reading csv files directly.
        load_custom_transf_and_df: ACON with a dataframe input and a custom
            transformer.

    Args:
        scenario: scenario to test.
    """
    scenario_resources = f"{TEST_RESOURCES}/{scenario}/data"
    LocalStorage.copy_file(
        f"{scenario_resources}/source.csv",
        f"{TEST_LAKEHOUSE_IN}/{scenario}/data/",
    )
    LocalStorage.copy_file(
        f"{scenario_resources}/control.json",
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/",
    )

    acon = _get_test_acon(scenario)
    load_data(
        acon=acon,
        spark_confs={"dp_name": "dp_name"},
        collect_engine_usage="enabled",
    )

    _prepare_and_compare_dfs(scenario)


@pytest.mark.parametrize("scenario", ["table_manager"])
def test_table_manager(scenario: str) -> None:
    """Test the engine usage logging of the Table Manager.

    Scenarios:
        table_manager: table_manager logging behaviour

    Args:
        scenario: scenario to test.
    """
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario}/data/control.json",
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/",
    )

    manage_table(
        acon={
            "function": "execute_sql",
            "sql": "select 1",
            "exec_env": {"dp_name": scenario},
        },
        spark_confs={"dp_name": "dp_name"},
        collect_engine_usage="enabled",
    )

    _prepare_and_compare_dfs(scenario)
def _prepare_and_compare_dfs(scenario: str) -> None:
    """Compare the engine usage log of a scenario against its control data.

    Args:
        scenario: Scenario to load dataframes to compare.
    """
    expected_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data",
        "json",
        options={"inferSchema": True},
    )

    logs_dir = f"{LAKEHOUSE_FEATURE_LOGS}/{scenario}/{YEAR}/{MONTH}/"
    # NOTE(review): os.listdir returns entries in arbitrary order, so the last
    # listed entry is only well defined while the folder holds a single file.
    last_listed = os.listdir(logs_dir)[-1]
    usage_df = DataframeHelpers.read_from_file(f"{logs_dir}{last_listed}", "json")

    assert usage_df.columns == expected_df.columns

    logged_start = usage_df.select("start_timestamp").first()[0]
    control_start = expected_df.select("start_timestamp").first()[0]
    assert logged_start >= control_start

    assert _prepare_df_comparison(usage_df) == _prepare_df_comparison(expected_df)
""" extra_params = { "changelog_table": f"{DB_TABLE}_cl", "test_name": "extract_cl_dso", "adso_type": "CL", } LOGGER.info(f"Starting Scenario {scenario['scenario_name']}") _prepare_files(scenario["scenario_name"], extra_params) _load_test_table("rspmrequest", scenario["scenario_name"], extra_params) _execute_and_validate(scenario, extra_params) def _execute_and_validate(scenario: dict, extra_params: dict) -> None: """Helper function to reuse for triggering the load data and validation of results. Args: scenario: scenario being tested. extra_params: extra params for the scenario being tested. """ _execute_load(scenario=scenario, extraction_type="init", extra_params=extra_params) _execute_load( scenario=scenario, extraction_type="delta", iteration=1, extra_params=extra_params, ) _execute_load( scenario=scenario, extraction_type="delta", iteration=2, extra_params=extra_params, ) _validate( scenario["scenario_name"], extra_params, scenario["min_timestamp"] is not None, ) def _execute_load( scenario: dict, extra_params: dict, extraction_type: str, iteration: int = None, ) -> None: """Helper function to reuse for loading the data for the scenario tests. Args: scenario: scenario being tested. extra_params: extra params for the scenario being tested. extraction_type: type of extraction (delta or init). iteration: number of the iteration, in case it is to test a delta. """ write_type = "overwrite" if extraction_type == "init" else "append" _load_test_table( extra_params["changelog_table"] if extraction_type != "init" else DB_TABLE, scenario["scenario_name"], extra_params, iteration, ) # if it is an init, we need to provide an extraction_timestamp, otherwise the # current time would be used and data would be filtered accordingly. 
def _get_test_acon(
    extraction_type: str,
    write_type: str,
    scenario: dict,
    extra_params: dict,
    extraction_timestamp: str = None,
) -> dict:
    """Build the SAP B4 extraction ACON used by the scenario tests.

    Args:
        extraction_type: type of extraction (delta or init).
        write_type: the spark write type to be used.
        scenario: the scenario being tested.
        extra_params: extra params for the scenario being tested.
        extraction_timestamp: timestamp of the extraction. For local tests we
            specify it in the init, otherwise would be calculated and tests would
            fail.

    Returns:
        dict: the ACON for the algorithm configuration.
    """
    scenario_name = scenario["scenario_name"]
    test_name = extra_params["test_name"]
    jdbc_url = f"jdbc:sqlite:{TEST_LAKEHOUSE_IN}/{scenario_name}/{test_name}/tests.db"
    # The same location is written by the output spec and read back to find
    # the latest extracted timestamp.
    data_location = f"file:///{TEST_LAKEHOUSE_OUT}/{scenario_name}/{test_name}/data"

    reader_options = {
        "driver": "org.sqlite.JDBC",
        "user": "dummy_user",
        "password": "dummy_pwd",
        "url": jdbc_url,
        "dbtable": DB_TABLE,
        "data_target": "dummy_table",
        "act_req_join_condition": scenario["act_req_join_condition"],
        "changelog_table": extra_params["changelog_table"],
        "customSchema": "reqtsn DECIMAL(23,0), datapakid STRING, "
        "record INTEGER, extraction_start_timestamp DECIMAL(15,0)",
        "request_status_tbl": "rspmrequest",
        "extra_cols_req_status_tbl": scenario["extra_cols_req_status_tbl"],
        "latest_timestamp_data_location": data_location,
        "extraction_type": extraction_type,
        "numPartitions": 2,
        "partitionColumn": scenario["part_col"],
        "lowerBound": scenario["lower_bound"],
        "upperBound": scenario["upper_bound"],
        "default_upper_bound": scenario.get("default_upper_bound", "Null"),
        "extraction_timestamp": extraction_timestamp,
        "min_timestamp": scenario["min_timestamp"],
        "predicates": scenario["predicates_list"],
        "adso_type": extra_params["adso_type"],
    }

    input_spec = {
        "spec_id": "sales_source",
        "read_type": "batch",
        "data_format": "sap_b4",
        "calculate_upper_bound": scenario["calculate_upper_bound"],
        "calc_upper_bound_schema": scenario["calculate_upper_bound_schema"],
        "generate_predicates": scenario["generate_predicates"],
        "options": reader_options,
    }

    output_spec = {
        "spec_id": "sales_bronze",
        "input_id": "sales_source",
        "write_type": write_type,
        "data_format": "delta",
        "partitions": ["reqtsn"],
        "location": data_location,
    }

    # Schema auto merge is only switched on when extra columns are requested.
    auto_merge = bool(scenario["extra_cols_req_status_tbl"])

    return {
        "input_specs": [input_spec],
        "output_specs": [output_spec],
        "exec_env": {"spark.databricks.delta.schema.autoMerge.enabled": auto_merge},
    }
def _validate(scenario: str, extra_params: dict, min_timestamp: bool) -> None:
    """Assert that the extracted delta data matches the control data.

    Args:
        scenario: the scenario being tested.
        extra_params: extra params for the scenario being tested.
        min_timestamp: whether the min_timestamp is provided or not.
    """
    test_name = extra_params["test_name"]
    control_root = f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/{test_name}"

    expected_df = DataframeHelpers.read_from_file(
        location=f"{control_root}/data",
        schema=SchemaUtils.from_file_to_dict(
            f"file://{control_root}/dummy_table_schema.json"
        ),
        options={"header": True, "delimiter": "|", "dateFormat": "yyyyMMdd"},
    )

    # Only the columns present in the control data take part in the comparison.
    actual_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/{test_name}/data",
        file_format=OutputFormat.DELTAFILES.value,
    ).select(expected_df.columns)

    if min_timestamp:
        # when we fill the min_timestamp, it means it can either skip or
        # re-extract things, depending on the timestamp provided. In our scenario
        # it is expected to re-extract, causing duplicates, thus if we remove the
        # duplicates we expect to match the non-duplicated control dataframe
        actual_df = actual_df.drop_duplicates()

    assert not DataframeHelpers.has_diff(expected_df, actual_df)
f"{LAKEHOUSE_FEATURE_IN}/{TEST_PATH}" TEST_LAKEHOUSE_CONTROL = f"{LAKEHOUSE_FEATURE_CONTROL}/{TEST_PATH}" TEST_LAKEHOUSE_OUT = f"{LAKEHOUSE_FEATURE_OUT}/{TEST_PATH}" LOGGER = LoggingHandler(__name__).get_logger() DB_TABLE = "dummy_table" """Scenario - Description: no_part_col_no_lower_and_upper_bound_extra_cols - no strategy to split the extraction. Moreover, test adding single extra column from the activation requests table. int_part_col_provide_upper_bound_&_min_timestamp - partition column of type int, manually provided upper_bound to parallelize the extraction. Moreover, it provides the min_timestamp to use to get the data from the changelog in the delta extraction after the init, which mimics the possible situation, in which people might need to provide a specific timestamp for backfilling, instead of deriving it from an existing location. int_part_col_generate_predicates_multi_extra_cols - partition column of type int to automatically generate predicates and parallelize the extraction. Moreover, test adding multiple extra columns from the activation requests table. str_part_col_generate_predicates - partition column of type str to automatically generate predicates and parallelize the extraction. str_part_col_predicates_list - partition column of type str, manually provided predicates list to parallelize the extraction. date_part_col_calculate_upper_bound - partition column of type date to automatically calculate the upper_bound and parallelize the extraction. timestamp_part_col_calculate_upper_bound - partition column of type timestamp to automatically calculate the upper_bound and parallelize the extraction from. init_timestamp_from_actrequest - get the init timestamp from act_request table instead of assuming a given timestamp. fail_calc_upper_bound - empty partition of type date to force failure on the upper bound calculation. no_part_col_join_condition - no strategy to split the extraction. Test to validate custom join condition on activation table. 
""" TEST_SCENARIOS = [ { "scenario_name": "no_part_col_no_lower_and_upper_bound_extra_cols", "calculate_upper_bound": False, "calculate_upper_bound_schema": None, "part_col": None, "lower_bound": None, "upper_bound": None, "min_timestamp": None, "generate_predicates": False, "predicates_list": None, "extra_cols_act_request": "act_req.request as activation_request", "act_req_join_condition": None, }, { "scenario_name": "int_part_col_provide_upper_bound_&_min_timestamp", "calculate_upper_bound": False, "calculate_upper_bound_schema": "upper_bound int", "part_col": "item", "lower_bound": 1, "upper_bound": 3, "min_timestamp": "20211004151010", "generate_predicates": False, "predicates_list": None, "extra_cols_act_request": None, "act_req_join_condition": None, }, { "scenario_name": "int_part_col_generate_predicates_multi_extra_cols", "calculate_upper_bound": False, "calculate_upper_bound_schema": None, "part_col": "item", "lower_bound": None, "upper_bound": None, "min_timestamp": None, "generate_predicates": True, "predicates_list": None, "extra_cols_act_request": "act_req.request as actrequest_request, status", "act_req_join_condition": None, }, { "scenario_name": "str_part_col_generate_predicates", "calculate_upper_bound": False, "calculate_upper_bound_schema": None, "part_col": '"/bic/article"', "lower_bound": None, "upper_bound": None, "min_timestamp": None, "generate_predicates": True, "predicates_list": None, "extra_cols_act_request": None, "act_req_join_condition": None, }, { "scenario_name": "str_part_col_predicates_list", "calculate_upper_bound": False, "calculate_upper_bound_schema": None, "part_col": None, "lower_bound": None, "upper_bound": None, "min_timestamp": None, "generate_predicates": False, "predicates_list": [ "\"/bic/article\"='article1'", "\"/bic/article\"='article2'", "\"/bic/article\"='article3'", "\"/bic/article\"='article4'", "\"/bic/article\"='article5'", "\"/bic/article\"='article6'", "\"/bic/article\"='article7'", 
"\"/bic/article\"='article33'", "\"/bic/article\"='article60'", '"/bic/article" IS NULL', ], "extra_cols_act_request": None, "act_req_join_condition": None, }, { "scenario_name": "date_part_col_calculate_upper_bound", "calculate_upper_bound": True, "calculate_upper_bound_schema": "upper_bound date", "part_col": "date", "lower_bound": "2000-01-01", "upper_bound": None, "min_timestamp": None, "generate_predicates": False, "predicates_list": None, "extra_cols_act_request": None, "act_req_join_condition": None, }, { "scenario_name": "timestamp_part_col_calculate_upper_bound", "calculate_upper_bound": True, "calculate_upper_bound_schema": "upper_bound timestamp", "part_col": "time", "lower_bound": "2000-01-01 01:01:01.000", "upper_bound": None, "min_timestamp": None, "generate_predicates": False, "predicates_list": None, "extra_cols_act_request": None, "act_req_join_condition": None, }, { "scenario_name": "init_timestamp_from_actrequest", "calculate_upper_bound": True, "calculate_upper_bound_schema": "upper_bound timestamp", "part_col": "time", "lower_bound": "2000-01-01 01:01:01.000", "upper_bound": None, "min_timestamp": None, "generate_predicates": False, "predicates_list": None, "extra_cols_act_request": None, "get_timestamp_from_act_request": True, "act_req_join_condition": None, }, { "scenario_name": "fail_calc_upper_bound", "calculate_upper_bound": True, "calculate_upper_bound_schema": "upper_bound date", "part_col": "order_date", "lower_bound": "2000-01-01", "upper_bound": None, "min_timestamp": None, "generate_predicates": False, "predicates_list": None, "extra_cols_act_request": None, "act_req_join_condition": None, }, { "scenario_name": "no_part_col_join_condition", "calculate_upper_bound": False, "calculate_upper_bound_schema": None, "part_col": None, "lower_bound": None, "upper_bound": None, "min_timestamp": None, "generate_predicates": False, "predicates_list": None, "extra_cols_act_request": None, "act_req_join_condition": "changelog_tbl.request = 
act_req.actrequest " "AND changelog_tbl.request = act_req.request", }, ] @pytest.mark.parametrize("scenario", TEST_SCENARIOS) def test_extract_dso(scenario: dict, caplog: LogCaptureFixture) -> None: """Test the extraction from SAP BW DSO. Args: scenario: scenario to test. caplog: fixture to capture console logs. """ extra_params = { "request_col_name": "actrequest", "changelog_table": f"{DB_TABLE}_cl", "test_name": "extract_dso", "include_changelog_tech_cols": True, } LOGGER.info(f"Starting Scenario {scenario['scenario_name']}") _prepare_files(scenario["scenario_name"], extra_params) _load_test_table("rsodsactreq", scenario["scenario_name"], extra_params) _execute_and_validate("extract_dso", scenario, extra_params, caplog) @pytest.mark.parametrize("scenario", TEST_SCENARIOS) def test_extract_write_optimised_dso(scenario: dict, caplog: LogCaptureFixture) -> None: """Test the extraction from SAP BW Write Optimised DSO. Args: scenario: scenario to test. caplog: fixture to capture console logs. """ extra_params = { "request_col_name": "request", "changelog_table": DB_TABLE, "test_name": "extract_write_optimised_dso", "include_changelog_tech_cols": False, } LOGGER.info(f"Starting Scenario {scenario['scenario_name']}") _prepare_files(scenario["scenario_name"], extra_params) _load_test_table("rsodsactreq", scenario["scenario_name"], extra_params) _execute_and_validate("extract_wodso", scenario, extra_params, caplog) def _execute_and_validate( test_name: str, scenario: dict, extra_params: dict, caplog: LogCaptureFixture ) -> None: """Helper function to reuse for trigger loading data and validation of results. Args: test_name: test being executed (for dso or wodso). scenario: scenario being tested. extra_params: extra params for the scenario being tested. caplog: fixture to capture console logs. 
""" if scenario["scenario_name"] == "fail_calc_upper_bound": with pytest.raises(AttributeError, match="Not able to calculate upper bound"): _execute_load( scenario=scenario, extraction_type="init", extra_params=extra_params ) elif test_name == "extract_dso" and "from_actrequest" in scenario["scenario_name"]: with pytest.raises( AttributeError, match="Not able to get the extraction query" ): _execute_load( scenario=scenario, extraction_type="init", extra_params=extra_params ) else: _execute_load( scenario=scenario, extraction_type="init", extra_params=extra_params ) changelog_table = extra_params["changelog_table"] assert f"The changelog table derived is: '{changelog_table}'" in caplog.text _execute_load( scenario=scenario, extraction_type="delta", iteration=1, extra_params=extra_params, ) _execute_load( scenario=scenario, extraction_type="delta", iteration=2, extra_params=extra_params, ) _validate( scenario["scenario_name"], extra_params, scenario["min_timestamp"] is not None, ) def _execute_load( scenario: dict, extra_params: dict, extraction_type: str, iteration: int = None, ) -> None: """Helper function to reuse for loading the data for the scenario tests. Args: scenario: scenario being tested. extra_params: extra params for the scenario being tested. extraction_type: type of extraction (delta or init). iteration: number of the iteration, in case it is to test a delta. """ write_type = "overwrite" if extraction_type == "init" else "append" _load_test_table( DB_TABLE if extraction_type == "init" else extra_params["changelog_table"], scenario["scenario_name"], extra_params, iteration, ) # if it is an init, we need to provide an extraction_timestamp, otherwise the # current time would be used and data would be filtered accordingly. 
def _get_test_acon(
    extraction_type: str,
    write_type: str,
    scenario: dict,
    extra_params: dict,
    extraction_timestamp: str = None,
) -> dict:
    """Build the SAP BW extraction ACON used by the scenario tests.

    Args:
        extraction_type: type of extraction (delta or init).
        write_type: the spark write type to be used.
        scenario: the scenario being tested.
        extra_params: extra params for the scenario being tested.
        extraction_timestamp: timestamp of the extraction. For local tests we
            specify it in the init, otherwise would be calculated and tests would
            fail.

    Returns:
        dict: the ACON for the algorithm configuration.
    """
    scenario_name = scenario["scenario_name"]
    test_name = extra_params["test_name"]
    jdbc_url = f"jdbc:sqlite:{TEST_LAKEHOUSE_IN}/{scenario_name}/{test_name}/tests.db"
    # The same location is written by the output spec and read back to find
    # the latest extracted timestamp.
    data_location = f"file:///{TEST_LAKEHOUSE_OUT}/{scenario_name}/{test_name}/data"

    reader_options = {
        "driver": "org.sqlite.JDBC",
        "user": "dummy_user",
        "password": "dummy_pwd",
        "url": jdbc_url,
        "dbtable": DB_TABLE,
        # None when the caller did not provide a changelog table.
        "changelog_table": extra_params.get("changelog_table"),
        "customSchema": "actrequest_timestamp DECIMAL(15,0), "
        "datapakid STRING, request STRING, "
        "partno INTEGER, record INTEGER, "
        "extraction_start_timestamp DECIMAL(15,0)",
        "act_request_table": "rsodsactreq",
        "extra_cols_act_request": scenario["extra_cols_act_request"],
        "latest_timestamp_data_location": data_location,
        "extraction_type": extraction_type,
        "numPartitions": 2,
        "partitionColumn": scenario["part_col"],
        "lowerBound": scenario["lower_bound"],
        "upperBound": scenario["upper_bound"],
        "default_upper_bound": "Null",
        "extraction_timestamp": extraction_timestamp,
        "min_timestamp": scenario["min_timestamp"],
        "request_col_name": extra_params["request_col_name"],
        "act_req_join_condition": scenario["act_req_join_condition"],
        "include_changelog_tech_cols": extra_params["include_changelog_tech_cols"],
        "predicates": scenario["predicates_list"],
        "get_timestamp_from_act_request": scenario.get(
            "get_timestamp_from_act_request", False
        ),
    }

    input_spec = {
        "spec_id": "sales_source",
        "read_type": "batch",
        "data_format": "sap_bw",
        "calculate_upper_bound": scenario["calculate_upper_bound"],
        "calc_upper_bound_schema": scenario["calculate_upper_bound_schema"],
        "generate_predicates": scenario["generate_predicates"],
        "options": reader_options,
    }

    # NOTE(review): the output spec below consumes "sales_source" directly, so
    # the output of this "filtered_sales" transform is not what gets written -
    # confirm this is intended.
    transform_spec = {
        "spec_id": "filtered_sales",
        "input_id": "sales_source",
        "transformers": [
            {
                "function": "expression_filter",
                "args": {"exp": "`/bic/article` like 'article%'"},
            }
        ],
    }

    output_spec = {
        "spec_id": "sales_bronze",
        "input_id": "sales_source",
        "write_type": write_type,
        "data_format": "delta",
        "partitions": ["actrequest_timestamp"],
        "location": data_location,
    }

    # Schema auto merge is only switched on when extra columns are requested.
    auto_merge = bool(scenario["extra_cols_act_request"])

    return {
        "input_specs": [input_spec],
        "transform_specs": [transform_spec],
        "output_specs": [output_spec],
        "exec_env": {"spark.databricks.delta.schema.autoMerge.enabled": auto_merge},
    }
""" LocalStorage.copy_file( f"{TEST_RESOURCES}/{extra_params['test_name']}/data/source/*.csv", f"{TEST_LAKEHOUSE_IN}/{scenario}/{extra_params['test_name']}/source/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{extra_params['test_name']}/*.json", f"{TEST_LAKEHOUSE_IN}/{scenario}/{extra_params['test_name']}/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{extra_params['test_name']}/data/control/*_schema.json", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/{extra_params['test_name']}/", ) if ( "optimised_dso" in extra_params["test_name"] and scenario == "init_timestamp_from_actrequest" ): LocalStorage.copy_file( f"{TEST_RESOURCES}/" f"{extra_params['test_name']}/data/control/" f"dummy_table_actreq_timestamp.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/{extra_params['test_name']}/data/", ) elif scenario == "no_part_col_join_condition": LocalStorage.copy_file( f"{TEST_RESOURCES}/" f"{extra_params['test_name']}/data/control/" f"dummy_table_join_condition.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/{extra_params['test_name']}/data/", ) else: LocalStorage.copy_file( f"{TEST_RESOURCES}/{extra_params['test_name']}/data/control/" f"dummy_table.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/{extra_params['test_name']}/data/", ) def _load_test_table( db_table: str, scenario: str, extra_params: dict, iteration: int = None ) -> DataFrame: """Load the JDBC tables for the tests and return a Dataframe with the content. Args: db_table: table being loaded. scenario: scenario being tested. extra_params: extra params for the scenario being tested. iteration: number of the iteration, in case it is to test a delta. Returns: A Dataframe with the content of the JDBC table loaded. 
""" file_name = f"{db_table}_{iteration}" if iteration else db_table source_df = DataframeHelpers.read_from_file( location=f"{TEST_LAKEHOUSE_IN}/{scenario}/{extra_params['test_name']}/" f"source/{file_name}.csv", schema=SchemaUtils.from_file_to_dict( f"file://{TEST_LAKEHOUSE_IN}/{scenario}/{extra_params['test_name']}/" f"{db_table}_schema.json" ), options={"header": True, "delimiter": "|", "dateFormat": "yyyyMMdd"}, ) DataframeHelpers.write_into_jdbc_table( source_df, f"jdbc:sqlite:{TEST_LAKEHOUSE_IN}/{scenario}/" f"{extra_params['test_name']}/tests.db", db_table, ) return DataframeHelpers.read_from_jdbc( f"jdbc:sqlite:{TEST_LAKEHOUSE_IN}/{scenario}/" f"{extra_params['test_name']}/tests.db", db_table, ) def _validate(scenario: str, extra_params: dict, min_timestamp: bool) -> None: """Perform the validation part of the local tests. Args: scenario: the scenario being tested. extra_params: extra params for the scenario being tested. min_timestamp: whether the min_timestamp is provided or not. """ control_df = DataframeHelpers.read_from_file( location=f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/{extra_params['test_name']}/" f"data", schema=SchemaUtils.from_file_to_dict( f"file://{TEST_LAKEHOUSE_CONTROL}/{scenario}/" f"{extra_params['test_name']}/dummy_table_schema.json" ), options={"header": True, "delimiter": "|", "dateFormat": "yyyyMMdd"}, ) control_df_columns = control_df.columns result_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario}/{extra_params['test_name']}/data", file_format=OutputFormat.DELTAFILES.value, ).select(control_df_columns) if min_timestamp: # when we fill the min_timestamp, it means it can either skip or # re-extract things, depending on the timestamp provided. 
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "name": "derive_changelog_table_name",
            "odsobject": "testtable",
            "logsys": "DHACLNT003",
        },
        {
            "name": "derive_changelog_table_name",
            "odsobject": "test_table",
        },
    ],
)
def test_changelog_table_name_derivation(scenario: dict) -> None:
    """Check that the changelog table name is derived from the ODS object.

    Args:
        scenario: scenario to be tested.
    """
    name = scenario["name"]
    input_root = f"{TEST_LAKEHOUSE_IN}/{name}"
    jdbc_url = f"jdbc:sqlite:{input_root}/tests.db"

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{name}/data/source/*.csv",
        f"{input_root}/source/",
    )
    LocalStorage.copy_file(f"{TEST_RESOURCES}/{name}/*.json", f"{input_root}/")

    # Load the source tables into the local sqlite database.
    for table in ["RSTSODS", "RSBASIDOC"]:
        table_schema = SchemaUtils.from_file_to_dict(
            f"file://{input_root}/{table}_schema.json"
        )
        source_df = DataframeHelpers.read_from_file(
            location=f"{input_root}/source/{table}.csv",
            schema=table_schema,
            options={"header": True, "delimiter": "|"},
        )
        DataframeHelpers.write_into_jdbc_table(
            source_df,
            jdbc_url,
            table,
            write_type=WriteType.OVERWRITE.value,
        )

    # logsys is only forwarded when the scenario provides a non-null value.
    optional_args = {}
    if scenario.get("logsys") is not None:
        optional_args["logsys"] = scenario["logsys"]

    extraction_utils = SAPBWExtractionUtils(
        SAPBWExtraction(  # nosec B106
            sap_bw_schema="",
            odsobject=scenario["odsobject"],
            dbtable="dummy_table",
            driver="org.sqlite.JDBC",
            user="dummy_user",
            password="dummy_pwd",
            url=jdbc_url,
            **optional_args,
        )
    )

    expected_prefix = f"{scenario['odsobject']}_OA"
    assert re.match(expected_prefix, extraction_utils.get_changelog_table())
def _test_file_manager_copy(caplog: Any, s3_cli: Any) -> None:
    """Testing file manager copy operations.

    Runs the dry-run copies first (checked through the captured logs) and then
    the real copies (checked through the object count of the destination
    bucket).

    Args:
        caplog: captured log.
        s3_cli: s3 client interface.
    """
    manage_files(
        f"file://{TEST_RESOURCES}/copy_object/acon_copy_single_object_dry_run.json"
    )
    assert "{'test_single_file.json': ['test_single_file.json']}" in caplog.text

    manage_files(
        f"file://{TEST_RESOURCES}/copy_object/acon_copy_directory_dry_run.json"
    )
    # Nothing is logged inside the loop, so the captured text is read once
    # instead of being rebuilt on each of the 2000 checks.
    dry_run_log = caplog.text
    for x in range(0, 2000):
        assert f"test_directory/test_recursive_file{x}.json" in dry_run_log

    manage_files(f"file://{TEST_RESOURCES}/copy_object/acon_copy_single_object.json")
    # Compare the count itself: a substring check on the stringified response
    # would also accept 10, 100, ... for an expected count of 1.
    assert s3_cli.list_objects_v2(Bucket="destination_bucket")["KeyCount"] == 1

    manage_files(f"file://{TEST_RESOURCES}/copy_object/acon_copy_directory.json")
    copied = s3_cli.list_objects_v2(Bucket="destination_bucket", MaxKeys=100000)
    assert copied["KeyCount"] == 2002
""" s3_res = boto3.resource("s3", region_name="us-east-1") s3_cli = boto3.client("s3", region_name="us-east-1") s3_res.create_bucket(Bucket="test_bucket") s3_res.create_bucket(Bucket="destination_bucket") with caplog.at_level(logging.INFO): s3_cli.put_object( Bucket="test_bucket", Key="test_single_file.json", Body="", StorageClass=scenario.get("storage_class"), ) s3_cli.put_object(Bucket="test_bucket", Key="test_directory", Body="") for x in range(0, 3): s3_cli.put_object( Bucket="test_bucket", Key=f"test_directory/test_recursive_file{x}.json", Body="", StorageClass=scenario.get("storage_class"), ) _test_file_manager_restore_request(caplog, s3_cli, s3_res) _test_file_manager_restore_check(caplog, s3_cli, s3_res) def _test_file_manager_restore_check(caplog: Any, s3_cli: Any, s3_res: Any) -> None: """Testing file manager restore check. Args: caplog: captured log. s3_cli: s3 client interface. s3_res: s3 resource interface. """ test_bucket = s3_res.Bucket("test_bucket") expected_restored_objects = 4 restored_objects = 0 manage_files( f"file://{TEST_RESOURCES}/check_restore_status/" "acon_check_restore_status_directory.json" ) for x in range(0, 3): assert ( f"Checking restore status for: test_directory/test_recursive_file{x}.json" in caplog.text ) for bucket_object in test_bucket.objects.all(): obj = s3_res.Object(bucket_object.bucket_name, bucket_object.key) if obj.restore is not None and 'ongoing-request="false"' in obj.restore: restored_objects += 1 assert "'KeyCount': 5" in str( s3_cli.list_objects_v2(Bucket="test_bucket", MaxKeys=100000) ) assert expected_restored_objects == restored_objects def _test_file_manager_restore_request(caplog: Any, s3_cli: Any, s3_res: Any) -> None: """Testing file manager restore request. Args: caplog: captured log. s3_cli: s3 client interface. s3_res: s3 resource interface. 
""" test_bucket = s3_res.Bucket("test_bucket") expected_restored_objects = 4 restored_objects = 0 manage_files( f"file://{TEST_RESOURCES}/request_restore/" "acon_request_restore_single_object.json" ) manage_files( f"file://{TEST_RESOURCES}/request_restore/" "acon_request_restore_directory.json" ) for bucket_object in test_bucket.objects.all(): obj = s3_res.Object(bucket_object.bucket_name, bucket_object.key) if obj.restore is not None and 'ongoing-request="false"' in obj.restore: restored_objects += 1 assert "'KeyCount': 5" in str( s3_cli.list_objects_v2(Bucket="test_bucket", MaxKeys=100000) ) assert expected_restored_objects == restored_objects @mock_s3 @pytest.mark.parametrize( "scenario", [ {"scenario_name": "glacier", "storage_class": "GLACIER"}, {"scenario_name": "glacier_ir", "storage_class": "GLACIER_IR"}, {"scenario_name": "deep_archive", "storage_class": "DEEP_ARCHIVE"}, ], ) def test_file_manager_restore_sync(scenario: dict, caplog: Any) -> None: """Test restore functions from file manager. Args: scenario: scenario to test. caplog: captured log. """ s3_res = boto3.resource("s3", region_name="us-east-1") s3_cli = boto3.client("s3", region_name="us-east-1") s3_res.create_bucket(Bucket="test_bucket") s3_res.create_bucket(Bucket="destination_bucket") with caplog.at_level(logging.INFO): s3_cli.put_object( Bucket="test_bucket", Key="test_single_file.json", Body="", StorageClass=scenario.get("storage_class"), ) s3_cli.put_object(Bucket="test_bucket", Key="test_directory/", Body="") for x in range(0, 3): s3_cli.put_object( Bucket="test_bucket", Key=f"test_directory/test_recursive_file{x}.json", Body="", StorageClass=scenario.get("storage_class"), ) _test_file_manager_restore_sync(caplog, s3_cli, s3_res) _test_file_manager_restore_sync_retrieval_tier_exception(caplog) def _test_file_manager_restore_sync(caplog: Any, s3_cli: Any, s3_res: Any) -> None: """Testing file manager restore file sync. Args: caplog: captured log. s3_cli: s3 client interface. 
s3_res: s3 resource interface. """ test_bucket = s3_res.Bucket("test_bucket") expected_single_restored_objects = 1 restored_objects = 0 manage_files( f"file://{TEST_RESOURCES}/request_restore_to_destination_and_wait/" "acon_request_restore_to_destination_and_wait_single_object.json" ) for bucket_object in test_bucket.objects.all(): obj = s3_res.Object(bucket_object.bucket_name, bucket_object.key) if obj.restore is not None and 'ongoing-request="false"' in obj.restore: restored_objects += 1 assert "'KeyCount': 1" in str( s3_cli.list_objects_v2(Bucket="destination_bucket", MaxKeys=100000) ) assert expected_single_restored_objects == restored_objects restored_objects = 0 expected_restored_objects = 4 manage_files( f"file://{TEST_RESOURCES}/request_restore_to_destination_and_wait/" "acon_request_restore_to_destination_and_wait_directory.json" ) for bucket_object in test_bucket.objects.all(): obj = s3_res.Object(bucket_object.bucket_name, bucket_object.key) if obj.restore is not None and 'ongoing-request="false"' in obj.restore: restored_objects += 1 assert "'KeyCount': 5" in str( s3_cli.list_objects_v2(Bucket="destination_bucket", MaxKeys=100000) ) assert expected_restored_objects == restored_objects def _test_file_manager_restore_sync_retrieval_tier_exception(caplog: Any) -> None: """Testing file manager restore sync operation when raising exception. Args: caplog: captured log. """ with pytest.raises(ValueError) as exception: manage_files( f"file://{TEST_RESOURCES}/request_restore_to_destination_and_wait/" "acon_request_restore_to_destination_and_wait_single" "_object_raise_error.json" ) assert ( "Retrieval Tier Bulk not allowed on this operation! " "This kind of restore should be used just with `Expedited` retrieval tier " "to save cluster costs." 
class DBUtilsFixture:
    """This class is used for mocking the behaviour of DBUtils inside tests.

    Every mocked `fs` operation is implemented on top of the local
    filesystem through `os`, `shutil` and `pathlib`.
    """

    def __init__(self) -> None:
        """Construct to mock DBUtils filesystem operations."""
        # The instance is its own `fs` attribute, so calls written as
        # `dbutils.fs.<operation>(...)` resolve to the static methods below.
        self.fs = self

    @staticmethod
    def cp(src: str, dest: str, recurse: bool = False) -> None:
        """This mocks the behavior of dbutils when copy files or directories.

        Args:
            src: string with the path to copy from.
            dest: string with the path to copy to.
            recurse: bool to recursively copy files or directories.
        """
        # Only a non-file source copied recursively needs `copytree`; every
        # other combination is a plain file copy (which raises for a
        # directory source, as a non-recursive copy should).
        if recurse and not os.path.isfile(src):
            shutil.copytree(src, dest)
        else:
            shutil.copy(src, dest)

    @staticmethod
    def ls(path: str) -> list:
        """This mocks the behavior of dbutils when reading a directory or files inside.

        Args:
            path: string with the path to read the directory or files inside.

        Returns:
            A list with one `FileInfoFixture` per direct child of `path`.
        """
        return [
            FileInfoFixture(str(child.absolute()), child.name, child.stat().st_size)
            for child in Path(path).glob("*")
        ]

    @staticmethod
    def mkdirs(path: str) -> None:
        """This mocks the behavior of dbutils when creating a directory.

        Args:
            path: string with the path to create the directory.
        """
        Path(path).mkdir(parents=True, exist_ok=True)

    @staticmethod
    def mv(src: str, dest: str, recurse: bool = False) -> None:
        """This mocks the behavior of dbutils when moving files or directories.

        Args:
            src: string with the path to move from.
            dest: string with the path to move to.
            recurse: bool to recursively move files or directories. Kept for
                interface compatibility; `shutil.move` handles files and
                directories alike.
        """
        # `copy_function` is the per-file copier `shutil.move` falls back to
        # when a plain rename is not possible. It must copy a single file:
        # passing `shutil.copytree` here (as was done for recursive moves)
        # makes that fallback fail for every file of the moved directory.
        shutil.move(src, dest, copy_function=shutil.copy)

    @staticmethod
    def put(path: str, content: str, overwrite: bool = False) -> None:
        """This mocks the behavior of dbutils when inserting in files.

        Args:
            path: string with the path to insert content.
            content: string with the content to insert in the file.
            overwrite: bool to overwrite file with the content.

        Raises:
            FileExistsError: if the file exists and `overwrite` is False.
        """
        file = Path(path)

        if file.exists() and not overwrite:
            raise FileExistsError("File already exists")

        file.write_text(content, encoding="utf-8")

    @staticmethod
    def rm(path: str, recurse: bool = False) -> None:
        """This mocks the behavior of dbutils when removing files or directories.

        Args:
            path: string with the path to remove.
            recurse: bool to recursively remove files or directories.
        """
        # Mirrors `cp`: only a non-file path removed recursively needs
        # `rmtree`; anything else is a single-file removal.
        if recurse and not os.path.isfile(path):
            shutil.rmtree(path)
        else:
            os.remove(path)
"FileStorageFunctions.is_boto3_configured", return_value=False, ) def test_file_manager_dbfs(_patch: Any, caplog: Any) -> None: """Test functions from file manager. Args: caplog: captured log. """ dbutils = DBUtilsFixture() with caplog.at_level(logging.INFO): # Creating test files/folders in dbfs dbutils.fs.mkdirs(path=TEST_LAKEHOUSE_DBFS) dbutils.fs.put(path=f"{TEST_LAKEHOUSE_DBFS}/test_single_file.json", content="") dbutils.fs.mkdirs(path=f"{TEST_LAKEHOUSE_DBFS}/test_directory/") for x in range(0, 2000): dbutils.fs.put( path=f"{TEST_LAKEHOUSE_DBFS}/test_directory/" f"test_recursive_file{x}.json", content="", ) dbutils.fs.mkdirs(path=f"{TEST_LAKEHOUSE_DBFS}/test_directory_test/") for x in range(0, 2000): dbutils.fs.put( path=f"{TEST_LAKEHOUSE_DBFS}/test_directory_test/" f"test_recursive_file{x}.json", content="", ) _test_file_manager_dbfs_copy(caplog, dbutils) _test_file_manager_dbfs_delete(caplog, dbutils) _test_file_manager_dbfs_move(caplog, dbutils) def _list_objects(path: str, objects_list: list, dbutils: Any) -> list: list_objects = dbutils.fs.ls(path) for file_or_directory in list_objects: if file_or_directory.isDir(): _list_objects(file_or_directory.path, objects_list, dbutils) else: objects_list.append(file_or_directory.path) return objects_list def _test_file_manager_dbfs_copy(caplog: Any, dbutils: Any) -> None: """Testing file manager copy operations. Args: caplog: captured log. dbutils: Dbutils from databricks. 
""" manage_files( acon_path=f"file://{TEST_RESOURCES}/copy_objects/" f"acon_copy_directory_dry_run.json" ) for x in range(0, 2000): assert ( f"/app/tests/lakehouse/dbfs/test_directory/test_recursive_file{x}.json" in caplog.text ) manage_files( acon_path=f"file://{TEST_RESOURCES}/copy_objects/acon_copy_directory.json" ) assert len(dbutils.fs.ls("tests/lakehouse/dbfs/test_directory")) == len( dbutils.fs.ls("tests/lakehouse/dbfs/destination_directory") ) manage_files( acon_path=f"file://{TEST_RESOURCES}/copy_objects/acon_copy_single_object.json" ) assert "tests/lakehouse/dbfs/test_single_file.json" in str( dbutils.fs.ls("tests/lakehouse/dbfs/") ) def _test_file_manager_dbfs_delete(caplog: Any, dbutils: Any) -> None: """Testing file manager delete operations. Args: caplog: captured log. dbutils: Dbutils from databricks. """ manage_files( acon_path=f"file://{TEST_RESOURCES}/delete_objects/" f"acon_delete_objects_dry_run.json" ) assert ( "{'tests/lakehouse/dbfs/test_directory': " "['/app/tests/lakehouse/dbfs/test_directory/" in caplog.text ) for x in range(0, 2000): assert ( f"/app/tests/lakehouse/dbfs/test_directory/" f"test_recursive_file{x}.json" in caplog.text ) for x in range(0, 2000): assert ( f"/app/tests/lakehouse/dbfs/destination_directory/" f"test_recursive_file{x}.json" in caplog.text ) manage_files( acon_path=f"file://{TEST_RESOURCES}/delete_objects/acon_delete_objects.json" ) assert len(dbutils.fs.ls("tests/lakehouse/dbfs/destination_directory")) == 0 def _test_file_manager_dbfs_move(caplog: Any, dbutils: Any) -> None: """Testing file manager move operations. Args: caplog: captured log. dbutils: Dbutils from databricks. 
""" manage_files( acon_path=f"file://{TEST_RESOURCES}/move_objects/acon_move_objects_dry_run.json" ) assert ( "{'tests/lakehouse/dbfs/test_directory': " "['/app/tests/lakehouse/dbfs/test_directory/" in caplog.text ) for x in range(0, 2000): assert ( f"/app/tests/lakehouse/dbfs/test_directory/" f"test_recursive_file{x}.json" in caplog.text ) for x in range(0, 2000): assert ( f"/app/tests/lakehouse/dbfs/destination_directory/" f"test_recursive_file{x}.json" in caplog.text ) manage_files( acon_path=f"file://{TEST_RESOURCES}/move_objects/acon_move_objects.json" ) assert len(dbutils.fs.ls("tests/lakehouse/dbfs/test_directory")) == 0 assert len(dbutils.fs.ls("tests/lakehouse/dbfs/test_mv_directory")) == 2000 ================================================ FILE: tests/feature/test_file_manager_s3.py ================================================ """Test file manager for s3.""" import logging from typing import Any import boto3 import pytest from moto import mock_s3, mock_sts # type: ignore from lakehouse_engine.engine import manage_files from tests.conftest import FEATURE_RESOURCES TEST_PATH = "file_manager_s3" TEST_RESOURCES = f"{FEATURE_RESOURCES}/{TEST_PATH}" @mock_sts def test_get_caller_identity_with_default_credentials() -> None: """Test get_caller_identity of sts client.""" boto3.client("sts", region_name="us-east-1").get_caller_identity() @mock_s3 def test_file_manager_s3(caplog: Any) -> None: """Test functions from file manager. Args: caplog: captured log. 
""" s3_res = boto3.resource("s3", region_name="us-east-1") s3_cli = boto3.client("s3", region_name="us-east-1") test_get_caller_identity_with_default_credentials() s3_res.create_bucket(Bucket="test_bucket") s3_res.create_bucket(Bucket="destination_bucket") with caplog.at_level(logging.INFO): # Creating test files/folders in S3 # 2000 files are created to test the pagination is being correctly performed s3_cli.put_object(Bucket="test_bucket", Key="test_single_file.json", Body="") s3_cli.put_object(Bucket="test_bucket", Key="test_directory/", Body="") for x in range(0, 2000): s3_cli.put_object( Bucket="test_bucket", Key=f"test_directory/test_recursive_file{x}.json", Body="", ) s3_cli.put_object(Bucket="test_bucket", Key="test_directory_test/", Body="") for x in range(0, 2000): s3_cli.put_object( Bucket="test_bucket", Key=f"test_directory_test/test_recursive_file{x}.json", Body="", ) _test_file_manager_s3_copy(caplog, s3_cli) _test_file_manager_s3_delete(caplog, s3_cli) def _test_file_manager_s3_copy(caplog: Any, s3_cli: Any) -> None: """Testing file manager copy operations. Args: caplog: captured log. s3_cli: s3 client interface. 
""" manage_files( acon_path=f"file://{TEST_RESOURCES}/copy_objects/" f"acon_copy_single_object_dry_run.json" ) assert "{'test_single_file.json': ['test_single_file.json']}" in caplog.text manage_files( acon_path=f"file://{TEST_RESOURCES}/copy_objects/" f"acon_copy_directory_dry_run.json" ) for x in range(0, 2000): assert f"test_directory/test_recursive_file{x}.json" in caplog.text manage_files( acon_path=f"file://{TEST_RESOURCES}/copy_objects/acon_copy_single_object.json" ) assert "'KeyCount': 1" in str(s3_cli.list_objects_v2(Bucket="destination_bucket")) manage_files( acon_path=f"file://{TEST_RESOURCES}/copy_objects/acon_copy_directory.json" ) assert "'KeyCount': 2002" in str( s3_cli.list_objects_v2(Bucket="destination_bucket", MaxKeys=100000) ) def _test_file_manager_s3_delete(caplog: Any, s3_cli: Any) -> None: """Testing file manager delete operations. Args: caplog: captured log. s3_cli: s3 client interface. """ manage_files( acon_path=f"file://{TEST_RESOURCES}/delete_objects/" f"acon_delete_objects_dry_run.json" ) assert ( "{'test_single_file.json': ['test_single_file.json'], " "'test_directory/': ['test_directory/'" in caplog.text ) for x in range(0, 2000): assert f"test_directory/test_recursive_file{x}.json" in caplog.text manage_files( acon_path=f"file://{TEST_RESOURCES}/delete_objects/acon_delete_objects.json" ) assert "'KeyCount': 2001" in str( s3_cli.list_objects_v2(Bucket="test_bucket", MaxKeys=100000) ) @mock_s3 @pytest.mark.parametrize( "scenario", [ {"scenario_name": "glacier", "storage_class": "GLACIER"}, {"scenario_name": "glacier_ir", "storage_class": "GLACIER_IR"}, {"scenario_name": "deep_archive", "storage_class": "DEEP_ARCHIVE"}, ], ) def test_file_manager_s3_restore_archive(scenario: dict, caplog: Any) -> None: """Test restore functions from file manager. Args: scenario: scenario to test. caplog: captured log. 
""" s3_res = boto3.resource("s3", region_name="us-east-1") s3_cli = boto3.client("s3", region_name="us-east-1") test_get_caller_identity_with_default_credentials() s3_res.create_bucket(Bucket="test_bucket") s3_res.create_bucket(Bucket="destination_bucket") with caplog.at_level(logging.INFO): s3_cli.put_object( Bucket="test_bucket", Key="test_single_file.json", Body="", StorageClass=scenario.get("storage_class"), ) s3_cli.put_object(Bucket="test_bucket", Key="test_directory", Body="") for x in range(0, 3): s3_cli.put_object( Bucket="test_bucket", Key=f"test_directory/test_recursive_file{x}.json", Body="", StorageClass=scenario.get("storage_class"), ) _test_file_manager_s3_restore_request(caplog, s3_cli, s3_res) _test_file_manager_s3_restore_check(caplog, s3_cli, s3_res) def _test_file_manager_s3_restore_check(caplog: Any, s3_cli: Any, s3_res: Any) -> None: """Testing file manager restore check. Args: caplog: captured log. s3_cli: s3 client interface. s3_res: s3 resource interface. """ test_bucket = s3_res.Bucket("test_bucket") expected_restored_objects = 4 restored_objects = 0 manage_files( acon_path=f"file://{TEST_RESOURCES}/check_restore_status/" f"acon_check_restore_status_directory.json" ) for x in range(0, 3): assert ( f"Checking restore status for: test_directory/test_recursive_file{x}.json" in caplog.text ) for bucket_object in test_bucket.objects.all(): obj = s3_res.Object(bucket_object.bucket_name, bucket_object.key) if obj.restore is not None and 'ongoing-request="false"' in obj.restore: restored_objects += 1 assert "'KeyCount': 5" in str( s3_cli.list_objects_v2(Bucket="test_bucket", MaxKeys=100000) ) assert expected_restored_objects == restored_objects def _test_file_manager_s3_restore_request( caplog: Any, s3_cli: Any, s3_res: Any ) -> None: """Testing file manager restore request. Args: caplog: captured log. s3_cli: s3 client interface. s3_res: s3 resource interface. 
""" test_bucket = s3_res.Bucket("test_bucket") expected_restored_objects = 4 restored_objects = 0 manage_files( acon_path=f"file://{TEST_RESOURCES}/request_restore/" f"acon_request_restore_single_object.json" ) manage_files( acon_path=f"file://{TEST_RESOURCES}/request_restore/" f"acon_request_restore_directory.json" ) for bucket_object in test_bucket.objects.all(): obj = s3_res.Object(bucket_object.bucket_name, bucket_object.key) if obj.restore is not None and 'ongoing-request="false"' in obj.restore: restored_objects += 1 assert "'KeyCount': 5" in str( s3_cli.list_objects_v2(Bucket="test_bucket", MaxKeys=100000) ) assert expected_restored_objects == restored_objects @mock_s3 @pytest.mark.parametrize( "scenario", [ {"scenario_name": "glacier", "storage_class": "GLACIER"}, {"scenario_name": "glacier_ir", "storage_class": "GLACIER_IR"}, {"scenario_name": "deep_archive", "storage_class": "DEEP_ARCHIVE"}, ], ) def test_file_manager_s3_restore_sync(scenario: dict, caplog: Any) -> None: """Test restore functions from file manager. Args: scenario: scenario to test. caplog: captured log. 
""" s3_res = boto3.resource("s3", region_name="us-east-1") s3_cli = boto3.client("s3", region_name="us-east-1") test_get_caller_identity_with_default_credentials() s3_res.create_bucket(Bucket="test_bucket") s3_res.create_bucket(Bucket="destination_bucket") with caplog.at_level(logging.INFO): s3_cli.put_object( Bucket="test_bucket", Key="test_single_file.json", Body="", StorageClass=scenario.get("storage_class"), ) s3_cli.put_object(Bucket="test_bucket", Key="test_directory/", Body="") for x in range(0, 3): s3_cli.put_object( Bucket="test_bucket", Key=f"test_directory/test_recursive_file{x}.json", Body="", StorageClass=scenario.get("storage_class"), ) _test_file_manager_s3_restore_sync(caplog, s3_cli, s3_res) _test_file_manager_s3_restore_sync_retrieval_tier_exception(caplog) def _test_file_manager_s3_restore_sync(caplog: Any, s3_cli: Any, s3_res: Any) -> None: """Testing file manager restore file sync. Args: caplog: captured log. s3_cli: s3 client interface. s3_res: s3 resource interface. 
""" test_bucket = s3_res.Bucket("test_bucket") expected_single_restored_objects = 1 restored_objects = 0 manage_files( acon_path=f"file://{TEST_RESOURCES}/request_restore_to_destination_and_wait/" f"acon_request_restore_to_destination_and_wait_single_object.json" ) for bucket_object in test_bucket.objects.all(): obj = s3_res.Object(bucket_object.bucket_name, bucket_object.key) if obj.restore is not None and 'ongoing-request="false"' in obj.restore: restored_objects += 1 assert "'KeyCount': 1" in str( s3_cli.list_objects_v2(Bucket="destination_bucket", MaxKeys=100000) ) assert expected_single_restored_objects == restored_objects restored_objects = 0 expected_restored_objects = 4 manage_files( acon_path=f"file://{TEST_RESOURCES}/request_restore_to_destination_and_wait/" f"acon_request_restore_to_destination_and_wait_directory.json" ) for bucket_object in test_bucket.objects.all(): obj = s3_res.Object(bucket_object.bucket_name, bucket_object.key) if obj.restore is not None and 'ongoing-request="false"' in obj.restore: restored_objects += 1 assert "'KeyCount': 5" in str( s3_cli.list_objects_v2(Bucket="destination_bucket", MaxKeys=100000) ) assert expected_restored_objects == restored_objects def _test_file_manager_s3_restore_sync_retrieval_tier_exception(caplog: Any) -> None: """Testing file manager restore sync operation when raising exception. Args: caplog: captured log. """ with pytest.raises(ValueError) as exception: manage_files( acon_path=f"file://{TEST_RESOURCES}/request_restore_to_destination_" f"and_wait/acon_request_restore_to_destination_and_wait_" f"single_object_raise_error.json" ) assert ( "Retrieval Tier Bulk not allowed on this operation! " "This kind of restore should be used just with `Expedited` retrieval tier " "to save cluster costs." 
@pytest.mark.parametrize(
    "scenario",
    [
        ["with_filter", InputFormat.PARQUET.value],
        ["with_filter_partition_overwrite", InputFormat.DELTAFILES.value],
        ["full_overwrite", InputFormat.DELTAFILES.value],
    ],
)
def test_batch_full_load(scenario: List[str]) -> None:
    """Test full loads in batch mode.

    Each scenario runs an initial load (`batch_init.json` over `part-01.csv`)
    followed by a second load (`batch.json` over `part-02.csv`), then compares
    the output with the scenario control data.

    Args:
        scenario: scenario to test, as `[scenario name, output file format]`.
            with_filter - loads in full but applies a filter to the source.
            with_filter_partition_overwrite - loads in full but only overwrites
            partitions that are contained in the data being loaded, keeping
            untouched partitions in the target table, therefore not doing a
            complete overwrite.
            full_overwrite - loads in full and overwrites target table.
    """
    scenario_name = scenario[0]
    output_format = scenario[1]

    # First run: initial load over the first source file.
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario_name}/data/source/part-01.csv",
        f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/",
    )
    init_acon = ConfigUtils.get_acon(
        f"file://{TEST_RESOURCES}/{scenario_name}/batch_init.json"
    )
    load_data(acon=init_acon)

    # Second run: the input folder only holds the second source file.
    LocalStorage.clean_folder(
        f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario_name}/data/source/part-02.csv",
        f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/",
    )
    batch_acon = ConfigUtils.get_acon(
        f"file://{TEST_RESOURCES}/{scenario_name}/batch.json"
    )
    load_data(acon=batch_acon)

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario_name}/data/control/part-01.csv",
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario_name}/data/",
    )

    result_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario_name}/data",
        file_format=output_format,
    )
    control_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario_name}/data"
    )

    assert not DataframeHelpers.has_diff(result_df, control_df)
f"{LAKEHOUSE_FEATURE_CONTROL}/{TEST_NAME}" TEST_LAKEHOUSE_OUT = f"{LAKEHOUSE_FEATURE_OUT}/{TEST_NAME}" _LOGGER = LoggingHandler(__name__).get_logger() _CALENDAR_MIN_DATE = pendulum.from_format("2016-01-01", "YYYY-MM-DD") _CALENDAR_MAX_DATE = pendulum.from_format("2023-01-01", "YYYY-MM-DD") _SETUP_DELTA_TABLES = { "dim_calendar": "calendar", "lkp_query_builder": "lkp_query_builder", "gab_use_case_results": "gab_use_case_results", "gab_log_events": "gab_log_events", } _USE_CASE_TABLES = ["order_events", "dummy_sales_kpi"] def _create_gab_tables() -> None: """Create necessary tables to use GAB.""" for table_name, table_column_file in _SETUP_DELTA_TABLES.items(): DataframeHelpers.create_delta_table( cols=SchemaUtils.from_file_to_dict( f"file:///{TEST_RESOURCES}/setup/column_list/{table_column_file}.json" ), table=table_name, ) def _generate_calendar_test_dates() -> list: """Generate calendar date between the test period.""" calendar_dates: list[Row] = [] calendar_date = _CALENDAR_MIN_DATE for _ in range(1, _CALENDAR_MIN_DATE.diff(_CALENDAR_MAX_DATE).in_days()): calendar_date = calendar_date.add(days=1) calendar_dates.append(Row(value=calendar_date.strftime("%Y-%m-%d"))) return calendar_dates def _transform_dates_list_to_dataframe(dates: list) -> DataFrame: """Create calendar dates DataFrame from a list of dates. Args: dates: list of dates to create the calendar DataFrame. 
""" calendar_dates = ExecEnv.SESSION.createDataFrame(dates) calendar_dates = calendar_dates.withColumn( "calendar_date", to_date(col("value"), "yyyy-MM-dd") ).drop(calendar_dates.value) return calendar_dates def _feed_dim_calendar(df: DataFrame) -> DataFrame: """Feed dim calendar table.""" df.createOrReplaceTempView("dates_completed") df_cal = ExecEnv.SESSION.sql( """ WITH monday_calendar AS ( SELECT calendar_date, WEEKOFYEAR(calendar_date) AS weeknum_mon, DATE_FORMAT(calendar_date, 'E') AS day_en, MIN(calendar_date) OVER (PARTITION BY CONCAT(DATE_PART( 'YEAROFWEEK', calendar_date ), WEEKOFYEAR(calendar_date)) ORDER BY calendar_date) AS weekstart_mon FROM dates_completed ORDER BY calendar_date ), monday_calendar_plus_week_num_sunday AS ( SELECT monday_calendar.*, LEAD(weeknum_mon) OVER(ORDER BY calendar_date) AS weeknum_sun FROM monday_calendar ), calendar_complementary_values AS ( SELECT calendar_date, weeknum_mon, day_en, weekstart_mon, weekstart_mon+6 AS weekend_mon, LEAD(weekstart_mon-1) OVER(ORDER BY calendar_date) AS weekstart_sun, DATE(DATE_TRUNC('MONTH', calendar_date)) AS month_start, DATE(DATE_TRUNC('QUARTER', calendar_date)) AS quarter_start, DATE(DATE_TRUNC('YEAR', calendar_date)) AS year_start FROM monday_calendar_plus_week_num_sunday ) SELECT calendar_date, day_en, weeknum_mon, weekstart_mon, weekend_mon, weekstart_sun, weekstart_sun+6 AS weekend_sun, month_start, add_months(month_start, 1)-1 AS month_end, quarter_start, ADD_MONTHS(quarter_start, 3)-1 AS quarter_end, year_start, ADD_MONTHS(year_start, 12)-1 AS year_end FROM calendar_complementary_values """ ) return df_cal def _feed_table_with_test_data( table_name: str, source_dataframe: Optional[DataFrame] = None, transformer_specs: list = None, input_id_to_write: str = "data_to_load", ) -> None: """Feed table with test data. Args: table_name: name of the table to feed. source_dataframe: dataframe to feed the table, present when load_type is dataframe. transformer_specs: acon transformations. 
def _create_and_load_source_data_for_use_case(source_table: str) -> None:
    """Create the source delta table of a use case and feed it with test data.

    The column list is read from `setup/column_list/<source_table>.json` in the
    test resources, then the table is loaded through
    `_feed_table_with_test_data`.

    Args:
        source_table: source table to create/feed the data.
    """
    column_list_path = (
        f"file:///{TEST_RESOURCES}/setup/column_list/{source_table}.json"
    )
    table_columns = SchemaUtils.from_file_to_dict(column_list_path)
    DataframeHelpers.create_delta_table(cols=table_columns, table=source_table)
    _feed_table_with_test_data(table_name=source_table)


def _import_use_case_sql(use_case_name: str) -> None:
    """Copy the SQL stage files of a use case into the test lakehouse.

    Args:
        use_case_name: name of the use case.
    """
    sql_files_pattern = f"{TEST_RESOURCES}/usecases/{use_case_name}/*.sql"
    sql_target_folder = f"{TEST_LAKEHOUSE_IN}/usecases_sql/{use_case_name}/"
    LocalStorage.copy_file(sql_files_pattern, sql_target_folder)


def _setup_use_case(use_case_name: str) -> None:
    """Set up a use case: source table first, then its SQL stage files.

    Args:
        use_case_name: name of the use case.
    """
    setup_steps = (_create_and_load_source_data_for_use_case, _import_use_case_sql)
    for setup_step in setup_steps:
        setup_step(use_case_name)


@pytest.fixture(scope="session", autouse=True)
def _gab_setup() -> None:
    """Execute the GAB setup.

    Create the gab config tables and load `lkp_query_builder` and
    `dim_calendar` (the latter from generated calendar dates passed through the
    `_feed_dim_calendar` custom transformer).
    """
    _LOGGER.info("Creating gab config tables...")
    _create_gab_tables()
    _feed_table_with_test_data(table_name="lkp_query_builder")

    calendar_dates_df = _transform_dates_list_to_dataframe(
        _generate_calendar_test_dates()
    )
    # The write step consumes the transformer output, not the raw input spec.
    dim_calendar_transform_spec = {
        "spec_id": "transformed_data",
        "input_id": "data_to_load",
        "transformers": [
            {
                "function": "custom_transformation",
                "args": {"custom_transformer": _feed_dim_calendar},
            }
        ],
    }
    _feed_table_with_test_data(
        table_name="dim_calendar",
        source_dataframe=calendar_dates_df,
        input_id_to_write="transformed_data",
        transformer_specs=[dim_calendar_transform_spec],
    )
    _LOGGER.info("Created with success...")


@pytest.fixture(scope="session", autouse=True, params=[_USE_CASE_TABLES])
def _run_setup_use_case(request: SubRequest) -> None:
    """Create and load use case gab tables.

    Args:
        request: fixture request, giving access to the `params`.
    """
    _LOGGER.info("Creating use case config tables...")
    use_case_tables = request.param
    for use_case_table in use_case_tables:
        _setup_use_case(use_case_table)
    _LOGGER.info("Created with success...")


@pytest.mark.usefixtures("_gab_setup", "_run_setup_use_case")
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "use_case_name": "order_events",
            "gold_assets": ["vw_orders_all", "vw_orders_filtered"],
            "gold_asset_schema": "vw_orders",
            "use_case_stages": "order_events",
        },
        {
            "use_case_name": "order_events_snapshot",
            "gold_assets": ["vw_orders_all_snapshot", "vw_orders_filtered_snapshot"],
            "gold_asset_schema": "vw_orders",
            "use_case_stages": "order_events",
        },
        {
            "use_case_name": "order_events_nam",
            "gold_assets": [
                "vw_nam_orders_all_snapshot",
                "vw_nam_orders_filtered_snapshot",
            ],
            "gold_asset_schema": "vw_orders",
            "use_case_stages": "order_events",
        },
        {
            "use_case_name": "order_events_negative_timezone_offset",
            "gold_assets": [
                "vw_negative_offset_orders_all",
                "vw_negative_offset_orders_filtered",
            ],
            "gold_asset_schema": "vw_orders",
            "use_case_stages": "order_events",
        },
        {
            "use_case_name": "dummy_sales_kpi",
            "gold_assets": ["vw_dummy_sales_kpi"],
            "gold_asset_schema": "vw_dummy_sales_kpi",
            "use_case_stages": "dummy_sales_kpi",
        },
        {
            "use_case_name": "skip_use_case_by_empty_reconciliation",
            "query_label": "order_events_empty_reconciliation_window",
            "use_case_stages": "order_events",
        },
        {
            "use_case_name": "skip_use_case_by_empty_requested_cadence",
            "query_label": "order_events_negative_timezone_offset",
            "use_case_stages": "order_events",
        },
        {
            "use_case_name": "skip_use_case_by_not_configured_cadence",
            "query_label": "order_events_negative_timezone_offset",
            "use_case_stages": "order_events",
        },
        {
            "use_case_name": "skip_use_case_by_unexisting_cadence",
            "query_label": "order_events_unexisting_cadence",
            "use_case_stages": "order_events",
        },
    ],
)
def test_gold_asset_builder(scenario: dict, caplog: Any) -> None:
    """Test the feature of using gab to generate gold assets.

    Args:
        scenario: scenario to test.
        caplog: captured log.

    Scenarios:
        order_events: tests gab features:
            - Cadence
            - Recon Window
            - Metrics
            - Extended Window Calculator
            Also test the generation of two different views for the same asset.
        order_events_snapshot: tests gab features:
            - Cadence
            - Recon Window
            - Metrics
            - Extended Window Calculator
            - Snapshot
            Also test the generation of two different views for the same asset.
        order_events_nam: tests gab features:
            - Cadence
            - Recon Window
            - Metrics
            - Extended Window Calculator
            - Snapshot
            Also test the generation of two different views for the same asset
            and the use case `query_type` equals to `NAM`.
        order_events_negative_timezone_offset: tests gab features:
            - Cadence
            - Recon Window
            - Metrics
            - Extended Window Calculator
            - Offset
            - Snapshot
            Also test the generation of two different views for the same asset.
        dummy_sales_kpi: tests almost all gab features:
            - Cadence
            - Recon Window
            - Metrics
            - Extended Window Calculator
            Also test multiple stages for the asset creation.
        skip_use_case_by_*: no gold asset is compared; the test only checks
            that the "Skipping use case <query_label>" message was logged.
    """
    use_case_name = scenario["use_case_name"]
    acon_path = (
        f"file://{TEST_RESOURCES}/usecases/{scenario['use_case_stages']}/scenario/"
        f"{use_case_name}.json"
    )
    execute_gab(acon_path)

    if use_case_name.startswith("skip"):
        # Skipped use cases produce no gold asset, only a log entry.
        assert (
            f"Skipping use case {scenario['query_label']}. No cadence processed "
            "for the use case." in caplog.text
        )
        return

    for gold_asset in scenario["gold_assets"]:
        result_df = ExecEnv.SESSION.sql(
            f"SELECT * FROM test_db.{gold_asset}"  # nosec
        )
        control_schema = SchemaUtils.from_file_to_dict(
            f"file:///{TEST_RESOURCES}/control/schema/"
            f"{scenario['gold_asset_schema']}.json"
        )
        control_df = DataframeHelpers.read_from_file(
            f"{TEST_RESOURCES}/control/data/{gold_asset}.csv",
            schema=control_schema,
        )
        assert not DataframeHelpers.has_diff(result_df, control_df)
def _create_heartbeat_table(scenario_name: str, tables: dict) -> None:
    """Create the necessary tables required for using Heartbeat.

    Each table's column list is read from
    `setup/<scenario_name>/column_list/<table_name>.json`.

    Args:
        scenario_name (str): The name of the scenario.
        tables (dict): Table names (only the dict values are used).
    """
    for table_name in tables.values():
        DataframeHelpers.create_delta_table(
            cols=SchemaUtils.from_file_to_dict(
                f"file:///{FEATURE_TEST_RESOURCES}/setup/"
                f"{scenario_name}/column_list/{table_name}.json"
            ),
            table=table_name,
        )


def _drop_heartbeat_tables(tables: dict) -> None:
    """Remove the storage folder and the `test_db` table of each test table.

    Args:
        tables (dict): Table names (only the dict values are used).
    """
    for table_name in tables.values():
        LocalStorage.clean_folder(f"{LAKEHOUSE}{table_name}")
        ExecEnv.SESSION.sql(f"""DROP TABLE IF EXISTS test_db.{table_name}""")  # nosec


def _test_heartbeat_sensor_data_feed(
    heartbeat_data_file_path: str,
    heartbeat_control_table_name: str,
    ctrl_heartbeat_df: DataFrame,
) -> None:
    """Test the function that populates the heartbeat control table.

    Args:
        heartbeat_data_file_path (str): Path to the CSV file used to populate
            the control table.
        heartbeat_control_table_name (str): Name of the target control table.
        ctrl_heartbeat_df (DataFrame): Reference DataFrame used to validate
            the table contents.
    """
    _LOGGER.info("Testing execute_heartbeat_sensor_data_feed function")
    execute_heartbeat_sensor_data_feed(
        heartbeat_data_file_path, heartbeat_control_table_name
    )

    heartbeat_df = ExecEnv.SESSION.table(heartbeat_control_table_name)
    assert not DataframeHelpers.has_diff(heartbeat_df, ctrl_heartbeat_df)


@patch(
    "lakehouse_engine.algorithms.sensors.heartbeat.Heartbeat._execute_batch_of_sensor",
    MagicMock(
        return_value={
            "sensor_id": "dummy_delta_table",
            "trigger_job_id": "1927384615203749",
        }
    ),
)
@patch("lakehouse_engine.algorithms.sensors.heartbeat.current_timestamp")
def _test_execute_sensor_heartbeat(
    mocked_timestamp: MagicMock,
    acon: dict,
    heartbeat_control_table_name: str,
    ctrl_heartbeat_df: DataFrame,
    results: dict,
) -> None:
    """Test the execution of the sensor heartbeat process.

    This test mocks the internal `_execute_batch_of_sensor` method to simulate
    the heartbeat execution, then validates the resulting state in the
    heartbeat control table after the execution of the
    execute_sensor_heartbeat function.

    Args:
        mocked_timestamp (MagicMock): A static timestamp for testing.
        acon (dict): Acon used to trigger the heartbeat execution.
        heartbeat_control_table_name (str): Name of the control table to
            validate.
        ctrl_heartbeat_df (DataFrame): Reference DataFrame for asserting table
            contents.
        results (dict): Reference values to compare.
    """
    # Freeze the timestamp so the table can be compared with static control data.
    mocked_timestamp.return_value = lit(
        datetime.datetime.strptime("2025/08/14 23:00", "%Y/%m/%d %H:%M")
    ).cast(TimestampType())

    execute_sensor_heartbeat(acon=acon)

    heartbeat_result = ExecEnv.SESSION.table(heartbeat_control_table_name)
    assert (
        heartbeat_result.filter("status = 'NEW_EVENT_AVAILABLE'").count()
        == results["new_events_available_count"]
    )
    assert not DataframeHelpers.has_diff(ctrl_heartbeat_df, heartbeat_result)


@patch("lakehouse_engine.algorithms.sensors.heartbeat.current_timestamp")
@patch(
    "lakehouse_engine.core.sensor_manager.datetime",
)
def _test_update_heartbeat_sensor_status(
    mocked_timestamp_sensor: MagicMock,
    mocked_timestamp_heartbeat: MagicMock,
    heartbeat_control_table_name: str,
    sensor_table_name: str,
    job_id: str,
    ctrl_heartbeat_df: DataFrame,
    ctrl_sensor_df: DataFrame,
) -> None:
    """Test the update of sensor and heartbeat control table statuses.

    This test validates that the `update_heartbeat_sensor_status` function
    correctly updates timestamps and status fields in both the sensor and
    heartbeat control tables. It also compares the updated tables against
    expected control DataFrames.

    Args:
        mocked_timestamp_sensor (MagicMock): A static timestamp for testing
            sensor table.
        mocked_timestamp_heartbeat (MagicMock): A static timestamp for testing
            heartbeat table.
        heartbeat_control_table_name (str): Name of the heartbeat control table
            to validate.
        sensor_table_name (str): Name of the sensor table to validate.
        job_id (str): Job identifier used in the update process.
        ctrl_heartbeat_df (DataFrame): Expected state of the updated heartbeat
            control table.
        ctrl_sensor_df (DataFrame): Expected state of the updated sensor table.
    """
    # Patches are injected bottom-up: the first mock is `sensor_manager.datetime`,
    # the second is the heartbeat `current_timestamp`.
    mocked_timestamp_sensor.now.return_value = datetime.datetime(
        2025, 8, 14, 23, 0, 0, 0
    )
    mocked_timestamp_heartbeat.return_value = lit(
        datetime.datetime.strptime("2025/08/14 23:00", "%Y/%m/%d %H:%M")
    ).cast(TimestampType())

    update_heartbeat_sensor_status(
        heartbeat_control_table_name, sensor_table_name, job_id
    )

    heartbeat_data = ExecEnv.SESSION.table(heartbeat_control_table_name)
    sensor_data = ExecEnv.SESSION.table(sensor_table_name)

    _LOGGER.info("Comparing heartbeat and sensor tables with control tables")
    assert not DataframeHelpers.has_diff(ctrl_sensor_df, sensor_data)
    assert not DataframeHelpers.has_diff(ctrl_heartbeat_df, heartbeat_data)


@patch(
    "lakehouse_engine.core.sensor_manager.SensorJobRunManager.run_job",
    MagicMock(return_value=("run_id", None)),
)
@patch("lakehouse_engine.algorithms.sensors.heartbeat.current_timestamp")
def _trigger_heartbeat_sensor_jobs(
    mocked_timestamp_heartbeat: MagicMock,
    acon: dict,
    heartbeat_control_table_name: str,
    heartbeat_control_table_updated: DataFrame,
) -> None:
    """Test the triggering of sensor heartbeat jobs.

    This test mocks the `run_job` method to simulate job execution, triggers
    the heartbeat sensor jobs, and verifies that the heartbeat control table
    reflects the expected changes.

    Args:
        mocked_timestamp_heartbeat (MagicMock): A static timestamp for testing
            heartbeat table.
        acon (dict): Acon used to trigger the sensor jobs.
        heartbeat_control_table_name (str): Name of the heartbeat control table
            to validate.
        heartbeat_control_table_updated (DataFrame): Expected state of the
            control table after job execution.
    """
    mocked_timestamp_heartbeat.return_value = lit(
        datetime.datetime.strptime("2025/08/14 23:00", "%Y/%m/%d %H:%M")
    ).cast(TimestampType())

    trigger_heartbeat_sensor_jobs(acon)

    heartbeat_table_job_run = ExecEnv.SESSION.table(heartbeat_control_table_name)
    assert not DataframeHelpers.has_diff(
        heartbeat_table_job_run, heartbeat_control_table_updated
    )


@pytest.mark.parametrize(
    "scenario",
    [
        {
            "use_case_name": "default",
            "control_files": {
                "ctrl_heart_tbl_heartb_feed_fname": "ctr_heart_tbl_heartb_feed.csv",
                "ctrl_heart_tbl_exe_sns_hb_fname": "ctrl_heart_tbl_exec_sensor.csv",
                "ctrl_heart_tbl_updated_fname": "ctrl_heart_tbl_updated.csv",
                "ctrl_heart_tbl_trigger_job_fname": "ctrl_heart_tbl_trigger_job.csv",
                "ctrl_sensor_tbl_upd_status_fname": "ctrl_sensor_tbl_upd_status.json",
                "ctrl_heart_tbl_schema_fname": "ctrl_heart_tbl_schema.json",
            },
            "tables": {
                "heartbeat_sensor_control_table": "heartbeat_sensor_control_table",
                "sensor_table": "sensor_table",
            },
            "setup": {
                "setup_heartbeat_data": "setup_heartbeat_data.csv",
                "setup_sensor_data": "setup_sensor_data.json",
                "schema_sensor_df": "schema_sensor_df.json",
            },
            "execute_sensor_heartbeat_results": {"new_events_available_count": 1},
            "job_id": "1927384615203749",
            "trigger_heartbeat_sensor_jobs_records": {
                "heartbeat": """ ("delta_table","dummy_order","batch", "dummy_heartbeat_asset",NULL,NULL,NULL, "1015557820139870","data-product_job_name_orders","NEW_EVENT_AVAILABLE", NULL,NULL,NULL,"UNPAUSED","true")""",
                "sensors": """ ("dummy_order", array("dummy_heartbeat_asset"),"ACQUIRED_NEW_DATA", NULL,NULL,"LOAD_DATE","10155578201985")""",
            },
        },
        {
            "use_case_name": "heartbeat_paused_sensor_new_record",
            "control_files": {
                "ctrl_heart_tbl_heartb_feed_fname": "ctr_heart_tbl_heartb_feed.csv",
                "ctrl_heart_tbl_exe_sns_hb_fname": "ctrl_heart_tbl_exec_sensor.csv",
                "ctrl_heart_tbl_updated_fname": "ctrl_heart_tbl_updated.csv",
                "ctrl_heart_tbl_trigger_job_fname": "ctrl_heart_tbl_trigger_job.csv",
                "ctrl_sensor_tbl_upd_status_fname": "ctrl_sensor_tbl_upd_status.json",
                "ctrl_heart_tbl_schema_fname": "ctrl_heart_tbl_schema.json",
            },
            "tables": {
                "heartbeat_sensor_control_table": "heartbeat_sensor_control_table",
                "sensor_table": "sensor_table",
            },
            "setup": {
                "setup_heartbeat_data": "setup_heartbeat_data.csv",
                "setup_sensor_data": "setup_sensor_data.json",
                "schema_sensor_df": "schema_sensor_df.json",
            },
            "execute_sensor_heartbeat_results": {"new_events_available_count": 0},
            "job_id": "2604918372561094",
            "trigger_heartbeat_sensor_jobs_records": {
                "heartbeat": """ ("delta_table","dummy_order","batch", "dummy_heartbeat_asset",NULL,NULL,NULL, "1015557820139870","data-product_job_name_orders","IN PROGRESS", NULL,NULL,NULL,"UNPAUSED","true")""",
                "sensors": """ ("dummy_order", array("dummy_heartbeat_asset"),"ACQUIRED_NEW_DATA", NULL,NULL,"LOAD_DATE","10155578201985")""",
            },
        },
    ],
)
def test_heartbeat(scenario: dict) -> None:
    """Test the heartbeat feature.

    Tests the heartbeat feature by validating the four core functions invoked
    by the heartbeat algorithm. The tables created for the scenario are always
    removed at the end, even when a step fails, so that one failing scenario
    does not leave tables behind for the next one (both scenarios use the same
    table names).

    Args:
        scenario: The test scenario to execute.

    Scenarios:
        Default: A basic scenario that tests the four main steps of the
            Heartbeat algorithm:
            1. `execute_heartbeat_sensor_data_feed`: Loads a CSV file into an
                empty Heartbeat control table.
            2. `execute_sensor_heartbeat`: Simulates a Databricks job run.
                The return value is patched to avoid actual API calls.
            3. `update_heartbeat_sensor_status`: Updates values in the
                Heartbeat and Sensor tables.
            4. `trigger_heartbeat_sensor_jobs`: Triggers Databricks jobs.
                This function is also patched to prevent real job execution.
        Heartbeat_paused_sensor_new_record: Different state records that will
            have different behaviour.
            1. A record with job_state = 'PAUSED' and
                sensor_source = 'delta_table' is inserted into the `heartbeat`
                table.
                - Expected Behavior: No updates or changes throughout the test.
            2. A record with job_state = 'Null' and sensor_source = 'sap_bw'
                is inserted into heartbeat control table and sensor table.
                - Expected Behavior: Record is updated during the process to
                    reflect activity.
            3. A record with job_state = 'COMPLETED' and
                sensor_source = 'kafka' is inserted into heartbeat control
                table.
                - Expected Behavior:
                    - The record is updated during the process.
                    - A corresponding entry is created in the `sensor` table.
    """
    scenario_name = scenario["use_case_name"]
    _LOGGER.info(f"Setting up Test - {scenario_name}.")

    tables = scenario["tables"]
    control_files = scenario["control_files"]
    heartbeat_control_table_name = f"test_db.{tables['heartbeat_sensor_control_table']}"
    sensor_table_name = f"test_db.{tables['sensor_table']}"

    acon = {
        "heartbeat_sensor_db_table": heartbeat_control_table_name,
        "lakehouse_engine_sensor_db_table": sensor_table_name,
        "data_format": "delta",
        "sensor_source": "delta_table",
        "token": "my-token",
        "domain": "my-adidas-domain.cloud.databricks.com",
    }

    _create_heartbeat_table(scenario_name, tables)

    try:
        LocalStorage.copy_dir(
            f"{FEATURE_TEST_RESOURCES}/setup/{scenario_name}/data/",
            f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/",
        )
        LocalStorage.copy_dir(
            f"{FEATURE_TEST_RESOURCES}/control/{scenario_name}/data/",
            f"{TEST_LAKEHOUSE_CONTROL}/{scenario_name}/data/",
        )

        # Step 1: execute_heartbeat_sensor_data_feed.
        setup_heartbeat_data_file_path = (
            f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/"
            f"{scenario['setup']['setup_heartbeat_data']}"
        )
        ctrl_heart_tbl_heartb_feed_fname = control_files[
            "ctrl_heart_tbl_heartb_feed_fname"
        ]
        ctrl_heart_tbl_heartb_feed_file_path = (
            f"{TEST_LAKEHOUSE_CONTROL}/"
            f"{scenario_name}/data/{ctrl_heart_tbl_heartb_feed_fname}"
        )
        ctrl_heart_tbl_schema_file_name = control_files["ctrl_heart_tbl_schema_fname"]
        ctrl_heart_tbl_schema_file_path = (
            f"file:///{FEATURE_TEST_RESOURCES}/control/"
            f"{scenario_name}/schema/{ctrl_heart_tbl_schema_file_name}"
        )
        ctrl_heartbeat_df = DataframeHelpers.read_from_file(
            ctrl_heart_tbl_heartb_feed_file_path,
            schema=SchemaUtils.from_file_to_dict(ctrl_heart_tbl_schema_file_path),
        )
        _test_heartbeat_sensor_data_feed(
            setup_heartbeat_data_file_path,
            heartbeat_control_table_name,
            ctrl_heartbeat_df,
        )

        # Step 2: execute_sensor_heartbeat.
        _LOGGER.info("Testing execute_sensor_heartbeat function")
        ctrl_heart_tbl_exe_sns_file_name = control_files[
            "ctrl_heart_tbl_exe_sns_hb_fname"
        ]
        ctrl_heart_tbl_exe_sns_file_path = (
            f"{TEST_LAKEHOUSE_CONTROL}/{scenario_name}/"
            f"data/{ctrl_heart_tbl_exe_sns_file_name}"
        )
        ctrl_heart_tbl_exe_sns_df = DataframeHelpers.read_from_file(
            ctrl_heart_tbl_exe_sns_file_path,
            schema=SchemaUtils.from_file_to_dict(ctrl_heart_tbl_schema_file_path),
        )
        execute_sensor_results = scenario["execute_sensor_heartbeat_results"]
        _test_execute_sensor_heartbeat(
            acon=acon,
            heartbeat_control_table_name=heartbeat_control_table_name,
            ctrl_heartbeat_df=ctrl_heart_tbl_exe_sns_df,
            results=execute_sensor_results,
        )

        # Step 3: update_heartbeat_sensor_status.
        _LOGGER.info("Testing update_heartbeat_sensor_status function")
        sensor_df_schema = (
            f"file:///{FEATURE_TEST_RESOURCES}/setup/"
            f"{scenario_name}/schema/{scenario['setup']['schema_sensor_df']}"
        )
        ctrl_heart_table_upd = (
            f"{FEATURE_TEST_RESOURCES}/control/{scenario_name}/"
            f"data/{scenario['control_files']['ctrl_heart_tbl_updated_fname']}"
        )
        setup_sensor_file_name = scenario["setup"]["setup_sensor_data"]
        sensor_table_data_path = (
            f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/{setup_sensor_file_name}"
        )
        ctrl_sensor_tbl_upd_status_fname = control_files[
            "ctrl_sensor_tbl_upd_status_fname"
        ]
        ctrl_sensor_upd_path = (
            f"{TEST_LAKEHOUSE_CONTROL}/{scenario_name}/"
            f"data/{ctrl_sensor_tbl_upd_status_fname}"
        )
        sensors_data = DataframeHelpers.read_from_file(
            sensor_table_data_path,
            file_format="json",
            schema=SchemaUtils.from_file_to_dict(sensor_df_schema),
        )
        ctrl_sensor_upd_sensor_status_df = DataframeHelpers.read_from_file(
            ctrl_sensor_upd_path,
            file_format="json",
            schema=SchemaUtils.from_file_to_dict(sensor_df_schema),
        )
        ctrl_heart_tbl_df_upd_sns_status = DataframeHelpers.read_from_file(
            ctrl_heart_table_upd,
            schema=SchemaUtils.from_file_to_dict(ctrl_heart_tbl_schema_file_path),
        )
        sensors_data.write.format("delta").mode("overwrite").saveAsTable(
            sensor_table_name
        )
        job_id = scenario["job_id"]
        _test_update_heartbeat_sensor_status(
            heartbeat_control_table_name=heartbeat_control_table_name,
            sensor_table_name=sensor_table_name,
            job_id=job_id,
            ctrl_heartbeat_df=ctrl_heart_tbl_df_upd_sns_status,
            ctrl_sensor_df=ctrl_sensor_upd_sensor_status_df,
        )

        # Step 4: trigger_heartbeat_sensor_jobs.
        _LOGGER.info("Testing trigger_heartbeat_sensor_jobs function")
        _LOGGER.info(f"acon: {acon}")
        _LOGGER.info("Preparing heartbeat and sensor table")
        records_to_insert = scenario["trigger_heartbeat_sensor_jobs_records"]
        ExecEnv.SESSION.sql(
            f"""INSERT INTO {heartbeat_control_table_name} VALUES {records_to_insert["heartbeat"]}"""  # nosec
        )
        ExecEnv.SESSION.sql(
            f"""INSERT INTO {sensor_table_name} VALUES {records_to_insert["sensors"]}"""  # nosec
        )
        ctrl_heart_tbl_trig_job_fname = control_files[
            "ctrl_heart_tbl_trigger_job_fname"
        ]
        ctrl_heart_tbl_trig_job_path = (
            f"file:///{FEATURE_TEST_RESOURCES}/control/"
            f"{scenario_name}/data/{ctrl_heart_tbl_trig_job_fname}"
        )
        ctrl_heartbeat_update_df = DataframeHelpers.read_from_file(
            ctrl_heart_tbl_trig_job_path,
            schema=SchemaUtils.from_file_to_dict(ctrl_heart_tbl_schema_file_path),
        )
        _trigger_heartbeat_sensor_jobs(
            acon=acon,
            heartbeat_control_table_name=heartbeat_control_table_name,
            heartbeat_control_table_updated=ctrl_heartbeat_update_df,
        )
    finally:
        # Always clean up so a failure does not leak state into the next scenario.
        _drop_heartbeat_tables(tables)
@pytest.mark.parametrize("scenario", TEST_SCENARIOS)
def test_jdbc_reader(scenario: List[str]) -> None:
    """Test loads from jdbc source.

    Scenarios that are expected to fail only run the load and check the raised
    exception. The remaining scenarios write the source csv into a sqlite table
    through jdbc, run the load and compare the resulting table with the
    control data.

    Args:
        scenario: scenario to test, as `[reader type, case]`.
    """
    reader_type, case = scenario[0], scenario[1]
    acon_path = f"file://{TEST_RESOURCES}/{reader_type}/{case}/batch_init.json"

    # (reader type, case) -> (expected exception, expected message pattern).
    expected_failures = {
        ("jdbc_format", "wrong_arguments"): (
            IllegalArgumentException,
            "Option.*is required.",
        ),
        ("jdbc_format", "predicates"): (
            WrongArgumentsException,
            "Predicates can only be used with jdbc_args.",
        ),
        ("jdbc_function", "wrong_arguments"): (
            TypeError,
            r"jdbc\(\) got an unexpected keyword argument.*",
        ),
    }

    expected_failure = expected_failures.get((reader_type, case))
    if expected_failure is not None:
        error_type, error_pattern = expected_failure
        with pytest.raises(error_type, match=error_pattern):
            load_data(acon_path)
        return

    scenario_in = f"{TEST_LAKEHOUSE_IN}/{reader_type}/{case}"
    scenario_control = f"{TEST_LAKEHOUSE_CONTROL}/{reader_type}/{case}"

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{reader_type}/{case}/data/source/part-01.csv",
        f"{scenario_in}/data/",
    )
    source_df = DataframeHelpers.read_from_file(f"{scenario_in}/data")
    DataframeHelpers.write_into_jdbc_table(
        source_df,
        f"jdbc:sqlite:{scenario_in}/tests.db",
        f"{reader_type}",
    )

    load_data(acon_path)

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{reader_type}/{case}/data/control/part-01.csv",
        f"{scenario_control}/data/",
    )
    result_df = DataframeHelpers.read_from_table(f"test_db.{reader_type}_table")
    control_df = DataframeHelpers.read_from_file(f"{scenario_control}/data")

    assert not DataframeHelpers.has_diff(result_df, control_df)
@pytest.mark.parametrize("scenario", ["streaming_with_cdf"])
def test_streaming_with_cdf(scenario: str, caplog: Any) -> None:
    """Test materialize cdf function.

    The test runs two loads. The first one only writes the CDF to the external
    location and its output is compared with the control data (read both as
    delta and as parquet). The second one also cleans and vacuums the CDF
    table; the captured log is cleared before it, so that the log assertions
    check the second run and are not satisfied by messages of the first run.

    Args:
        scenario: scenario name.
        caplog: captured log.
    """
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/table/streaming_with_cdf.sql",
        f"{TEST_LAKEHOUSE_IN}/data/table/",
    )
    manage_table(f"file://{TEST_RESOURCES}/acon_create_table.json")

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/source/part-01.csv",
        f"{TEST_LAKEHOUSE_IN}/{scenario}/data/",
    )
    acon = ConfigUtils.get_acon(
        f"file://{TEST_RESOURCES}/streaming_without_clean_cdf.json"
    )
    load_data(acon=acon)

    assert "Writing CDF to external table..." in caplog.text

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/part-01_cdf.csv",
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/control_schema.json",
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/control_schema.json",
    )
    control_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data",
        schema=SchemaUtils.from_file_to_dict(
            f"file://{TEST_LAKEHOUSE_CONTROL}/{scenario}/control_schema.json"
        ),
    )

    result_df_delta = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/cdf_data",
        file_format=InputFormat.DELTAFILES.value,
    ).drop("_commit_timestamp")

    # once we are writing the cdf as delta, it can also be read as parquet.
    # because the _commit_timestamp field is a partition field (comes from the
    # folder), not from the parquet file, we need to enforce a schema where
    # _commit_timestamp is a string, not an int (as automatically inferred from
    # the folder by spark).
    result_df_parquet = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/cdf_data",
        file_format=InputFormat.PARQUET.value,
        schema=SchemaUtils.from_file_to_dict(
            f"file://{TEST_LAKEHOUSE_CONTROL}/{scenario}/control_schema.json"
        ),
    ).drop("_commit_timestamp")

    assert not DataframeHelpers.has_diff(result_df_delta, control_df)
    assert not DataframeHelpers.has_diff(result_df_parquet, control_df)

    # to be able to execute vacuum on expose cdf terminator spec it is
    # necessary to update _commit_timestamp to an old value, for that we
    # are enforcing the timestamp with the following delta commands.
    delta_table = DeltaTable.forPath(
        ExecEnv.SESSION,
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/cdf_data",
    )
    delta_table.update(set={"_commit_timestamp": "'20211105132711'"})

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/source/part-02.csv",
        f"{TEST_LAKEHOUSE_IN}/{scenario}/data/",
    )
    acon = ConfigUtils.get_acon(
        f"file://{TEST_RESOURCES}/streaming_with_clean_and_vacuum.json"
    )

    # caplog accumulates records for the whole test: without clearing it, the
    # "Writing CDF..." assertion below would pass because of the first load.
    caplog.clear()
    load_data(acon=acon)

    assert "Writing CDF to external table..." in caplog.text
    assert "Cleaning CDF table..." in caplog.text
    assert "Vacuuming CDF table..." in caplog.text

    result_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/cdf_data",
        file_format=InputFormat.DELTAFILES.value,
    )

    assert result_df.count() == 6
# Exceptions that the "Error: ..." scenarios are allowed to raise.
_EXPECTED_NOTIFIER_ERRORS = (
    NotifierTemplateNotFoundException,
    NotifierConfigException,
    NotifierTemplateConfigException,
)


def _assert_last_email(smtp_server: SMTPServer, args: dict, scenario: dict) -> None:
    """Compare the last mail received by the debug smtp server with the args.

    `to`, `cc` and `bcc` are only compared when they are present in `args`.

    Args:
        smtp_server: running debug smtp server that received the mail.
        args: notification arguments used to send the mail.
        scenario: scenario under test (provides `expected` and, optionally,
            `expected_attachments`).
    """
    (
        email_from,
        email_to,
        email_cc,
        email_bcc,
        mimetype,
        subject,
        message,
        attachments,
    ) = _parse_email_output(smtp_server.get_last_message().as_string())

    assert email_from == args["from"]
    if "to" in args:
        assert email_to == args["to"]
    if "cc" in args:
        assert email_cc == args["cc"]
    if "bcc" in args:
        assert email_bcc == args["bcc"]
    assert mimetype == args["mimetype"]
    assert subject == args["subject"]
    assert message == scenario["expected"]
    assert attachments == scenario.get("expected_attachments", [])


@pytest.mark.parametrize(
    "scenario",
    [
        {
            "name": "Email Notification Template",
            "spec": TerminatorSpec(
                function="notify",
                args={
                    "server": "localhost",
                    "port": "1025",
                    "type": "email",
                    "template": "failure_notification_email",
                    "from": "test-email@email.com",
                    "cc": ["test-email1@email.com", "test-email2@email.com"],
                    "mimetype": "text/text",
                    "exception": "test-exception",
                },
            ),
            # Compared with `==` against the parsed message: keep the literal as is.
            "expected": """ Job local in workspace local has failed with the exception: test-exception""",
        },
        {
            "name": "Email Notification Free Form",
            "spec": TerminatorSpec(
                function="notify",
                args={
                    "server": "localhost",
                    "port": "1025",
                    "type": "email",
                    "from": "test-email@email.com",
                    "to": ["test-email1@email.com", "test-email2@email.com"],
                    "mimetype": "text/text",
                    "subject": "Test Email",
                    "message": "Test message for the email.",
                    "attachments": [
                        f"{TEST_ATTACHEMENTS_PATH}test_attachement.txt",
                        f"{TEST_ATTACHEMENTS_PATH}test_image.png",
                    ],
                },
            ),
            "expected": "Test message for the email.",
            "expected_attachments": ["test_attachement.txt", "test_image.png"],
        },
        {
            "name": "Email Notification Free Form",
            "spec": TerminatorSpec(
                function="notify",
                args={
                    "server": "localhost",
                    "port": "1025",
                    "type": "email",
                    "from": "test-email@email.com",
                    "to": ["test-email1@email.com", "test-email2@email.com"],
                    "mimetype": "text/html",
                    "subject": "Test Email",
                    "message": """Test message.""",
                },
            ),
            "expected": "Test message.",
        },
        {
            "name": "Error: non-existent template",
            "spec": TerminatorSpec(
                function="notify",
                args={
                    "server": "localhost",
                    "port": "1025",
                    "type": "email",
                    "template": "missing_template",
                },
            ),
            "expected": "Template missing_template does not exist",
        },
        {
            "name": "Error: malformed definition",
            "spec": TerminatorSpec(
                function="notify",
                args={
                    "server": "localhost",
                    "port": "1025",
                    "type": "email",
                    "from": "test-email@email.com",
                    "to": ["test-email1@email.com", "test-email2@email.com"],
                },
            ),
            "expected": "Malformed Notification Definition",
        },
        {
            "name": "Error: Using disallowed smtp server",
            "spec": TerminatorSpec(
                function="notify",
                args={
                    "server": "smtp.test.com",
                    "port": "1025",
                    "type": "email",
                    "from": "test-email@email.com",
                    "to": ["test-email1@email.com", "test-email2@email.com"],
                    "mimetype": "text/text",
                    "subject": "Test Email",
                    "message": "Test message for the email.",
                },
            ),
            "expected": "Trying to use disallowed smtp server: "
            "'smtp.test.com'.\n"
            "Disallowed smtp servers: ['smtp.test.com']",
        },
    ],
)
def test_email_notification(scenario: dict) -> None:
    """Test sending email notifications through the `EmailNotifier`.

    Scenarios whose name contains "Error: " must raise one of the notifier
    exceptions with the expected message. The other scenarios send the mail to
    a local debug smtp server and compare the received mail with the spec. The
    smtp server is always stopped, even when an assertion fails, so that the
    port is free for the next scenario.

    Args:
        scenario: scenario to test.
    """
    spec: TerminatorSpec = scenario["spec"]
    name = scenario["name"]
    expected_output = scenario["expected"]
    notification_type = spec.args["type"]

    LOGGER.info(f"Executing notification test: {name}")

    if notification_type != "email":
        return

    port = spec.args["port"]
    server = spec.args["server"]
    email_notifier = EmailNotifier(spec)

    if "Error: " in name:
        with pytest.raises(_EXPECTED_NOTIFIER_ERRORS) as e:
            email_notifier.create_notification()
            email_notifier.send_notification()
        assert expected_output in str(e.value)
        return

    smtp_server = SMTPServer(server, port)
    smtp_server.start()
    try:
        email_notifier.create_notification()
        email_notifier.send_notification()
        _assert_last_email(smtp_server, spec.args, scenario)
    finally:
        smtp_server.stop()


@pytest.mark.parametrize(
    "scenario",
    [
        {
            "name": "Email Notification Template",
            "args": {
                "server": "localhost",
                "port": "1025",
                "type": "email",
                "template": "failure_notification_email",
                "from": "test-email@email.com",
                "to": ["test-email1@email.com", "test-email2@email.com"],
                "cc": ["test-email3@email.com", "test-email4@email.com"],
                "exception": "test-exception",
            },
            # Compared with `==` against the parsed message: keep the literal as is.
            "expected": """ Job local in workspace local has failed with the exception: test-exception""",
        },
        {
            "name": "Email Notification Free Form",
            "args": {
                "server": "localhost",
                "port": "1025",
                "type": "email",
                "from": "test-email@email.com",
                "bcc": ["test-email1@email.com", "test-email2@email.com"],
                "mimetype": "text/text",
                "subject": "Test Email",
                "message": "Test message for the email.",
                "attachments": [
                    f"{TEST_ATTACHEMENTS_PATH}test_attachement.txt",
                    f"{TEST_ATTACHEMENTS_PATH}test_image.png",
                ],
            },
            "expected": "Test message for the email.",
            "expected_attachments": ["test_attachement.txt", "test_image.png"],
        },
        {
            "name": "Error: non-existent template",
            "args": {
                "server": "localhost",
                "port": "1025",
                "type": "email",
                "template": "missing_template",
            },
            "expected": "Template missing_template does not exist",
        },
        {
            "name": "Error: Malformed Notification Definition",
            "args": {
                "server": "localhost",
                "port": "1025",
                "type": "email",
                "from": "test-email@email.com",
                "to": ["test-email1@email.com", "test-email2@email.com"],
            },
            "expected": "Malformed Notification Definition",
        },
        {
            "name": "Error: Using disallowed smtp server",
            "args": {
                "server": "smtp.test.com",
                "port": "1025",
                "type": "email",
                "from": "test-email@email.com",
                "to": ["test-email1@email.com", "test-email2@email.com"],
                "mimetype": "plain",
                "subject": "Test Email",
                "message": "Test message for the email.",
            },
            "expected": "Trying to use disallowed smtp server: "
            "'smtp.test.com'.\n"
            "Disallowed smtp servers: ['smtp.test.com']",
        },
    ],
)
def test_email_notification_facade(scenario: dict) -> None:
    """Test sending email notifications through the `send_notification` facade.

    Scenarios whose name contains "Error: " must raise one of the notifier
    exceptions with the expected message. The other scenarios send the mail to
    a local debug smtp server and compare the received mail with the args. The
    smtp server is always stopped, even when an assertion fails, so that the
    port is free for the next scenario.

    Args:
        scenario: scenario to test.
    """
    args = scenario["args"]
    name = scenario["name"]
    expected_output = scenario["expected"]
    notification_type = args["type"]

    LOGGER.info(f"Executing notification test: {name}")

    if notification_type != "email":
        return

    port = args["port"]
    server = args["server"]

    if "Error: " in name:
        with pytest.raises(_EXPECTED_NOTIFIER_ERRORS) as e:
            send_notification(args=args)
        assert expected_output in str(e.value)
        return

    smtp_server = SMTPServer(server, port)
    smtp_server.start()
    try:
        send_notification(args=args)
        _assert_last_email(smtp_server, args, scenario)
    finally:
        smtp_server.stop()
""" email_from = re.search("(?<=From: ).*", mail_content).group() email_to = re.search("(?<=To: ).*", mail_content).group().split(", ") email_cc = re.search("(?<=CC: ).*", mail_content).group().split(", ") email_bcc = re.search("(?<=BCC: ).*", mail_content).group().split(", ") mimetype = re.search("(?<=Content-Type: ).*(?=; charset)", mail_content).group() subject = re.search("(?<=Subject: ).*", mail_content).group() message = re.search("(?<=bit\n).*?(?=--=)", mail_content, re.S).group()[1:-1] attachments = re.findall("""(?<=filename=").*(?=")""", mail_content) return ( email_from, email_to, email_cc, email_bcc, mimetype, subject, message, attachments, ) ================================================ FILE: tests/feature/test_reconciliation.py ================================================ """Test reconciliation.""" from typing import Any, List, Union import pytest from lakehouse_engine.algorithms.exceptions import ReconciliationFailedException from lakehouse_engine.algorithms.reconciliator import ReconciliationType from lakehouse_engine.engine import execute_reconciliation from tests.conftest import ( FEATURE_RESOURCES, LAKEHOUSE_FEATURE_CONTROL, LAKEHOUSE_FEATURE_IN, LAKEHOUSE_FEATURE_OUT, ) from tests.utils.local_storage import LocalStorage TEST_PATH = "reconciliation" TEST_RESOURCES = f"{FEATURE_RESOURCES}/{TEST_PATH}" TEST_LAKEHOUSE_IN = f"{LAKEHOUSE_FEATURE_IN}/{TEST_PATH}" TEST_LAKEHOUSE_CONTROL = f"{LAKEHOUSE_FEATURE_CONTROL}/{TEST_PATH}" TEST_LAKEHOUSE_OUT = f"{LAKEHOUSE_FEATURE_OUT}/{TEST_PATH}" ACON_WITH_QUERIES = { "metrics": [ { "metric": "net_sales", "type": "absolute", "aggregation": "sum", "yellow": 0.05, "red": 0.1, }, { "metric": "net_sales", "type": "percentage", "aggregation": "avg", "yellow": 0.04, "red": 0.08, }, ], "truth_input_spec": { "spec_id": "truth", "read_type": "batch", "data_format": "json", "options": {"multiline": "true"}, "location": "file:///app/tests/lakehouse/in/feature/" "reconciliation/data/truth.json", }, 
"truth_preprocess_query": """ SELECT country, sum(net_sales) as net_sales FROM truth GROUP BY country """, "truth_preprocess_query_args": [ { "function": "persist", "args": {"storage_level": "MEMORY_AND_DISK_DESER"}, } ], "current_input_spec": { "spec_id": "current_results", "read_type": "batch", "data_format": "json", "options": {"multiline": "true"}, "location": "file:///app/tests/lakehouse/in/feature/" "reconciliation/data/current.json", }, "current_preprocess_query": """ SELECT country, sum(net_sales) as net_sales FROM current GROUP BY country """, "current_preprocess_query_args": [ { "function": "persist", "args": {"storage_level": "MEMORY_AND_DISK"}, } ], } ACON_WITHOUT_QUERIES = { "metrics": [ { "metric": "net_sales", "type": "absolute", "aggregation": "sum", "yellow": 0.01, "red": 0.05, }, { "metric": "net_sales", "type": "absolute", "aggregation": "avg", "yellow": 0.04, "red": 0.08, }, ], "truth_input_spec": { "spec_id": "truth", "read_type": "batch", "data_format": "json", "options": {"multiline": "true"}, "location": "file:///app/tests/lakehouse/in/feature/" "reconciliation/data/truth.json", }, "truth_preprocess_query_args": [{"function": "cache"}], "current_input_spec": { "spec_id": "current_results", "read_type": "batch", "data_format": "json", "options": {"multiline": "true"}, "location": "file:///app/tests/lakehouse/in/feature/" "reconciliation/data/current.json", }, "current_preprocess_query_args": [], # turn cache off as it is a default } ACON_WITH_QUERIES_EMPTY_DF_TRUE_CHECK = { "metrics": [ { "metric": "net_sales", "type": "absolute", "aggregation": "sum", "yellow": 0.05, "red": 0.1, }, { "metric": "net_sales", "type": "percentage", "aggregation": "avg", "yellow": 0.04, "red": 0.08, }, ], "truth_input_spec": { "spec_id": "truth", "read_type": "batch", "data_format": "json", "options": {"multiline": "true"}, "location": "file:///app/tests/lakehouse/in/feature/" "reconciliation/data/truth.json", }, "truth_preprocess_query": """ SELECT country, 
sum(net_sales) as net_sales FROM truth where 1 = 0 group by country """, "truth_preprocess_query_args": [ { "function": "persist", "args": {"storage_level": "MEMORY_AND_DISK_DESER"}, } ], "current_input_spec": { "spec_id": "current_results", "read_type": "batch", "data_format": "json", "options": {"multiline": "true"}, "location": "file:///app/tests/lakehouse" "/in/feature/reconciliation/data/current.json", }, "current_preprocess_query": """ SELECT country, sum(net_sales) as net_sales FROM current WHERE 1 = 0 group by country """, "current_preprocess_query_args": [ { "function": "persist", "args": {"storage_level": "MEMORY_AND_DISK"}, } ], "ignore_empty_df": True, } ACON_WITH_QUERIES_EMPTY_DF_FALSE_CHECK = { "metrics": [ { "metric": "net_sales", "type": "absolute", "aggregation": "sum", "yellow": 0.05, "red": 0.1, }, { "metric": "net_sales", "type": "percentage", "aggregation": "avg", "yellow": 0.04, "red": 0.08, }, ], "truth_input_spec": { "spec_id": "truth", "read_type": "batch", "data_format": "json", "options": {"multiline": "true"}, "location": "file:///app/tests/lakehouse/in/feature/" "reconciliation/data/truth.json", }, "truth_preprocess_query": """ SELECT country, sum(net_sales) as net_sales FROM truth where 1 = 0 group by country """, "truth_preprocess_query_args": [ { "function": "persist", "args": {"storage_level": "MEMORY_AND_DISK_DESER"}, } ], "current_input_spec": { "spec_id": "current_results", "read_type": "batch", "data_format": "json", "options": {"multiline": "true"}, "location": "file:///app/tests/lakehouse/in/feature/" "reconciliation/data/current.json", }, "current_preprocess_query": """ SELECT country, sum(net_sales) as net_sales FROM current WHERE 1 = 0 group by country """, "current_preprocess_query_args": [ { "function": "persist", "args": {"storage_level": "MEMORY_AND_DISK"}, } ], "ignore_empty_df": False, } ACONS = { "with_queries_pct": ACON_WITH_QUERIES, "with_files_abs": ACON_WITHOUT_QUERIES, "failed_reconciliation_pct": 
ACON_WITH_QUERIES, "empty_truth": ACON_WITHOUT_QUERIES, "different_rows": ACON_WITHOUT_QUERIES, "empty_df_true_check": ACON_WITH_QUERIES_EMPTY_DF_TRUE_CHECK, "empty_df_false_check": ACON_WITH_QUERIES_EMPTY_DF_FALSE_CHECK, } @pytest.mark.parametrize( "scenario", [ [ "with_queries_pct", "current.json", "truth.json", None, "The Reconciliation process has succeeded.", ], [ "with_files_abs", "current.json", "truth.json", None, "The Reconciliation process has succeeded.", ], [ "failed_reconciliation_pct", "current_fail.json", "truth.json", "Reconciliation result: {'net_sales_absolute_diff_sum': 100.0, " "'net_sales_percentage_diff_avg': 0.0625}", "The Reconciliation process has failed with status: red.", ], [ "empty_truth", "current.json", "truth_empty.json", None, "The reconciliation has failed because either the truth dataset or the " "current results dataset was empty.", ], [ "different_rows", "current_different_rows.json", "truth_different_rows.json", "Reconciliation result: {'net_sales_absolute_diff_sum': 500.0, " "'net_sales_absolute_diff_avg': 100.0}", "The Reconciliation process has failed with status: red.", ], [ "empty_df_true_check", "current.json", "truth.json", None, "The Reconciliation process has succeeded.", ], [ "empty_df_false_check", "current.json", "truth.json", None, "The reconciliation has failed because either the truth dataset or the " "current results dataset was empty.", ], ], ) def test_reconciliation(scenario: str, caplog: Any) -> None: """Test reconciliation. Args: scenario: scenario to test. with_queries - uses queries to get the truth data and the current data. Reconciliation type is percentage. with_files - uses files for the truth data and query for the current data. Reconciliation type is absolute. failed_reconciliation - same as 'with_queries' but with a failed reconciliation. Reconciliation type is percentage. empty_truth - scenario in which the truth data is empty. 
@pytest.mark.parametrize(
    "scenario",
    [
        [
            "pass",
            ReconciliationType.PCT.value,
            0.05,
            0.1,
            "current_nulls_and_zeros",
            "truth_nulls_and_zeros",
            "Reconciliation result: {'net_sales_percentage_diff_sum': 0.0, "
            "'net_sales_percentage_diff_avg': 0.0}",
            "The Reconciliation process has succeeded.",
        ],
        [
            "fail_if_threshold_zero",
            ReconciliationType.PCT.value,
            0,
            0,
            "current_nulls_and_zeros_fail",
            "truth_nulls_and_zeros_fail",
            "Reconciliation result: {'net_sales_percentage_diff_sum': 1.0, "
            "'net_sales_percentage_diff_avg': 0.3333333333333333}",
            "The Reconciliation process has failed with status: red.",
        ],
        [
            "fail_null_is_not_zero",
            ReconciliationType.PCT.value,
            0.05,
            0.1,
            "current_nulls_and_zeros_fail",
            "truth_nulls_and_zeros_fail",
            "Reconciliation result: {'net_sales_percentage_diff_sum': 1.0, "
            "'net_sales_percentage_diff_avg': 0.3333333333333333}",
            "The Reconciliation process has failed with status: red.",
        ],
    ],
)
def test_nulls_and_zero_values_and_threshold(
    scenario: List[Union[str, float]], caplog: Any
) -> None:
    """Test truth and current datasets with nulls and zeros.

    The acon is built per test from ACON_WITHOUT_QUERIES instead of editing
    that module-level dict in place, so the metric type/thresholds and the
    input locations used here cannot leak into tests that run afterwards.

    Args:
        scenario: scenario to test.
            pass - reconciliation should pass even if there are 0s and nulls in
            the truth and current datasets.
            fail_if_threshold_zero - reconciliation should fail if users pass 0
            as threshold as of course 0 indicates there's no difference. If
            that's the threshold then it will indicate that the reconciliation
            has failed.
            fail_null_is_not_zero - reconciliation should fail if in the first
            record of the current data we have a 0, and in the corresponding
            row of the truth data we have a null, because that indicates a
            percentage difference of 1 according to the recon algorithm, and
            therefore, the reconciliation should present those differences
            properly, instead of assuming that 0 is equal to null.
            Scenario Properties: [scenario name, reconciliation type, yellow
            threshold, red threshold, current file, truth file, expected log
            excerpt, expected final message]
        caplog: captured log.
    """
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/*.json",
        f"{TEST_LAKEHOUSE_IN}/data/",
    )

    data_location = "file:///app/tests/lakehouse/in/feature/reconciliation/data"
    # fresh nested dicts/lists: nothing below touches the shared constant.
    acon = {
        **ACON_WITHOUT_QUERIES,
        "current_input_spec": {
            **ACON_WITHOUT_QUERIES["current_input_spec"],  # type: ignore
            "location": f"{data_location}/{scenario[4]}.json",
        },
        "truth_input_spec": {
            **ACON_WITHOUT_QUERIES["truth_input_spec"],  # type: ignore
            "location": f"{data_location}/{scenario[5]}.json",
        },
        "metrics": [
            {
                **metric,
                "type": scenario[1],
                "yellow": scenario[2],
                "red": scenario[3],
            }
            for metric in ACON_WITHOUT_QUERIES["metrics"]  # type: ignore
        ],
    }

    if scenario[0] in ["fail_null_is_not_zero", "fail_if_threshold_zero"]:
        with pytest.raises(ReconciliationFailedException) as e:
            execute_reconciliation(acon=acon)
        assert scenario[6] in caplog.text
        assert str(e.value) == scenario[7]
    else:
        execute_reconciliation(acon=acon)
        assert scenario[6] in caplog.text
        # the success message is logged, not raised.
        assert scenario[7] in caplog.text
@pytest.mark.parametrize(
    "scenario",
    [
        [
            "auto_merge_enabled_add_column",
            "part-02",
            "batch_delta_enabled",
            "control_schema_add_column",
        ],
        [
            "auto_merge_disabled_add_column",
            "part-02",
            "batch_delta_disabled",
            "control_schema_add_column",
        ],
        [
            "auto_merge_enabled_remove_column",
            "part-03",
            "batch_delta_enabled",
            "control_schema",
        ],
        [
            "auto_merge_disabled_remove_column",
            "part-03",
            "batch_delta_disabled",
            "control_schema",
            "customer",
        ],
        [
            "auto_merge_enabled_cast_column",
            "part-04",
            "batch_delta_enabled",
            "control_schema",
        ],
        [
            "auto_merge_disabled_cast_column",
            "part-04",
            "batch_delta_disabled",
            "control_schema",
        ],
        [
            "auto_merge_enabled_rename_column_file",
            "part-05",
            "batch_delta_enabled",
            "control_schema_rename",
        ],
        [
            "auto_merge_disabled_rename_column_file",
            "part-05",
            "batch_delta_disabled",
            "control_schema_rename",
            "request",
        ],
        [
            "auto_merge_enabled_rename_column_transform",
            "part-06",
            "batch_delta_enabled",
            "control_schema",
        ],
        [
            "auto_merge_disabled_rename_column_transform",
            "part-06",
            "batch_delta_disabled_rename",
            "control_schema",
            "ARTICLE",
        ],
    ],
)
def test_schema_evolution_delta_load(scenario: list) -> None:
    """Test schema evolution on delta loads.

    Args:
        scenario: scenario to test.
            auto_merge_enabled_add_column - it performs the merge successfully
            and the new column is added to the schema (older rows assume null
            value for this column)
            auto_merge_disabled_add_column - it performs the merge successfully
            but the new column is ignored (is not added to the final schema).
            auto_merge_enabled_remove_column - it performs the merge
            successfully, the column is not removed from the final schema and
            the new rows assume the value null for this column.
            auto_merge_disabled_remove_column - purposely checks that the delta
            load fails when a column is removed.
            auto_merge_enabled_cast_column - it performs the merge successfully
            but the column type does not change automatically in the final
            schema.
            auto_merge_disabled_cast_column - it performs the merge
            successfully but the column type does not change automatically in
            the final schema.
            auto_merge_enabled_rename_column_file - it performs the merge
            successfully but assumes the renamed column as a new column (the
            column is renamed in the source schema only).
            auto_merge_disabled_rename_column_file - purposely checks that the
            delta load fails when a column is renamed (the column is renamed in
            the source schema only).
            auto_merge_enabled_rename_column_transform - it performs the merge
            successfully but ignores the renaming transformation specified in
            the acon.
            auto_merge_disabled_rename_column_transform - checks the behavior
            of the delta load when a column is renamed to lowercase, based on a
            transformation specified in the acon, without spark case-sensitive
            property.
            Scenario Properties: [scenario name, input file, acon file,
            control schema file, error message excerpt (optional)]
    """
    scenario_name, input_file, acon_file, control_schema = scenario[:4]

    _create_table("schema_evolution_delta_load", "delta_load")

    # initial load
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/delta_load/data/source/part-01.csv",
        f"{TEST_LAKEHOUSE_IN}/delta_load/data/",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/delta_load/schema/source/source_part-01_schema.json",
        f"{TEST_LAKEHOUSE_IN}/delta_load/",
    )
    load_data(
        f"file://{TEST_RESOURCES}/delta_load/batch_init_"
        f"{'enabled' if 'enabled' in scenario_name else 'disabled'}.json"
    )

    initial_schema = DataframeHelpers.read_from_table(
        "test_db.schema_evolution_delta_load"
    ).schema

    # stage the delta (second) load for the scenario
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/delta_load/data/source/{input_file}.csv",
        f"{TEST_LAKEHOUSE_IN}/delta_load/data/",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/delta_load/schema/source/source_{input_file}_schema.json",
        f"{TEST_LAKEHOUSE_IN}/delta_load/source_delta_schema.json",
    )
    acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/delta_load/{acon_file}.json")

    # tests compared against control data: every auto merge enabled scenario,
    # plus the disabled rename-by-transformation scenario.
    if (
        "enabled" in scenario_name
        or scenario_name == "auto_merge_disabled_rename_column_transform"
    ):
        load_data(acon=acon)

        result_df = DataframeHelpers.read_from_file(
            f"{TEST_LAKEHOUSE_OUT}/delta_load/data",
            file_format=InputFormat.DELTAFILES.value,
        )
        schema_after_merge = DataframeHelpers.read_from_table(
            "test_db.schema_evolution_delta_load"
        ).schema

        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/delta_load/data/control/{input_file}.csv",
            f"{TEST_LAKEHOUSE_CONTROL}/delta_load/data/",
        )
        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/delta_load/schema/control/{control_schema}.json",
            f"{TEST_LAKEHOUSE_CONTROL}/delta_load/",
        )
        control_df = DataframeHelpers.read_from_file(
            f"{TEST_LAKEHOUSE_CONTROL}/delta_load/data/{input_file}.csv",
            schema=SchemaUtils.from_file_to_dict(
                f"file://{TEST_LAKEHOUSE_CONTROL}/delta_load/{control_schema}.json"
            ),
        )

        # for the cast and rename tests, based on the transformations
        # specified in the acon file, the schema changes are ignored
        if scenario_name in (
            "auto_merge_enabled_cast_column",
            "auto_merge_enabled_rename_column_transform",
        ):
            assert initial_schema == schema_after_merge
        else:
            assert not DataframeHelpers.has_diff(result_df, control_df)

    # tests with schema auto merge disabled
    elif "disabled" in scenario_name:
        # for "add column" and "cast column" tests the merge runs successfully
        # but the schema changes are ignored
        if "add" in scenario_name or "cast" in scenario_name:
            load_data(acon=acon)

            result_df = DataframeHelpers.read_from_file(
                f"{TEST_LAKEHOUSE_OUT}/delta_load/data",
                file_format=InputFormat.DELTAFILES.value,
            )

            if scenario_name == "auto_merge_disabled_add_column":
                assert "new_column" not in result_df.columns
            else:
                # check the column's data type: `result_df["code"]` is a
                # Column object, so an isinstance(..., str) test on it can
                # never fail. The table is created with `code int`.
                assert dict(result_df.dtypes)["code"] != "string"
        # for the removing column tests, the merge throws an error
        else:
            with pytest.raises(
                AnalysisException,
                match=f".*Cannot resolve {scenario[4]} in UPDATE clause given.*",
            ):
                load_data(acon=acon)
"auto_merge_enabled_rename_column_transform", "part-06", "batch_append_enabled", "control_schema", ], [ "auto_merge_disabled_rename_column_transform", "part-06", "batch_append_disabled", "control_schema", ], ], ) def test_schema_evolution_append_load(scenario: str) -> None: """Test schema evolution on append loads. Args: scenario: scenario to test. auto_merge_enabled_add_column - it performs the append load successfully and the new column is added to the schema (older rows assume null value for this column) auto_merge_disabled_add_column - purposely checks that the append load fails when a new column is added. auto_merge_enabled_remove_column - it performs the append load successfully, the column is not removed from the final schema and the new rows assume the value null for this column. auto_merge_disabled_remove_column - it performs the append load successfully, the column is not removed from the final schema and the new rows assume the value null for this column. auto_merge_enabled_cast_column - purposely checks that the append load fails when a cast transformation is added to the acon file. auto_merge_disabled_cast_column - purposely checks that the append load fails when a cast transformation is added to the acon file. auto_merge_enabled_rename_column_file - purposely checks that the append load fails when a column is renamed (the column is renamed in the source schema only). auto_merge_disabled_rename_column_file - purposely checks that the append load fails when a column is renamed (the column is renamed in the source schema only). auto_merge_enabled_rename_column_transform - it performs the append load successfully but ignores the renaming transformation specified in the acon. auto_merge_disabled_rename_column_transform - it performs the append load successfully but ignores the renaming transformation specified in the acon. 
Scenario Properties: [scenario name, input file, acon file, control schema file, error message excerpt (optional)] """ _create_table("schema_evolution_append_load", "append_load") # initial load LocalStorage.copy_file( f"{TEST_RESOURCES}/append_load/data/source/part-01.csv", f"{TEST_LAKEHOUSE_IN}/append_load/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/append_load/schema/source/source_part-01_schema.json", f"{TEST_LAKEHOUSE_IN}/append_load/", ) load_data( f"file://{TEST_RESOURCES}/append_load/batch_init_" f"{'enabled' if 'enabled' in scenario[0] else 'disabled'}.json" ) initial_schema = DataframeHelpers.read_from_table( "test_db.schema_evolution_append_load" ).schema LocalStorage.copy_file( f"{TEST_RESOURCES}/append_load/data/source/{scenario[1]}.csv", f"{TEST_LAKEHOUSE_IN}/append_load/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/append_load/schema/source/source_{scenario[1]}_schema.json", f"{TEST_LAKEHOUSE_IN}/append_load/source_append_schema.json", ) # tests with schema auto merge enabled if "enabled" in scenario[0]: # for the cast column test, the append throws an error acon = ConfigUtils.get_acon( f"file://{TEST_RESOURCES}/append_load/{scenario[2]}.json" ) if "cast" in scenario[0]: with pytest.raises(AnalysisException, match=f".*{scenario[4]}*"): load_data(acon=acon) else: load_data(acon=acon) result_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/append_load/data", file_format=InputFormat.DELTAFILES.value, ) schema_after_append = DataframeHelpers.read_from_table( "test_db.schema_evolution_append_load" ).schema LocalStorage.copy_file( f"{TEST_RESOURCES}/append_load/data/control/{scenario[1]}.csv", f"{TEST_LAKEHOUSE_CONTROL}/append_load/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/append_load/schema/control/{scenario[3]}.json", f"{TEST_LAKEHOUSE_CONTROL}/append_load/", ) control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/append_load/data/{scenario[1]}.csv", schema=SchemaUtils.from_file_to_dict( 
@pytest.mark.parametrize(
    "scenario",
    [
        [
            "auto_merge_enabled",
            "part-02",
            "batch_merge_enabled",
            "control_schema_merge_enabled",
        ],
        [
            "auto_merge_disabled",
            "part-02",
            "batch_merge_disabled",
            "",
            "Failed to merge",
        ],
        [
            "overwrite_schema",
            "part-02",
            "batch_overwrite",
            "control_schema_overwrite",
        ],
    ],
)
def test_schema_evolution_full_load(scenario: list) -> None:
    """Test schema evolution on full loads.

    Args:
        scenario: scenario to test.
            auto_merge_enabled - overwrites the data in the table but does not
            overwrite the schema (assumes the new column, keeps the removed
            column, ignores renaming and cast transformations)
            auto_merge_disabled - throws a mismatch schema error.
            overwrite_schema - overwrites the data and the schema of the table.
            Scenario Properties: [scenario name, input file, acon file,
            control schema file, error message excerpt (optional)]
    """
    scenario_name, input_file, acon_file, control_schema_file = scenario[:4]

    _create_table("schema_evolution_full_load", "full_load")

    # initial load
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/full_load/data/source/part-01.csv",
        f"{TEST_LAKEHOUSE_IN}/full_load/data/",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/full_load/schema/source/source_part-01_schema.json",
        f"{TEST_LAKEHOUSE_IN}/full_load/source_schema.json",
    )
    acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/full_load/batch_init.json")
    load_data(acon=acon)

    # second (full) load with the evolved source schema
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/full_load/data/source/{input_file}.csv",
        f"{TEST_LAKEHOUSE_IN}/full_load/data/",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/full_load/schema/source/source_{input_file}_schema.json",
        f"{TEST_LAKEHOUSE_IN}/full_load/source_schema.json",
    )
    acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/full_load/{acon_file}.json")

    if scenario_name == "auto_merge_disabled":
        with pytest.raises(AnalysisException) as e:
            load_data(acon=acon)
        # plain substring check: the expected excerpt is text, not a regex.
        assert scenario[4] in str(e.value)
    else:
        load_data(acon=acon)

        final_schema = SchemaUtils.from_table_schema(
            "test_db.schema_evolution_full_load"
        )
        result_df = DataframeHelpers.read_from_file(
            f"{TEST_LAKEHOUSE_OUT}/full_load/data",
            file_format=InputFormat.DELTAFILES.value,
        )

        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/full_load/schema/control/{control_schema_file}.json",
            f"{TEST_LAKEHOUSE_CONTROL}/full_load/",
        )
        control_schema = SchemaUtils.from_file(
            f"file://{TEST_LAKEHOUSE_CONTROL}/full_load/{control_schema_file}.json"
        )

        assert final_schema == control_schema
        # with the rename transformation specified in acon, both the original
        # and the renamed field (ARTICLE and article) are not considered in
        # the final schema
        # NOTE(review): this checks whether the *tuple* is an element of a
        # list of column names, so it always passes. Confirm which of the two
        # columns is really expected to be absent before turning it into
        # per-column checks.
        assert ("article", "ARTICLE") not in result_df.columns
@pytest.mark.parametrize(
    "scenario",
    [
        "1st_run",
        "has_new_data",
        "has_data_from_previous_execution",
        "upstream_acquired_new_data_but_not_processed",
        "no_new_data",
    ],
)
def test_table_sensor(scenario: str) -> None:
    """Test the feature of using a sensor to read from a delta table.

    The upstream delta table used here is itself a sensor control table,
    which is the setup needed by data products that trigger their pipelines
    from the sensor information of other data products.

    Only the `1st_run` scenario creates the tables, so the remaining scenarios
    depend on the state left behind by the previous ones.

    Scenarios:
        1st_run: initial setup; tables are created and no data is available.
        has_new_data: the first time the sensor detects new data from the
            upstream.
        has_data_from_previous_execution: the sensor does not detect new data
            from the upstream, but data acquired in a previous execution was
            never acknowledged as processed (e.g., the pipeline failed before
            completing all the tasks). The sensor status is then updated to
            acknowledge the processing.
        upstream_acquired_new_data_but_not_processed: the upstream sensor has
            acquired new data, but as it is not yet in processed state the
            downstream sensor cannot consider that new data is available.
        no_new_data: there's no new data from the upstream.

    Args:
        scenario: name of the scenario being tested.
    """
    upstream_table = "test_table_sensor_upstream"
    sensor_id = "sensor_id_1"
    control_db_table_name = "test_db.test_table_sensor"
    checkpoint_location = f"{TEST_LAKEHOUSE_IN}/test_table_sensor/"

    if scenario == "1st_run":
        # sensor control table of the data product under test.
        DataframeHelpers.create_delta_table(
            _TEST_SENSOR_DELTA_TABLE_SCHEMA,
            table="test_table_sensor",
        )
        # upstream control table; the sensor reads it through the change feed.
        DataframeHelpers.create_delta_table(
            _TEST_SENSOR_DELTA_TABLE_SCHEMA,
            table=upstream_table,
            enable_cdf=True,
        )

    if scenario == "has_new_data":
        _insert_data_into_upstream_table(upstream_table)
    elif scenario == "upstream_acquired_new_data_but_not_processed":
        _insert_data_into_upstream_table(
            upstream_table,
            values=(
                f"('sensor_id_upstream_1', array('dummy_upstream_asset_1'), "
                f"'{SensorStatus.ACQUIRED_NEW_DATA.value}', "
                f"'2023-05-30 23:29:49.079522', null, null, null)"
            ),
        )

    acon = {
        "sensor_id": sensor_id,
        "assets": ["dummy_asset_1"],
        "control_db_table_name": control_db_table_name,
        "input_spec": {
            "spec_id": "sensor_upstream",
            "read_type": "streaming",
            "data_format": "delta",
            "db_table": f"test_db.{upstream_table}",
            "options": {
                "readChangeFeed": "true",
            },
        },
        "preprocess_query": generate_sensor_query("sensor_id_upstream_1"),
        "base_checkpoint_location": checkpoint_location,
        "fail_on_empty_result": True,
    }

    if scenario in ["has_new_data", "has_data_from_previous_execution"]:
        has_new_data = execute_sensor(acon=acon)
        sensor_table_data = SensorControlTableManager.read_sensor_table_data(
            sensor_id=sensor_id, control_db_table_name=control_db_table_name
        )

        assert sensor_table_data.status == SensorStatus.ACQUIRED_NEW_DATA.value
        assert has_new_data

        if scenario == "has_data_from_previous_execution":
            # this is the final scenario where we should have data from
            # upstream. therefore, we checkpoint to indicate that sensor has
            # processed all the new data.
            update_sensor_status(
                sensor_id,
                control_db_table_name,
            )
            sensor_table_data = SensorControlTableManager.read_sensor_table_data(
                sensor_id=sensor_id, control_db_table_name=control_db_table_name
            )

            assert sensor_table_data.status == SensorStatus.PROCESSED_NEW_DATA.value
    else:
        with pytest.raises(NoNewDataException) as exception:
            execute_sensor(acon=acon)

        assert f"No data was acquired by {sensor_id} sensor." == str(exception.value)
@pytest.mark.parametrize(
    "scenario",
    [
        "1st_run",
        "2nd_run_with_new_data",
        "3rd_run_without_new_data",
        "4th_run_with_new_data",
    ],
)
def test_jdbc_sensor(scenario: str) -> None:
    """Test the feature of sensoring new data from a jdbc upstream.

    Scenario:
        1st_run - initial setup (control table and empty jdbc table).
        2nd_run_with_new_data - jdbc upstream has new data.
        3rd_run_without_new_data - jdbc upstream does not have new data.
        4th_run_with_new_data - jdbc upstream has new data again.

    Args:
        scenario: name of the scenario being tested.
    """
    sensor_id = "sensor_id_1"
    sensor_table = "test_jdbc_sensor"
    upstream_jdbc_table = "test_jdbc_sensor_upstream"
    control_db_table_name = f"test_db.{sensor_table}"
    expects_new_data = scenario in [
        "2nd_run_with_new_data",
        "4th_run_with_new_data",
    ]

    os.makedirs(f"{TEST_LAKEHOUSE_IN}/{upstream_jdbc_table}", exist_ok=True)

    if scenario == "1st_run":
        DataframeHelpers.create_delta_table(
            _TEST_SENSOR_DELTA_TABLE_SCHEMA,
            table=sensor_table,
        )
        _insert_into_jdbc_table(init=True)
    elif expects_new_data:
        # both "new data" runs push rows stamped with the current time.
        _insert_into_jdbc_table(time=datetime.now())

    jdbc_args = {
        "url": f"jdbc:sqlite:{TEST_LAKEHOUSE_IN}/" f"{upstream_jdbc_table}/tests.db",
        "table": upstream_jdbc_table,
        "properties": {"driver": "org.sqlite.JDBC"},
    }
    preprocess_query = generate_sensor_query(
        sensor_id=sensor_id,
        filter_exp="?upstream_key > '?upstream_value'",
        control_db_table_name=control_db_table_name,
        upstream_key="dummy_time",
    )
    acon = {
        "sensor_id": sensor_id,
        "assets": ["dummy_asset_1"],
        "control_db_table_name": control_db_table_name,
        "input_spec": {
            "spec_id": "sensor_upstream",
            "read_type": "batch",
            "data_format": "jdbc",
            "jdbc_args": jdbc_args,
        },
        "preprocess_query": preprocess_query,
        "fail_on_empty_result": True,
    }

    if not expects_new_data:
        # without new rows upstream the sensor is expected to fail.
        with pytest.raises(NoNewDataException) as exception:
            execute_sensor(acon=acon)

        assert f"No data was acquired by {sensor_id} sensor." == str(exception.value)
        return

    has_new_data = execute_sensor(acon=acon)
    acquired_row = SensorControlTableManager.read_sensor_table_data(
        sensor_id=sensor_id, control_db_table_name=control_db_table_name
    )
    assert acquired_row.status == SensorStatus.ACQUIRED_NEW_DATA.value

    # acknowledge the processing of the data acquired by the sensor.
    update_sensor_status(
        sensor_id,
        control_db_table_name,
    )
    processed_row = SensorControlTableManager.read_sensor_table_data(
        sensor_id=sensor_id, control_db_table_name=control_db_table_name
    )
    assert processed_row.status == SensorStatus.PROCESSED_NEW_DATA.value
    assert has_new_data
def _insert_into_jdbc_table(
    init: bool = False,
    time: datetime = None,
) -> None:
    """Write test rows into the sqlite table used as jdbc upstream.

    Args:
        init: when True, an empty dataframe is written (table
            initialisation); otherwise two dummy rows are written.
        time: value stored (as string) in the dummy_time field of the dummy
            rows, so that time-based filters can detect new upstream data.
    """
    jdbc_table = "test_jdbc_sensor_upstream"
    schema = StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ("dummy_field", "dummy_time")
        ]
    )

    # no rows at all when only initialising the table.
    rows = [] if init else [[label, str(time)] for label in ("a", "b")]
    df = ExecEnv.SESSION.createDataFrame(rows, schema)

    DataframeHelpers.write_into_jdbc_table(
        df,
        f"jdbc:sqlite:{TEST_LAKEHOUSE_IN}/{jdbc_table}/tests.db",
        jdbc_table,
    )
@pytest.fixture(scope="module")
def sftp_client(sftpserver: SFTPServer) -> Generator:
    """Provide an SFTP client connected to the local test server.

    Args:
        sftpserver: a local SFTP-Server provided by the plugin
            pytest-sftpserver.

    Yields:
        The SFTP client; the client and its transport are closed once the
        fixture is torn down.
    """
    server_address = (sftpserver.host, sftpserver.port)
    connect_kwargs = {
        "hostkey": None,
        "username": "a",
        "password": "b",
        "pkey": None,
        "gss_host": None,
        "gss_auth": False,
        "gss_kex": False,
        "gss_deleg_creds": True,
        "gss_trust_dns": True,
    }

    transport = Transport(server_address)
    transport.connect(**connect_kwargs)
    client = SFTPClient.from_transport(transport)

    yield client

    client.close()
    transport.close()
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "scenario_name": "sftp_gz_file",
            "test_name": "compressed_gz_file",
            "sftp_files_format": "csv",
            "file_name": "file6.compress",
            "file_extension": ".gz",
        },
    ],
)
def test_sftp_reader_gz_file(
    sftp_client: SFTPClient,
    sftpserver: SFTPServer,
    scenario: dict,
    remote_location: dict,
) -> None:
    """Test loads from sftp source - compressed gz type.

    This test covers a connection using keys and a scenario of extracting a
    compressed gz file.

    Args:
        sftp_client: sftp client used to perform tests.
        sftpserver: a local SFTP-Server created by pytest_sftpserver.
        scenario: scenario being tested.
        remote_location: serve files on remote location.
    """
    LOGGER.info(f"Starting Scenario {scenario['scenario_name']}")

    # serve a copy so the module-scoped fixture content is not shared.
    served_content = deepcopy(remote_location)
    with sftpserver.serve_content(served_content):
        rename_remote_files(sftp_client)

        connection_options = {
            "hostname": sftpserver.host,
            "username": "dummy_user",
            "password": "dummy_password",
            "port": sftpserver.port,
        }
        public_key = LocalStorage.read_file(SERVER_KEY_PUBLIC).split()[1]
        option_params = {
            **connection_options,
            "key_type": "RSA",
            "pkey": public_key,
            "file_name_contains": "file6",
            "args": {"sep": "|"},
        }

        _execute_and_validate(_get_test_acon(scenario, option_params), scenario)
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "scenario_name": "sftp_mult_files",
            "test_name": "file_name_contains",
            "sftp_files_format": "csv",
            "file_name": "*",
            "file_extension": ".csv",
        }
    ],
)
def test_sftp_reader_mult_files(
    sftp_client: SFTPClient,
    sftpserver: SFTPServer,
    scenario: dict,
    remote_location: dict,
) -> None:
    """Test loads from sftp source - multiple files.

    This test covers a connection with add auto policy and filters the remote
    files by the scenario file extension, passing additional read args.

    Args:
        sftp_client: sftp client used to perform tests.
        sftpserver: a local SFTP-Server created by pytest_sftpserver.
        scenario: scenario being tested.
        remote_location: serve files on remote location.
    """
    LOGGER.info(f"Starting Scenario {scenario['scenario_name']}")

    # serve a copy so the module-scoped fixture content is not shared.
    served_content = deepcopy(remote_location)
    with sftpserver.serve_content(served_content):
        rename_remote_files(sftp_client)

        connection_options = {
            "hostname": sftpserver.host,
            "username": "dummy_user",
            "password": "dummy_password",
            "port": sftpserver.port,
        }
        option_params = {
            **connection_options,
            "add_auto_policy": True,
            "file_name_contains": scenario["file_extension"],
            "args": {"sep": "|"},
        }

        _execute_and_validate(_get_test_acon(scenario, option_params), scenario)
def _execute_and_validate(
    acon: dict,
    scenario: dict,
) -> None:
    """Run the load and check the written data against the control file.

    The control dataframe is read from the local test resources with a reader
    that depends on the scenario name (text, json, xml, zipped csv or the
    default reader of the test helpers).

    Args:
        acon: acon dict to be tested.
        scenario: scenario to be tested.
    """
    load_data(acon=acon)

    result = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario['scenario_name']}/{scenario['test_name']}/data"
    )

    scenario_name = scenario["scenario_name"]
    control_path = (
        f"{TEST_RESOURCES}/data/{scenario['file_name']}"
        f"{scenario['file_extension']}"
    )

    if scenario_name == "sftp_fwf":
        text_reader = ExecEnv.SESSION.read.format("text").option("lineSep", "\n")
        control = text_reader.load(control_path)
    elif scenario_name == "sftp_json":
        control = DataframeHelpers.read_from_file(control_path, file_format="json")
    elif scenario_name == "sftp_xml":
        xml_reader = ExecEnv.SESSION.read.format("xml").option("rowTag", "row")
        control = xml_reader.load(control_path)
    elif scenario_name == "sftp_zip_file":
        with ZipFile(control_path, "r") as archive:
            # the control data is the first member of the archive.
            first_member = archive.namelist()[0]
            pandas_df = pd.read_csv(
                TextIOWrapper(archive.open(first_member)), sep="|"
            )
        control = ExecEnv.SESSION.createDataFrame(pandas_df)
    else:
        control = DataframeHelpers.read_from_file(control_path)

    assert not DataframeHelpers.has_diff(result, control)
@pytest.fixture(scope="module")
def remote_location() -> dict:
    """Get files to serve on a remote sftp location.

    Every file listed in `FILES` is registered under the `remote_location`
    folder, keyed by its name without extension:
        - `.gz` files are decompressed and compressed again into bytes.
        - `.zip` files are expected to hold a `<file_name>.csv` member, which
          is re-packed into an in-memory zip archive and placed under the
          `sub_dir` sub folder.
        - any other file is served with the content read from disk.

    Returns:
        A dict with the files for the remote location configured.
    """
    served_files: dict = {}

    for file in FILES:
        local_file = f"{LOCAL_PATH}{file}"

        if file.endswith(".gz"):
            file_name = file.rsplit(".", 1)[0]
            with gzip.GzipFile(local_file, "rb") as compressed_file:
                file_data_string = compressed_file.read().decode()
            served_files[file_name] = gzip.compress(file_data_string.encode("utf-8"))
        elif file.endswith(".zip"):
            file_name = file.rsplit(".", 1)[0]
            with ZipFile(local_file, "r") as source_zip:
                with source_zip.open(f"{file_name}.csv") as zfile:
                    data = zfile.read().decode()

            bytesfile = io.BytesIO()
            # leaving the context closes the archive, which finalises it
            # before the bytes are collected.
            with ZipFile(bytesfile, mode="w") as target_zip:
                target_zip.writestr(f"{file_name}.csv", data)

            # setdefault keeps previously registered zip files; resetting
            # "sub_dir" on each iteration would only preserve the last one.
            served_files.setdefault("sub_dir", {})[file_name] = bytesfile.getvalue()
        else:
            file_name = file.split(".")[0]
            served_files[file_name] = LocalStorage.read_file(local_file)

    return {"remote_location": served_files}
in FILES: file_name = file.rsplit(".", 1)[0] try: sftp_client.rename( f"/remote_location/{file_name}", f"/remote_location/{file}", ) except IOError: pass try: sftp_client.rename( f"/remote_location/sub_dir/{file_name}", f"/remote_location/sub_dir/{file}", ) except IOError: pass ================================================ FILE: tests/feature/test_sharepoint_reader.py ================================================ """Test Sharepoint reader.""" import json from pathlib import Path from typing import Any, Dict, List, Set from unittest.mock import Mock, patch import pytest from lakehouse_engine.core.definitions import SharepointFile from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.engine import load_data from tests.conftest import FEATURE_RESOURCES from tests.utils.local_storage import LocalStorage from tests.utils.mocks import MockRESTResponse TEST_NAME = "sharepoint" TEST_RESOURCES = f"{FEATURE_RESOURCES}/{TEST_NAME}" TEST_SCENARIOS_READER_SUCCESS: List[List[str]] = [ ["reader", "read_single_csv_success"], ["reader", "read_single_csv_full_path_success"], ["reader", "read_folder_csv_success"], ["reader", "read_folder_csv_pattern_success"], ["reader", "read_single_csv_archive_enabled_success"], ["reader", "read_folder_csv_archive_enabled_success"], ["reader", "read_single_csv_archive_default_enabled_success"], ["reader", "read_single_csv_archive_success_subfolder_override_success"], ["reader", "read_folder_csv_archive_success_subfolder_override_success"], ] TEST_SCENARIOS_READER_FAILURES: List[List[str]] = [ [ "reader", "read_folder_csv_one_file_schema_mismatch_should_archive_error", r"Schema mismatch", ], ["reader", "read_single_csv_empty_file_should_archive_error", r"is empty"], [ "reader", "read_folder_csv_no_csv_files_should_fail", r"No CSV files found in folder: sp_test", ], [ "reader", "read_folder_csv_pattern_matches_no_files_should_fail", r"No CSV files found in folder: sp_test", ], [ "reader", 
"read_folder_csv_one_file_schema_mismatch_" "custom_error_subfolder_should_archive_error", r"Schema mismatch", ], [ "reader", "read_single_csv_download_error_should_archive_error", r"Download failed", ], [ "reader", "read_single_csv_spark_load_fails_should_archive_error", r"Failed to read Sharepoint file", ], ] TEST_SCENARIOS_READER_EXCEPTIONS: List[List[str]] = [ [ "reader", "read_single_csv_full_path_with_file_name_should_fail", "When `folder_relative_path` points to a file, `file_name` must be None.", ], [ "reader", "read_folder_path_does_not_exist_should_fail", "Folder 'missing_folder' does not exist in Sharepoint.", ], [ "reader", "read_file_name_and_file_pattern_conflict_should_fail", "Conflicting options: provide either `file_name` or `file_pattern`", ], [ "reader", "read_file_name_unsupported_extension_should_fail", "`file_name` must end with one of", ], [ "reader", "read_folder_relative_path_looks_like_file_unsupported_extension_should_fail", "`folder_relative_path` appears to be a file path but does not end with one of", ], [ "reader", "read_unsupported_file_type_should_fail", "`file_type` must be one of", ], [ "reader", "read_single_csv_full_path_with_file_pattern_should_fail", "When `folder_relative_path` points to a file, `file_pattern` must be None.", ], [ "reader", "read_single_csv_full_path_with_file_type_should_fail", "When `folder_relative_path` points to a file, `file_type` must be None", ], ] # Helper functions def _read_bytes(path_value: str) -> bytes: """Read a test file as bytes.""" return Path(path_value).read_bytes() def _get_output_path_by_scenario() -> Dict[str, str]: """Return the delta output location for each success scenario.""" return { "read_single_csv_success": ( "/app/tests/lakehouse/out/feature/sharepoint/reader/delta/" ), "read_single_csv_full_path_success": ( "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_full_path/" ), "read_folder_csv_success": ( "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder/" ), 
"read_folder_csv_pattern_success": ( "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_pattern/" ), "read_single_csv_archive_enabled_success": ( "/app/tests/lakehouse/out/feature/sharepoint/" "reader/delta_single_archive_enabled/" ), "read_folder_csv_archive_enabled_success": ( "/app/tests/lakehouse/out/feature/sharepoint/" "reader/delta_folder_archive_enabled/" ), "read_single_csv_archive_default_enabled_success": ( "/app/tests/lakehouse/out/feature/sharepoint/" "reader/delta_single_archive_default_enabled/" ), "read_single_csv_archive_success_subfolder_override_success": ( "/app/tests/lakehouse/out/feature/sharepoint/" "reader/delta_single_archive_success_subfolder_override/" ), "read_folder_csv_archive_success_subfolder_override_success": ( "/app/tests/lakehouse/out/feature/sharepoint/" "reader/delta_folder_archive_success_subfolder_override/" ), } def _setup_sharepoint_reader_mocks_for_success( scenario_name: str, mock_list_items_in_path: Mock, mock_get_file_metadata: Mock, ) -> None: """Configure SharePoint mocks used by Sharepoint reader success scenarios. Args: scenario_name: Test scenario identifier. mock_list_items_in_path: Mock for SharepointUtils.list_items_in_path. mock_get_file_metadata: Mock for SharepointUtils.get_file_metadata. 
""" is_folder_read_scenario = scenario_name.startswith("read_folder_") if is_folder_read_scenario: mock_list_items_in_path.return_value = [ {"name": "sample_1.csv", "createdDateTime": "", "lastModifiedDateTime": ""}, {"name": "sample_2.csv", "createdDateTime": "", "lastModifiedDateTime": ""}, {"name": "other.csv", "createdDateTime": "", "lastModifiedDateTime": ""}, {"name": "ignore.txt", "createdDateTime": "", "lastModifiedDateTime": ""}, ] file_bytes_by_path: Dict[str, bytes] = { "sp_test/sample_1.csv": _read_bytes( f"{TEST_RESOURCES}/reader/data/sample_1.csv" ), "sp_test/sample_2.csv": _read_bytes( f"{TEST_RESOURCES}/reader/data/sample_2.csv" ), "sp_test/other.csv": _read_bytes(f"{TEST_RESOURCES}/reader/data/other.csv"), } def get_file_metadata_side_effect_for_folder(file_path: str) -> SharepointFile: """Side effect function for `get_file_metadata` mock in folder scenarios.""" return SharepointFile( file_name=file_path.split("/")[-1], time_created="", time_modified="", content=file_bytes_by_path[file_path], _folder=file_path.rsplit("/", 1)[0], ) mock_get_file_metadata.side_effect = get_file_metadata_side_effect_for_folder return content = _read_bytes(f"{TEST_RESOURCES}/reader/data/sample_1.csv") def get_file_metadata_side_effect_for_single_file(file_path: str) -> SharepointFile: """Side effect function for `get_file_metadata` mock in single file scenarios. Args: file_path: The path of the file for which metadata is being requested. Returns: A SharepointFile object with the content set to the bytes read from the test file. 
def _assert_archive_calls_for_success(
    scenario_name: str,
    mock_archive_sharepoint_file: Mock,
) -> None:
    """Check how the archive helper was invoked after a successful read.

    Folder scenarios must hit the archive mock the number of times registered for
    that scenario; single-file scenarios must hit it exactly once. Every call must
    carry the expected `move_enabled` flag and, when moving is expected, a
    `to_path` ending with the scenario's success subfolder ("done" unless the
    scenario overrides it).

    Args:
        scenario_name: Test scenario identifier.
        mock_archive_sharepoint_file: Mock for
            SharepointUtils.archive_sharepoint_file.
    """
    archive_calls_per_folder_scenario: Dict[str, int] = {
        "read_folder_csv_success": 3,
        "read_folder_csv_pattern_success": 2,
        "read_folder_csv_archive_enabled_success": 3,
        "read_folder_csv_archive_success_subfolder_override_success": 3,
    }
    folder_scenarios_with_move: Set[str] = {
        "read_folder_csv_archive_enabled_success",
        "read_folder_csv_archive_success_subfolder_override_success",
    }
    single_file_scenarios_with_move: Set[str] = {
        "read_single_csv_archive_enabled_success",
        "read_single_csv_archive_default_enabled_success",
        "read_single_csv_archive_success_subfolder_override_success",
    }
    overridden_success_subfolders: Dict[str, str] = {
        "read_single_csv_archive_success_subfolder_override_success": "processed",
        "read_folder_csv_archive_success_subfolder_override_success": "processed",
    }

    reads_folder = scenario_name.startswith("read_folder_")
    move_scenarios = (
        folder_scenarios_with_move
        if reads_folder
        else single_file_scenarios_with_move
    )
    move_expected = scenario_name in move_scenarios
    target_suffix = "/" + overridden_success_subfolders.get(scenario_name, "done")

    def _check_archive_call(archive_call: Any) -> None:
        """Validate the keyword arguments of a single archive invocation."""
        assert archive_call.kwargs["move_enabled"] is move_expected
        if not move_expected:
            # The destination is only checked when a move is expected.
            return
        destination = archive_call.kwargs["to_path"]
        assert destination is not None
        assert destination.endswith(target_suffix)

    if not reads_folder:
        mock_archive_sharepoint_file.assert_called_once()
        _check_archive_call(mock_archive_sharepoint_file.call_args)
        return

    assert (
        mock_archive_sharepoint_file.call_count
        == archive_calls_per_folder_scenario[scenario_name]
    )
    for archive_call in mock_archive_sharepoint_file.call_args_list:
        _check_archive_call(archive_call)
@patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils._make_request") @pytest.mark.parametrize("scenario", TEST_SCENARIOS_READER_SUCCESS) def test_sharepoint_reader_success( mock_make_request: Any, mock_get_token: Any, mock_create_app: Any, mock_check_if_endpoint_exists: Any, mock_list_items_in_path: Any, mock_get_file_metadata: Any, mock_archive_sharepoint_file: Any, scenario: List[str], ) -> None: """Test Sharepoint reader happy paths (single file, full path, folder).""" scenario_name = scenario[1] output_path_by_scenario = _get_output_path_by_scenario() mock_archive_sharepoint_file.return_value = None mock_make_request.return_value = None _setup_sharepoint_reader_mocks_for_success( scenario_name=scenario_name, mock_list_items_in_path=mock_list_items_in_path, mock_get_file_metadata=mock_get_file_metadata, ) output_path = output_path_by_scenario[scenario_name] LocalStorage.clean_folder(output_path) load_data(f"file://{TEST_RESOURCES}/{scenario[0]}/acons/{scenario_name}.json") _assert_archive_calls_for_success( scenario_name=scenario_name, mock_archive_sharepoint_file=mock_archive_sharepoint_file, ) _assert_sharepoint_reader_success_output( scenario_name=scenario_name, output_path=output_path, ) @patch( "lakehouse_engine.utils.sharepoint_utils.SharepointUtils.archive_sharepoint_file" ) @patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils.get_file_metadata") @patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils.list_items_in_path") @patch( "lakehouse_engine.utils.sharepoint_utils.SharepointUtils.check_if_endpoint_exists", return_value=True, ) @patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils._create_app") @patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils._get_token") @patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils._make_request") @pytest.mark.parametrize("scenario", TEST_SCENARIOS_READER_FAILURES) def test_sharepoint_reader_failures( mock_make_request: Any, mock_get_token: Any, 
mock_create_app: Any, mock_check_if_endpoint_exists: Any, mock_list_items_in_path: Any, mock_get_file_metadata: Any, mock_archive_sharepoint_file: Any, scenario: List[str], tmp_path: Path, ) -> None: """Test Sharepoint reader runtime failure scenarios. This test covers failures that happen during file processing (for example schema mismatches, empty files, or folder contents that result in non readable CSVs). These are different from `test_sharepoint_reader_exceptions`, which validates fail-fast configuration errors (invalid option combinations, unsupported file types) that should raise before any file processing starts. For runtime failures where archiving is enabled, the reader should move the problematic file(s) to the configured error subfolder (default: "error"). The assertions at the end verify: - the job failed with the expected error message - archiving was invoked with `move_enabled=True` - the archive target folder matches the expected error subfolder - the archived file is one of the files involved in the scenario """ scenario_name = scenario[1] expected_error_regex = scenario[2] mock_archive_sharepoint_file.return_value = None mock_make_request.return_value = None should_assert_no_archive_calls = False expected_error_subfolder = "error" allowed_file_names: Set[str] = set() should_patch_spark_load = False # Scenario-specific mocking + expectations (no load_data here) if "schema_mismatch" in scenario_name: expected_error_subfolder = ( "failed" if "custom_error_subfolder" in scenario_name else "error" ) allowed_file_names = {"sample_1.csv", "bad_schema.csv"} mock_list_items_in_path.return_value = [ {"name": "sample_1.csv", "createdDateTime": "", "lastModifiedDateTime": ""}, { "name": "bad_schema.csv", "createdDateTime": "", "lastModifiedDateTime": "", }, ] file_bytes_by_path: Dict[str, bytes] = { "sp_test/sample_1.csv": _read_bytes( f"{TEST_RESOURCES}/reader/data/sample_1.csv" ), "sp_test/bad_schema.csv": _read_bytes( 
f"{TEST_RESOURCES}/reader/data/bad_schema.csv" ), } def get_file_metadata_side_effect(file_path: str) -> SharepointFile: return SharepointFile( file_name=file_path.split("/")[-1], time_created="", time_modified="", content=file_bytes_by_path[file_path], _folder=file_path.rsplit("/", 1)[0], ) mock_get_file_metadata.side_effect = get_file_metadata_side_effect elif scenario_name == "read_single_csv_empty_file_should_archive_error": allowed_file_names = {"empty.csv"} def get_file_metadata_side_effect(file_path: str) -> SharepointFile: return SharepointFile( file_name="empty.csv", time_created="", time_modified="", content=b"", _folder="sp_test", ) mock_get_file_metadata.side_effect = get_file_metadata_side_effect elif scenario_name == "read_folder_csv_no_csv_files_should_fail": should_assert_no_archive_calls = True mock_list_items_in_path.return_value = [ {"name": "ignore.txt", "createdDateTime": "", "lastModifiedDateTime": ""}, {"name": "readme.md", "createdDateTime": "", "lastModifiedDateTime": ""}, ] elif scenario_name == "read_folder_csv_pattern_matches_no_files_should_fail": should_assert_no_archive_calls = True mock_list_items_in_path.return_value = [ {"name": "sample_1.csv", "createdDateTime": "", "lastModifiedDateTime": ""}, {"name": "sample_2.csv", "createdDateTime": "", "lastModifiedDateTime": ""}, {"name": "other.csv", "createdDateTime": "", "lastModifiedDateTime": ""}, ] elif scenario_name == "read_single_csv_download_error_should_archive_error": allowed_file_names = {"sample_1.csv"} first_sharepoint_file = SharepointFile( file_name="sample_1.csv", time_created="", time_modified="", content=b"not-empty", _folder="sp_test", ) mock_get_file_metadata.side_effect = [ first_sharepoint_file, ValueError("Download failed"), ] elif scenario_name == "read_single_csv_spark_load_fails_should_archive_error": should_patch_spark_load = True allowed_file_names = {"sample_1.csv"} sp_file_first = SharepointFile( file_name="sample_1.csv", time_created="", time_modified="", 
content=b"col_a,col_b\n1,2\n", _folder="sp_test", ) sp_file_second = SharepointFile( file_name="sample_1.csv", time_created="", time_modified="", content=b"col_a,col_b\n1,2\n", _folder="sp_test", ) mock_get_file_metadata.side_effect = [sp_file_first, sp_file_second] else: raise ValueError(f"Unhandled failure scenario: {scenario_name}") # Execute + assert error (exactly once per scenario) acon_path = f"file://{TEST_RESOURCES}/{scenario[0]}/acons/{scenario_name}.json" if should_patch_spark_load: fake_local_file: Path = tmp_path / "fake.csv" fake_local_file.write_text("dummy") with ( patch( "lakehouse_engine.utils.sharepoint_utils." "SharepointUtils.save_to_staging_area", return_value=str(fake_local_file), ), patch( "pyspark.sql.readwriter.DataFrameReader.load", side_effect=Exception("Spark load failed"), ), ): with pytest.raises(ValueError, match=expected_error_regex): load_data(acon_path) else: with pytest.raises(ValueError, match=expected_error_regex): load_data(acon_path) # For scenarios that fail before reading any CSV file (folder contains no CSVs, or # the pattern filters everything out), there is no concrete CSV file to archive. # We assert no archive attempts are made. if should_assert_no_archive_calls: assert mock_archive_sharepoint_file.call_count == 0 assert mock_get_file_metadata.call_count == 0 return # For processing-time failures, the reader should attempt to archive the failing # file(s) into the configured error subfolder (default: "error"). # We assert at least one archive call targeted that error folder with move enabled, # and that the archived file belongs to this scenario. 
def _load_reader_mock_json(mock_name: str) -> Any:
    """Load a JSON payload from the Sharepoint reader mocks folder.

    The file is read through `Path.read_text`, which closes the handle as soon as
    the content is read (a bare `open(...).read()` leaves that to the garbage
    collector).

    Args:
        mock_name: File name of the mock, without the `.json` extension.

    Returns:
        The parsed JSON content of the mock file.
    """
    mock_path = Path(f"{TEST_RESOURCES}/reader/mocks/{mock_name}.json")
    return json.loads(mock_path.read_text())


@pytest.mark.parametrize("scenario", TEST_SCENARIOS_READER_EXCEPTIONS)
@patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils._create_app")
@patch(
    "lakehouse_engine.utils.sharepoint_utils.SharepointUtils._get_token",
    return_value="fake-token",
)
@patch(
    "lakehouse_engine.utils.sharepoint_utils.SharepointUtils._make_request",
    side_effect=[
        # site id
        MockRESTResponse(
            status_code=200,
            json_data=_load_reader_mock_json("get_site_id"),
        ),
        # drive id
        MockRESTResponse(
            status_code=200,
            json_data=_load_reader_mock_json("get_drive_id"),
        ),
    ],
)
@patch(
    "lakehouse_engine.utils.sharepoint_utils.SharepointUtils.check_if_endpoint_exists",
    return_value=True,
)
def test_sharepoint_reader_exceptions(
    mock_check_if_endpoint_exists: Any,
    mock_make_request: Any,
    mock_get_token: Any,
    mock_create_app: Any,
    scenario: List[str],
) -> None:
    """Test Sharepoint reader invalid configs that must fail fast.

    Each scenario is `[resources sub-folder, acon name, expected error regex]`.
    The mock arguments follow the `@patch` decorators bottom-up.

    Args:
        mock_check_if_endpoint_exists: patched endpoint check (True by default).
        mock_make_request: patched request helper returning the site id and
            drive id mock responses, in that order.
        mock_get_token: patched token getter.
        mock_create_app: patched app creation.
        scenario: scenario to test.
    """
    scenario_name = scenario[1]

    # Only this scenario needs the endpoint check to report a missing folder.
    if scenario_name == "read_folder_path_does_not_exist_should_fail":
        mock_check_if_endpoint_exists.return_value = False

    with pytest.raises(ValueError, match=scenario[2]):
        load_data(f"file://{TEST_RESOURCES}/{scenario[0]}/acons/{scenario_name}.json")
# Each scenario is [resources sub-folder, scenario/acon name, expected error message].
# The exceptions test reads index 0 as the folder, index 1 as the scenario name and
# index 2 as the message, so all three elements are required.
TEST_SCENARIOS_EXCEPTIONS: List[List[str]] = [
    [
        "writer",
        "streaming_exception",
        "Sharepoint writer doesn't support streaming!",
    ],
    [
        "writer",
        "drive_exception",
        "Please provide all mandatory Sharepoint options. \n"
        "Expected: site_name, drive_name and local_path. "
        "Value should not be None.\n"
        "Provided: site_name=mock_site, \n"
        "drive_name=, \n"
        "local_path=mock_path",
    ],
    [
        "writer",
        "site_exception",
        "Please provide all mandatory Sharepoint options. \n"
        "Expected: site_name, drive_name and local_path. "
        "Value should not be None.\n"
        "Provided: site_name=, \n"
        "drive_name=mock_drive, \n"
        "local_path=mock_path",
    ],
    [
        "writer",
        "local_path_exception",
        "Please provide all mandatory Sharepoint options. \n"
        "Expected: site_name, drive_name and local_path. "
        "Value should not be None.\n"
        "Provided: site_name=mock_site, \n"
        "drive_name=mock_drive, \n"
        "local_path=",
    ],
    ["writer", "endpoint_exception", "The provided endpoint does not exist!"],
]
""" mock_sharepoint_utils = MagicMock() mock_get_sharepoint_utils.return_value = mock_sharepoint_utils LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario[0]}/data/file_source.csv", f"{TEST_LAKEHOUSE_IN}/data/", ) if scenario[1] == "streaming_exception": with pytest.raises(NotSupportedException, match=scenario[2]): load_data( f"file://{TEST_RESOURCES}/{scenario[0]}/acons/streaming_exception.json" ) elif scenario[1] == "site_exception": with pytest.raises(InputNotFoundException, match=scenario[2]): load_data( f"file://{TEST_RESOURCES}/{scenario[0]}/acons/site_exception.json" ) elif scenario[1] == "drive_exception": with pytest.raises(InputNotFoundException, match=scenario[2]): load_data( f"file://{TEST_RESOURCES}/{scenario[0]}/acons/drive_exception.json" ) elif scenario[1] == "local_path_exception": with pytest.raises(InputNotFoundException, match=scenario[2]): load_data( f"file://{TEST_RESOURCES}/{scenario[0]}/acons/local_path_exception.json" ) elif scenario[1] == "endpoint_exception": mock_sharepoint_utils.check_if_endpoint_exists.return_value = False with pytest.raises(EndpointNotFoundException, match=scenario[2]): load_data( f"file://{TEST_RESOURCES}/{scenario[0]}/acons/endpoint_exception.json" ) @pytest.mark.parametrize("scenario", TEST_SCENARIOS_WRITER) @patch( "lakehouse_engine.utils.sharepoint_utils.SharepointUtils.check_if_endpoint_exists", return_value=True, # noqa ) @patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils._create_app") # noqa @patch("lakehouse_engine.utils.sharepoint_utils.SharepointUtils._get_token") # noqa @patch( "lakehouse_engine.utils.sharepoint_utils.SharepointUtils._make_request", side_effect=[ MockRESTResponse( status_code=200, json_data=json.loads( open(f"{TEST_RESOURCES}/writer/mocks/get_site_id.json").read() ), ), MockRESTResponse( status_code=200, json_data=json.loads( open(f"{TEST_RESOURCES}/writer/mocks/get_drive_id.json").read() ), ), MockRESTResponse( status_code=200, json_data=json.loads( 
@pytest.mark.parametrize(
    "scenarios",
    [
        # Full scenario: every optional "*_json" key below switches on the
        # matching step in the test body.
        {
            "table_and_view_name": ["SimpleSplitScenario"],
            "locations_name": ["simple_split_scenario"],
            "create_tbl_sql": "test_table_simple_split_scenario.sql",
            "create_tbl_json": "acon_create_table_simple_split_scenario",
            "execute_sql_json": "acon_execute_sql_simple_split_scenario",
            "create_vw_sql": "test_view_simple_split_scenario",
            "create_vw_json": "acon_create_view_simple_split_scenario",
            "describe_tbl_json": "acon_describe_simple_split_scenario",
            "vacuum_tbl_json": "acon_vacuum_table_simple_split_scenario",
            "vacuum_loc_json": "acon_vacuum_location_simple_split_scenario",
            # NOTE(review): this key ends with an underscore, but the test body
            # looks up "optimize_tbl_json", so the table-level OPTIMIZE step is
            # never executed. Confirm whether it is disabled on purpose.
            "optimize_tbl_json_": "optimize_table_simple_split_scenario",
            "optimize_loc_json": "optimize_location_simple_split_scenario",
            "compute_statistics_tbl_json": ["table_stats_simple_split_scenario"],
            "show_tbl_prop_json": "show_tbl_properties_simple_split_scenario",
            "tbl_primary_keys_json": "get_tbl_pk_simple_split_scenario",
            "drop_vw_json": "acon_drop_view_simple_split_scenario",
            "delete_json": "acon_delete_where_table_simple_split_scenario",
            "drop_tbl_json": "acon_drop_table_simple_split_scenario",
        },
        # Reduced scenario: only create/execute/view and compute statistics keys.
        {
            "table_and_view_name": [
                "ComplexDefaultScenario1",
                "ComplexDefaultScenario2",
            ],
            "locations_name": [
                "complex_default_scenario1",
                "complex_default_scenario2",
            ],
            "create_tbl_sql": "test_table_complex_default_scenario.sql",
            "create_tbl_json": "acon_create_table_complex_default_scenario",
            "execute_sql_json": "acon_execute_sql_complex_default_scenario",
            "create_vw_sql": "test_view_complex_default_scenario",
            "create_vw_json": "acon_create_view_complex_default_scenario",
            "compute_statistics_tbl_json": [
                "table_stats_complex_default_scenario1",
                "table_stats_complex_default_scenario2",
            ],
        },
        # Reduced scenario: same keys as above, different delimiter resources.
        {
            "table_and_view_name": [
                "ComplexDifferentDelimiterScenario1",
                "ComplexDifferentDelimiterScenario2",
            ],
            "locations_name": [
                "complex_different_delimiter_scenario1",
                "complex_different_delimiter_scenario2",
            ],
            "create_tbl_sql": "test_table_complex_different_delimiter_scenario.sql",
            "create_tbl_json": "acon_create_table_complex_different_delimiter_scenario",
            "execute_sql_json": "acon_execute_sql_complex_different_delimiter_scenario",
            "create_vw_sql": "test_view_complex_different_delimiter_scenario",
            "create_vw_json": "acon_create_view_complex_different_delimiter_scenario",
            "compute_statistics_tbl_json": [
                "table_stats_complex_different_delimiter_scenario1",
                "table_stats_complex_different_delimiter_scenario2",
            ],
        },
    ],
)
def test_table_manager(scenarios: dict, caplog: Any) -> None:
    """Test functions from table manager.

    The create table, execute sql and create view steps always run. The
    compute statistics step is always wrapped in an expected AnalysisException.
    Every other step only runs when the scenario dictionary defines the
    corresponding key. Results are verified through the captured log text.

    Args:
        scenarios: scenarios to test.
        caplog: captured log.
    """
    # Capture INFO logs, since every assertion below inspects caplog.text.
    with caplog.at_level(logging.INFO):
        # Mandatory steps: create table, execute sql, create view.
        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/create/table/{scenarios['create_tbl_sql']}",
            f"{TEST_LAKEHOUSE_IN}/create/table/",
        )
        manage_table(
            f"file://{TEST_RESOURCES}/create/{scenarios['create_tbl_json']}.json"
        )
        assert "create_table successfully executed!" in caplog.text

        manage_table(
            f"file://{TEST_RESOURCES}/execute_sql/{scenarios['execute_sql_json']}.json"
        )
        assert "sql successfully executed!" in caplog.text

        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/create/view/{scenarios['create_vw_sql']}.sql",
            f"{TEST_LAKEHOUSE_IN}/create/view/",
        )
        manage_table(
            f"file://{TEST_RESOURCES}/create/{scenarios['create_vw_json']}.json"
        )
        assert "create_view successfully executed!" in caplog.text

        # Optional steps: the expected log lines are hard-coded for the
        # SimpleSplitScenario table and location.
        if scenarios.get("describe_tbl_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/describe/"
                f"{scenarios['describe_tbl_json']}.json"
            )
            assert (
                "DataFrame[col_name: string, data_type: string, comment: string]"
                in caplog.text
            )

        if scenarios.get("vacuum_tbl_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/vacuum/{scenarios['vacuum_tbl_json']}.json"
            )
            assert (
                "Vacuuming table: test_db.DummyTableBronzeSimpleSplitScenario"
                in caplog.text
            )

        if scenarios.get("vacuum_loc_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/vacuum/{scenarios['vacuum_loc_json']}.json"
            )
            assert (
                "Vacuuming location: file:///app/tests/lakehouse/out/feature/"
                "table_manager/dummy_table_bronze/data_simple_split_scenario"
                in caplog.text
            )

        # NOTE(review): no scenario defines "optimize_tbl_json" (the first one
        # uses "optimize_tbl_json_"), so this branch is currently never taken.
        if scenarios.get("optimize_tbl_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/optimize/"
                f"{scenarios['optimize_tbl_json']}.json"
            )
            assert (
                "sql command: OPTIMIZE test_db.DummyTableBronzeSimpleSplitScenario "
                "WHERE year >= 2021 and month >= 09 and day > 01 ZORDER BY (col1,col2)"
                in caplog.text
            )

        if scenarios.get("optimize_loc_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/optimize/"
                f"{scenarios['optimize_loc_json']}.json"
            )
            assert (
                f"sql command: OPTIMIZE delta.`file://{TEST_LAKEHOUSE_OUT}/"
                "dummy_table_bronze/data_simple_split_scenario` WHERE year >= 2021 "
                "and month >= 09 and day > 01 ZORDER BY (col1,col2)" in caplog.text
            )

        # NOTE(review): pytest.raises wraps the whole loop. Once the first
        # manage_table call raises the expected AnalysisException, the log
        # assertion and any later iteration are skipped. A scenario without
        # "compute_statistics_tbl_json" would fail here because nothing raises.
        with pytest.raises(
            AnalysisException, match=".*ANALYZE TABLE is not supported for v2 tables.*"
        ):
            # compute table stats is still not supported in current OS delta lake.
            if scenarios.get("compute_statistics_tbl_json") is not None:
                for (
                    compute_statistics_table_index,
                    compute_statistics_table_json_file,
                ) in enumerate(scenarios["compute_statistics_tbl_json"]):
                    manage_table(
                        f"file://{TEST_RESOURCES}/compute_table_statistics/"
                        f"{compute_statistics_table_json_file}.json"
                    )
                    # The table name suffix shares its index with the acon file.
                    scenario_name = scenarios["table_and_view_name"][
                        compute_statistics_table_index
                    ]
                    assert (
                        "sql command: ANALYZE TABLE test_db.DummyTable"
                        f"Bronze{scenario_name} COMPUTE STATISTICS" in caplog.text
                    )

        if scenarios.get("show_tbl_prop_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/show_tbl_properties/"
                f"{scenarios['show_tbl_prop_json']}.json"
            )
            assert (
                "sql command: SHOW TBLPROPERTIES test_db.DummyTable"
                "BronzeSimpleSplitScenario" in caplog.text
            )

        if scenarios.get("tbl_primary_keys_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/get_tbl_pk/"
                f"{scenarios['tbl_primary_keys_json']}.json"
            )
            assert "['id', 'col1']" in caplog.text

        # Cleanup steps: drop the view, delete rows, then drop the table.
        if scenarios.get("drop_vw_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/drop/{scenarios['drop_vw_json']}.json"
            )
            assert "View successfully dropped!" in caplog.text

        if scenarios.get("delete_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/delete/{scenarios['delete_json']}.json"
            )
            assert (
                "sql command: DELETE FROM test_db.DummyTable"
                "BronzeSimpleSplitScenario WHERE year=2021"
                in caplog.text  # nosec: B608
            )

        if scenarios.get("drop_tbl_json") is not None:
            manage_table(
                f"file://{TEST_RESOURCES}/drop/{scenarios['drop_tbl_json']}.json"
            )
            assert "Table successfully dropped!" in caplog.text
@pytest.mark.parametrize(
    "scenario",
    [
        {"scenario_name": "write_batch_jdbc"},
        {"scenario_name": "write_streaming_foreachBatch_jdbc"},
    ],
)
def test_write_to_jdbc(scenario: dict) -> None:
    """Test jdbc writer.

    Runs the scenario ACON, reads the table named after the scenario back from
    the sqlite database located in the scenario output folder and compares it
    with the control data.

    Args:
        scenario: scenario to test.
    """
    _prepare_files()
    # The sqlite database is read from the scenario output folder. makedirs with
    # exist_ok=True also creates missing parent folders and tolerates a folder
    # left over from a previous run, so the test does not depend on other tests
    # having created the output location first.
    os.makedirs(f"{TEST_LAKEHOUSE_OUT}/{scenario['scenario_name']}/", exist_ok=True)

    acon = ConfigUtils.get_acon(
        f"file://{TEST_RESOURCES}/acons/{scenario['scenario_name']}.json"
    )
    load_data(acon=acon)

    result_df = DataframeHelpers.read_from_jdbc(
        f"jdbc:sqlite:{TEST_LAKEHOUSE_OUT}/{scenario['scenario_name']}/test.db",
        f"{scenario['scenario_name']}",
    )
    control_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/writers_control.csv"
    )

    assert not DataframeHelpers.has_diff(result_df, control_df)
""" _prepare_files() acon = ConfigUtils.get_acon( f"file://{TEST_RESOURCES}/acons/{scenario['scenario_name']}.json" ) load_data(acon=acon) captured = capsys.readouterr() logging.info(captured.out) assert "20140601|customer1|article3|" in captured.out @pytest.mark.parametrize( "scenario", [ {"scenario_name": "write_batch_dataframe"}, {"scenario_name": "write_streaming_dataframe"}, {"scenario_name": "write_streaming_foreachBatch_dataframe"}, ], ) def test_write_to_dataframe(scenario: dict, capsys: Any) -> None: """Test dataframe writer returning the output by OutputSpec. Description of the test scenarios: - write_batch_dataframe - test writing a DataFrame from two batch sources, uniting both sources. It's generated a DataFrame containing the data from both sources. - write_streaming_dataframe - similar to write_batch_dataframe but inputting data from a stream. - write_streaming_foreachBatch_dataframe - similar to write_batch_dataframe but mixing batch and streaming, so the first source from batch and the second from a stream. This test have the responsibility to execute the writer using micro batch strategy. Args: scenario: scenario to test. capsys: capture stdout and stderr. 
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "scenario_name": "write_streaming_df_with_checkpoint",
            "control": "streaming_dataframe",
        },
        {
            "scenario_name": "write_streaming_foreachBatch_df_with_checkpoint",
            "control": "streaming_dataframe_foreachBatch",
        },
    ],
)
def test_write_to_dataframe_checkpoints(scenario: dict, capsys: Any) -> None:
    """Test dataframe writer using checkpoint for the next run.

    In this test our InputSpecs have the option `maxFilesPerTrigger`, this option
    forces our stream to read a maximum number of files per iteration. This
    property also needs a checkpoint location, because spark internally needs to
    control the state of reading the files.

    Description of the test scenarios:
        - write_streaming_df_with_checkpoint - test if the checkpoint is working
            as expected when writing the data from stream to DataFrame.
            We have two different input files for each source; we expect to read
            just the first in the first execution and the second in the next one.
        - write_streaming_foreachBatch_df_with_checkpoint - test if the checkpoint
            is working as expected when writing the data from stream and batch
            using the micro batch strategy to DataFrame.
            As we have two different input files for each source we expect to
            read just the first file in the first execution and the second in
            the next one for the stream with checkpoint source.
            On the batch source we expect to read the first file in the first run
            and both files in the second run.

    Each iteration copies the input files of that iteration, runs the ACON and
    compares the returned "sales" DataFrame with the control file named after the
    scenario `control` value and the iteration number.

    Args:
        scenario: scenario to test.
        capsys: capture stdout and stderr (not used in the test body).
    """
    # Start from a clean input/output state for this scenario.
    LocalStorage.clean_folder(f"{TEST_LAKEHOUSE_IN}/source")
    LocalStorage.clean_folder(f"{TEST_LAKEHOUSE_OUT}/{scenario['scenario_name']}")

    # NOTE(review): range(1, 2) only yields iteration 1, so the "next run"
    # described in the docstring is never executed. Presumably range(1, 3) was
    # intended - confirm that the *_2 source and control files exist before
    # changing it.
    for iteration in range(1, 2):
        _prepare_files(iteration)

        result = load_data(
            f"file://{TEST_RESOURCES}/acons/{scenario['scenario_name']}.json"
        )

        control_df = DataframeHelpers.read_from_file(
            f"{TEST_LAKEHOUSE_CONTROL}/data/"
            f"writers_control_{scenario['control']}_{iteration}.csv"
        )

        # The algorithm output is expected to contain only the "sales" DataFrame.
        expected_keys = ["sales"]

        assert not DataframeHelpers.has_diff(result.get("sales"), control_df)
        assert len(result.keys()) == len(expected_keys)
        assert all(
            subject == expected
            for subject, expected in zip(result.keys(), expected_keys)
        )
""" _prepare_files() multiple_df_result = load_data( f"file://{TEST_RESOURCES}/acons/{scenario['scenario_name']}.json" ) generated_acon = _generate_acon_from_source(multiple_df_result) result = load_data(acon=generated_acon) result_keys = list(multiple_df_result.keys()) + list(result.keys()) control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/data/writers_control.csv" ) expected_keys = ["sales_historical", "sales_new", "sales"] assert not DataframeHelpers.has_diff(result.get("sales"), control_df) assert len(result_keys) == len(expected_keys) assert all( subject == expected for subject, expected in zip(result_keys, expected_keys) ) @pytest.mark.parametrize( "scenario", [ { "scenario_name": "write_streaming_processing_time_dataframe", "streaming_processing_time": "2 seconds", }, { "scenario_name": "write_streaming_continuous_dataframe", "streaming_continuous": "2 seconds", }, ], ) def test_write_to_dataframe_exception(scenario: dict, capsys: Any) -> None: """Test expected exception for dataframe writer on stream cases. Args: scenario: scenario to test. capsys: capture stdout and stderr. """ def dataframe_writer( df: DataFrame = None, data: OrderedDict = None, streaming_processing_time: Optional[str] = None, streaming_continuous: Optional[str] = None, ) -> DataFrameWriter: """Create DataFrame Writer. Args: df: dataframe containing the data to append. data: list of all dfs generated on previous steps before writer. streaming_processing_time: if streaming query is to be kept alive, this indicates the processing time of each micro batch. streaming_continuous: set a trigger that runs a continuous query with a given checkpoint interval. 
""" if not df: df = DataframeHelpers.create_empty_dataframe(StructType([])) spec = OutputSpec( spec_id=random.choice(string.ascii_letters), # nosec input_id=random.choice(string.ascii_letters), # nosec write_type=None, data_format=OutputFormat.DATAFRAME.value, streaming_processing_time=streaming_processing_time, streaming_continuous=streaming_continuous, ) return DataFrameWriter(output_spec=spec, df=df.coalesce(1), data=data) with pytest.raises(NotSupportedException) as exception: dataframe_writer( streaming_processing_time=scenario.get("streaming_processing_time"), streaming_continuous=scenario.get("streaming_continuous"), ).write() assert ( "DataFrame writer doesn't support processing time or continuous streaming" in str(exception.value) ) def _generate_acon_from_source(source: OrderedDict) -> dict: """Create an ACON from dictionary source containing resulted dataframes. Args: source: Dictionary containing source computed dataframes. """ return { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "dataframe", "df_name": source.get("sales_historical"), }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "dataframe", "df_name": source.get("sales_new"), }, ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]}} ], } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "dataframe", } ], } def _prepare_files(iteration: int = 0) -> None: file_suffix = "*" if iteration == 0 else f"{iteration}" LocalStorage.copy_file( f"{TEST_RESOURCES}/source/sales_historical_{file_suffix}.csv", f"{TEST_LAKEHOUSE_IN}/source/sales_historical/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/source/sales_new_{file_suffix}.csv", f"{TEST_LAKEHOUSE_IN}/source/sales_new/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/schema/*.json", f"{TEST_LAKEHOUSE_IN}/schema/", ) LocalStorage.copy_file( 
@pytest.mark.parametrize(
    "scenario",
    [
        {"scenario_name": "batch"},
        {"scenario_name": "streaming"},
        {"scenario_name": "streaming_batch"},
        {"scenario_name": "write_streaming_struct_data"},
        {"scenario_name": "write_streaming_struct_data_fail"},
    ],
)
def test_chain_transformations(scenario: dict, caplog: Any) -> None:
    """Test chaining transformations on top of each other.

    Args:
        scenario: scenario to test.
            batch - scenario where we are using batch dataframes;
            streaming - scenario where we are using streaming dataframes;
            streaming_batch - scenario where we are using batch and streaming
                dataframes;
            write_streaming_struct_data - scenario where we first apply
                transformations, use that result to apply another transform and
                write in micro batch;
            write_streaming_struct_data_fail - scenario where we try to use a
                result from a micro batch transformation in another transform.
                This one should fail, because we cannot depend on a micro batch
                transformation.
        caplog: captured log.
    """
    _prepare_files()

    name = scenario["scenario_name"]
    acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/acons/{name}.json")

    # Failure scenario: only the raised exception and the logged cause are
    # verified, there is no output to compare.
    if name == "write_streaming_struct_data_fail":
        with pytest.raises(
            StreamingQueryException,
            match=".*An exception was raised by the Python Proxy.*",
        ):
            load_data(acon=acon)

        assert (
            "A column, variable, or function parameter with name `sample_json_field1` "
            "cannot be resolved." in caplog.text
        )
        return

    load_data(acon=acon)

    result_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{name}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )

    if name == "write_streaming_struct_data":
        # The struct scenario is compared against a json control file,
        # restricted to the columns below.
        struct_columns = (
            "salesorder",
            "item",
            "article",
            "sample_json_field1",
            "sample_json_field4",
            "item_amount_json",
        )
        control_df = DataframeHelpers.read_from_file(
            f"{TEST_LAKEHOUSE_CONTROL}/data/struct_data.json",
            file_format=InputFormat.JSON.value,
            options={"multiLine": "true"},
        ).select(*struct_columns)
    else:
        control_df = DataframeHelpers.read_from_file(
            f"{TEST_LAKEHOUSE_CONTROL}/data/chain_control.csv"
        )

    assert not DataframeHelpers.has_diff(result_df, control_df)
@pytest.mark.parametrize(
    "scenario",
    ["streaming", "batch"],
)
def test_column_creators(scenario: str) -> None:
    """Check the output of the column creator transformers against control data.

    Args:
        scenario: scenario to test.
    """
    # Stage source data and schema files before running the algorithm.
    for origin, destination in (
        (f"{TEST_RESOURCES}/data/source/*.csv", f"{TEST_LAKEHOUSE_IN}/data/"),
        (f"{TEST_RESOURCES}/*schema.json", f"{TEST_LAKEHOUSE_IN}/"),
    ):
        LocalStorage.copy_file(origin, destination)

    load_data(acon=ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}.json"))

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/*.json",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )

    # Columns (and their order) selected from the control data for comparison.
    compared_columns = [
        "salesorder",
        "item",
        "date",
        "customer",
        "article",
        "amount",
        "dummy_string",
        "dummy_int",
        "dummy_double",
        "dummy_boolean",
    ]

    actual_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )
    full_control_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data",
        file_format=InputFormat.JSON.value,
        options={"multiLine": "true"},
    )
    expected_df = full_control_df.select(*compared_columns)

    assert not DataframeHelpers.has_diff(actual_df, expected_df)
"flatten_and_explode_arrays_and_maps"}, {"type": "streaming", "scenario_name": "flatten_and_explode_arrays_and_maps"}, ], ) def test_column_reshapers(scenario: dict) -> None: """Test column reshaping transformers. Args: scenario: scenario to test. flatten_schema: This test flattens the struct. explode_arrays: This test explode the array columns specified. flatten_and_explode_arrays_and_maps: This test flattens the struct and explode the array and map columns specified. """ LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario['scenario_name']}/data/source/*.json", f"{TEST_LAKEHOUSE_IN}/{scenario['scenario_name']}/data/", ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario['scenario_name']}/*schema.json", f"{TEST_LAKEHOUSE_IN}/{scenario['scenario_name']}/", ) load_data( f"file://{TEST_RESOURCES}/{scenario['scenario_name']}/{scenario['type']}.json" ) LocalStorage.copy_file( f"{TEST_RESOURCES}/{scenario['scenario_name']}/data/control/*.csv", f"{TEST_LAKEHOUSE_CONTROL}/{scenario['scenario_name']}/data/", ) result_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_OUT}/{scenario['scenario_name']}/{scenario['type']}/data", file_format=OutputFormat.DELTAFILES.value, ) control_df = DataframeHelpers.read_from_file( f"{TEST_LAKEHOUSE_CONTROL}/{scenario['scenario_name']}/data/" ) assert not DataframeHelpers.has_diff(result_df, control_df) ================================================ FILE: tests/feature/transformations/test_data_maskers.py ================================================ """Test Data Masking Transformers.""" import pytest from lakehouse_engine.core.definitions import OutputFormat from lakehouse_engine.engine import load_data from lakehouse_engine.utils.configs.config_utils import ConfigUtils from lakehouse_engine.utils.schema_utils import SchemaUtils from tests.conftest import ( FEATURE_RESOURCES, LAKEHOUSE_FEATURE_CONTROL, LAKEHOUSE_FEATURE_IN, LAKEHOUSE_FEATURE_OUT, ) from tests.utils.dataframe_helpers import DataframeHelpers from 
@pytest.mark.parametrize(
    "scenario",
    ["drop_columns", "hash_masking"],
)
def test_data_maskers(scenario: str) -> None:
    """Check the output of the data masking transformers against control data.

    Args:
        scenario: scenario to test.
            drop_columns - scenario where we mask data by dropping columns;
            hash_masking - scenario where we mask data by hashing columns.
    """
    # Stage source data and schema files before running the algorithm.
    staged_inputs = {
        f"{TEST_RESOURCES}/data/source/*.csv": f"{TEST_LAKEHOUSE_IN}/data/",
        f"{TEST_RESOURCES}/*schema.json": f"{TEST_LAKEHOUSE_IN}/",
    }
    for origin, destination in staged_inputs.items():
        LocalStorage.copy_file(origin, destination)

    load_data(acon=ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}.json"))

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/*.csv",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )

    masked_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )

    # Each scenario has its own control file and control schema.
    control_schema = SchemaUtils.from_file_to_dict(
        f"file://{TEST_LAKEHOUSE_IN}/{scenario}_control_schema.json"
    )
    expected_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/{scenario}.csv",
        schema=control_schema,
    )

    assert not DataframeHelpers.has_diff(masked_df, expected_df)
@pytest.mark.parametrize(
    "scenario",
    ["streaming"],
)
def test_date_transformers(scenario: str) -> None:
    """Check the output of the date transformers against control data.

    Args:
        scenario: scenario to test.
    """
    # Stage source data and schema files before running the algorithm.
    for origin, destination in (
        (f"{TEST_RESOURCES}/data/source/*.csv", f"{TEST_LAKEHOUSE_IN}/data/"),
        (f"{TEST_RESOURCES}/*schema.json", f"{TEST_LAKEHOUSE_IN}/"),
    ):
        LocalStorage.copy_file(origin, destination)

    load_data(acon=ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}.json"))

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/*.csv",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )

    output_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )
    # The curr_date column is removed from the result before the comparison.
    comparable_df = output_df.drop("curr_date")

    control_schema = SchemaUtils.from_file_to_dict(
        f"file://{TEST_LAKEHOUSE_IN}/control_schema.json"
    )
    expected_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data",
        schema=control_schema,
    )

    assert not DataframeHelpers.has_diff(comparable_df, expected_df)
@pytest.mark.parametrize(
    "scenario",
    [
        ["batch"],
        ["streaming"],
    ],
)
def test_drop_duplicate_rows(scenario: list) -> None:
    """Tests drop duplicate rows transformer available in the ACON transform_specs.

    Args:
        scenario: single element list holding the name of the scenario to test.
            batch - test the transformer utilization in batch mode.
                The transformer is tested 3 times: 1) without providing
                arguments; 2) providing an empty list ([]); and 3) providing a
                list with columns names (["order_number","item_number"]).
                This happens using 3 different dataframes saved in different
                locations specified in the ACON.
                In the 2 first times, the transformer should have the same
                behaviour as using the pyspark function distinct().
            streaming - the same as batch but using streaming.
    """
    # Each parametrized value is a one element list, e.g. ["batch"].
    scenario_name = scenario[0]

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/*schema.json",
        f"{TEST_LAKEHOUSE_IN}/",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/source/*.csv",
        f"{TEST_LAKEHOUSE_IN}/data/",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/{scenario_name}_*.json",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )

    acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario_name}.json")
    load_data(acon=acon)

    control_drop_duplicates = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/{scenario_name}_drop_duplicates.json",
        file_format=InputFormat.JSON.value,
        options={"multiLine": "true"},
    )
    control_distinct = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/{scenario_name}_distinct.json",
        file_format=InputFormat.JSON.value,
        options={"multiLine": "true"},
    )

    # Output folder of each transformer usage paired with the control data it
    # must match: the column based usage is compared with the drop duplicates
    # control, the usages without arguments and with an empty list are compared
    # with the distinct control.
    expectations = (
        ("columns", control_drop_duplicates),
        ("orders_duplicate_no_args", control_distinct),
        ("orders_duplicate_empty", control_distinct),
    )
    for output_folder, control_df in expectations:
        result_df = DataframeHelpers.read_from_file(
            f"{TEST_LAKEHOUSE_OUT}/{scenario_name}/{output_folder}/data",
            file_format=OutputFormat.DELTAFILES.value,
        )
        assert not DataframeHelpers.has_diff(result_df, control_df)
@pytest.mark.parametrize(
    "scenario",
    [
        ["streaming", "control_scenario_1_and_2"],
        ["streaming_without_broadcast", "control_scenario_1_and_2"],
        ["streaming_without_column_rename", "control_scenario_3"],
        ["streaming_foreachBatch", "control_scenario_1_and_2"],
        ["batch", "control_scenario_1_and_2"],
    ],
)
def test_joiners(scenario: List[str]) -> None:
    """Check the output of the join transformers against control data.

    Args:
        scenario: scenario to test, as a pair of ACON name and control name.
            streaming - join streaming scenario.
            streaming_without_broadcast - same as streaming scenario but without
                broadcast join. Note: also differs by partitioning by customer
                and date, not only date.
            streaming_without_column_rename - same as streaming scenario but
                without renaming name column to customer_name.
            streaming_foreachBatch - join streaming scenario in foreachBatch mode.
            batch - join batch scenario.
    """
    acon_name, control_name = scenario

    # Stage the first part of the source data and the schema files.
    for origin, destination in (
        (
            f"{TEST_RESOURCES}/data/source/customer-part-01.csv",
            f"{TEST_LAKEHOUSE_IN}/data/customers/",
        ),
        (
            f"{TEST_RESOURCES}/data/source/sales-part-01.csv",
            f"{TEST_LAKEHOUSE_IN}/data/sales/",
        ),
        (f"{TEST_RESOURCES}/*schema.json", f"{TEST_LAKEHOUSE_IN}/"),
    ):
        LocalStorage.copy_file(origin, destination)

    acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{acon_name}.json")

    # Every scenario except batch runs once before the second sales file
    # arrives; batch runs a single time with both files in place.
    runs_before_second_file = acon_name != "batch"
    if runs_before_second_file:
        load_data(acon=acon)

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/source/sales-part-02.csv",
        f"{TEST_LAKEHOUSE_IN}/data/sales/",
    )
    load_data(acon=acon)

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/*.csv",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )

    joined_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{acon_name}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )

    control_schema = SchemaUtils.from_file_to_dict(
        f"file://{TEST_LAKEHOUSE_IN}/{control_name}_schema.json"
    )
    expected_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/{control_name}.csv",
        schema=control_schema,
    )

    assert not DataframeHelpers.has_diff(joined_df, expected_df)
@pytest.mark.parametrize(
    "scenario",
    ["batch"],
)
def test_multiple_transformations(scenario: str) -> None:
    """Tests multiple transformations available in the ACON transform_specs.

    Transformations are saved in different locations, according to the
    output_specs. Each output is compared with a different column subset of the
    same control data.

    Args:
        scenario: scenario to test.
    """
    # Stage source data and schema files before running the algorithm.
    for origin, destination in (
        (f"{TEST_RESOURCES}/data/source/*.csv", f"{TEST_LAKEHOUSE_IN}/data/"),
        (f"{TEST_RESOURCES}/*schema.json", f"{TEST_LAKEHOUSE_IN}/"),
    ):
        LocalStorage.copy_file(origin, destination)

    load_data(acon=ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}.json"))

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/*.json",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )

    customer_cols_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/orders_customer_cols/data",
        file_format=OutputFormat.DELTAFILES.value,
    )
    kpi_cols_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/orders_kpi_cols/data",
        file_format=OutputFormat.DELTAFILES.value,
    )
    control_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data",
        file_format=InputFormat.JSON.value,
        options={"multiLine": "true"},
    )

    expected_customer_cols = control_df.select("date", "country", "customer_number")
    assert not DataframeHelpers.has_diff(customer_cols_df, expected_customer_cols)

    expected_kpi_cols = control_df.select("date", "city", "amount")
    assert not DataframeHelpers.has_diff(kpi_cols_df, expected_kpi_cols)
@pytest.mark.parametrize("scenario", ["replace_nulls", "replace_nulls_col_subset"])
def test_replace_nulls(scenario: str) -> None:
    """Test the null handler transformers.

    Loads the source data with the scenario's ACON and compares the written
    dataset, without its ``curr_date`` column, against the scenario's control
    file.

    Args:
        scenario: scenario to test.
    """
    staging_plan = (
        (f"{TEST_RESOURCES}/data/source/*.csv", f"{TEST_LAKEHOUSE_IN}/data/"),
        (f"{TEST_RESOURCES}/*schema.json", f"{TEST_LAKEHOUSE_IN}/"),
    )
    for source_pattern, target_dir in staging_plan:
        LocalStorage.copy_file(source_pattern, target_dir)

    scenario_acon = ConfigUtils.get_acon(f"file://{TEST_RESOURCES}/{scenario}.json")
    load_data(acon=scenario_acon)

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/control/*.csv",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )

    loaded_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )
    # curr_date is dropped before comparing: it is not part of the control data.
    actual_df = loaded_df.drop("curr_date")

    control_schema = SchemaUtils.from_file_to_dict(
        f"file://{TEST_LAKEHOUSE_IN}/control_schema.json"
    )
    expected_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/{scenario}.csv",
        schema=control_schema,
    )

    assert not DataframeHelpers.has_diff(actual_df, expected_df)
def is_df_cached(df: DataFrame) -> DataFrame:
    """Check if the dataframe is cached.

    Meant to be plugged into an ACON as a custom transformer: it lets the
    DataFrame through untouched when it is cached and raises otherwise.

    Args:
        df: DataFrame passed as input.

    Returns:
        DataFrame: same as the input DataFrame.

    Raises:
        Exception: if the DataFrame is not cached.
    """
    if not df.is_cached:
        # A message makes the failing step identifiable in the test output.
        raise Exception("Expected the DataFrame to be cached, but it is not.")
    return df


def is_df_not_cached(df: DataFrame) -> DataFrame:
    """Check if the dataframe is not cached.

    Meant to be plugged into an ACON as a custom transformer: it lets the
    DataFrame through untouched when it is not cached and raises otherwise.

    Args:
        df: DataFrame passed as input.

    Returns:
        DataFrame: same as the input DataFrame.

    Raises:
        Exception: if the DataFrame is cached.
    """
    if df.is_cached:
        raise Exception("Expected the DataFrame not to be cached, but it is.")
    return df


@pytest.mark.parametrize("scenario", ["batch", "streaming"])
def test_optimizer(scenario: str) -> None:
    """Test the optimizer transformer both in batch and streaming.

    The test has no explicit assertion. The ACON interleaves the cache checks
    (``is_df_cached`` / ``is_df_not_cached``) with the optimizer transformers,
    and those checks raise when the cache state is not the expected one.

    Args:
        scenario: read type to test (batch or streaming).
    """
    acon = _get_test_acon(scenario)

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/data/source/part-01.csv",
        f"{TEST_LAKEHOUSE_IN}/data/",
    )

    load_data(acon=acon)


def _get_test_acon(read_type: str) -> dict:
    """Creates a test ACON with the desired logic for the algorithm.

    The transformers are ordered as persist -> check cached -> unpersist ->
    check not cached -> cache -> check cached, and the output goes to the
    console.

    Args:
        read_type: the read type (streaming or batch).

    Returns:
        dict: the ACON for the algorithm configuration.
    """
    acon = {
        "input_specs": [
            {
                "spec_id": "sales_source",
                "read_type": read_type,
                "data_format": "csv",
                "options": {"header": True, "delimiter": "|", "inferSchema": True},
                "location": f"file:///{TEST_LAKEHOUSE_IN}/data/",
            }
        ],
        "transform_specs": [
            {
                "spec_id": "transformed_sales_source",
                "input_id": "sales_source",
                "transformers": [
                    {
                        "function": "persist",
                        "args": {"storage_level": "MEMORY_AND_DISK"},
                    },
                    {
                        "function": "custom_transformation",
                        "args": {"custom_transformer": is_df_cached},
                    },
                    {
                        "function": "unpersist",
                    },
                    {
                        "function": "custom_transformation",
                        "args": {"custom_transformer": is_df_not_cached},
                    },
                    {
                        "function": "cache",
                    },
                    {
                        "function": "custom_transformation",
                        "args": {"custom_transformer": is_df_cached},
                    },
                ],
            }
        ],
        "output_specs": [
            {
                "spec_id": "sales_bronze",
                "input_id": "transformed_sales_source",
                "data_format": "console",
            }
        ],
    }

    if read_type == "streaming":
        # Streaming needs the transformers applied per micro-batch and, as no
        # schema is given in the input spec, schema inference enabled.
        acon["transform_specs"][0][  # type: ignore
            "force_streaming_foreach_batch_processing"
        ] = True
        acon["exec_env"] = {"spark.sql.streaming.schemaInference": True}

    return acon
@pytest.mark.parametrize("scenario", ["with_regex_value"])
def test_regex_transformers(scenario: str) -> None:
    """Test regex transformers.

    Loads the scenario's source data with its batch ACON and compares the
    written dataset against the scenario's control file.

    Args:
        scenario: scenario to test.
            with_regex_value - test with_regex_value feature in the regex
            transformers.
    """
    staging_plan = (
        (
            f"{TEST_RESOURCES}/{scenario}/data/source/*.csv",
            f"{TEST_LAKEHOUSE_IN}/{scenario}/data/",
        ),
        (
            f"{TEST_RESOURCES}/{scenario}/*schema.json",
            f"{TEST_LAKEHOUSE_IN}/{scenario}/",
        ),
    )
    for source_pattern, target_dir in staging_plan:
        LocalStorage.copy_file(source_pattern, target_dir)

    batch_acon = ConfigUtils.get_acon(
        f"file://{TEST_RESOURCES}/{scenario}/batch.json"
    )
    load_data(acon=batch_acon)

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario}/data/control/part-01.csv",
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data/",
    )

    actual_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )

    control_schema = SchemaUtils.from_file_to_dict(
        f"file://{TEST_LAKEHOUSE_IN}/{scenario}/control_schema.json"
    )
    expected_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/{scenario}/data",
        schema=control_schema,
    )

    assert not DataframeHelpers.has_diff(actual_df, expected_df)
"control_sales_shipment"], ["batch", "unionByName_diff_schema_error", ""], ["streaming", "union", "control_sales_streaming"], ["streaming", "unionByName_diff_schema", "control_sales_shipment_streaming"], ["streaming", "union_foreachBatch", "control_sales_streaming_foreachBatch"], [ "streaming", "unionByName_diff_schema_foreachBatch", "control_sales_shipment_streaming_foreachBatch", ], ], ) def test_unions(scenario: List[str]) -> None: """Test union transformers. Args: scenario: scenario to test. batch_union - union batch scenario, using union function based on columns' position. batch_union_diff_schema - same as batch_union scenario but tries to union data with different schema, throwing an exception. batch_unionByName - union batch scenario, using unionByName function based on columns' names. batch_unionByName_diff_schema - same as batch_unionByName scenario but allows the union of datasets with different schemas enabling the allowMissingColumns param. batch_unionByName_diff_schema_error - same as batch_unionByName_diff_schema but disabling the allowMissingColumns param and therefore, throwing an exception. streaming_union - union streaming scenario, using union function based on columns' position. streaming_unionByName_diff_schema - union streaming scenario, using unionByName function based on columns' names and allowing the union of datasets with different schemas. streaming_union_foreachBatch - union streaming scenario, using union function based on columns' position in foreachBatch mode. streaming_unionByName_diff_schema_foreachBatch - union streaming scenario, using unionByName function based on columns' names and allowing the union of datasets with different schemas in foreachBatch mode. 
def copy_data_files(iteration: int) -> None:
    """Stage one part file of every sales dataset in the tests input location.

    The historical, new and shipment sales files of the given iteration are
    each copied to their own input folder.

    Args:
        iteration: number indicating the file to load.
    """
    for dataset in ("historical", "new", "shipment"):
        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/data/source/sales-{dataset}-part-0{iteration}.csv",
            f"{TEST_LAKEHOUSE_IN}/data/sales/sales_{dataset}/",
        )
@pytest.mark.parametrize(
    "scenario",
    [
        {"scenario_name": "streaming_drop_duplicates", "loads": 2},
        {"scenario_name": "streaming_drop_duplicates_overall_watermark", "loads": 2},
    ],
)
def test_drop_duplicates_with_watermark(scenario: dict) -> None:
    """Test deduplication applying watermarking.

    For both test scenarios if there is late data coming out of the watermark
    time, this data won't be integrated. It won't be in the target destination
    (and so it is also not in the control data).

    Args:
        scenario: scenario to test.
            streaming_drop_duplicates - apply drop duplicates over a streaming
            dataframe.
            streaming_drop_duplicates_overall_watermark - apply drop duplicates
            over a streaming dataframe defined as an independent
            transformation. It also uses the Group and rank transformation
            which ignores the watermark because that transformation is applied
            over a foreach batch operation.
    """
    scenario_name = scenario["scenario_name"]
    loads = scenario["loads"]

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario_name}/data/control/*",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )
    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario_name}/*schema.json",
        f"{TEST_LAKEHOUSE_IN}/{scenario_name}/",
    )

    # One load per source part, so that every part is processed on its own.
    for load in range(1, loads + 1):
        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/{scenario_name}/data/source/part-0{load}.csv",
            f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/",
        )
        acon = ConfigUtils.get_acon(
            f"file://{TEST_RESOURCES}/{scenario_name}/{scenario_name}.json"
        )
        load_data(acon=acon)

    result_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario_name}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )
    control_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/{scenario_name}.csv",
        schema=SchemaUtils.from_file_to_dict(
            f"file://{TEST_LAKEHOUSE_IN}/{scenario_name}/source_schema.json"
        ),
    )

    assert not DataframeHelpers.has_diff(result_df, control_df)


@pytest.mark.parametrize(
    "scenario",
    [
        {"scenario_name": "streaming_inner_join", "loads": 2},
        {"scenario_name": "streaming_right_outer_join", "loads": 2},
        {"scenario_name": "streaming_left_outer_join", "loads": 5},
    ],
)
def test_joins_with_watermark(scenario: dict) -> None:
    """Test join operations applying watermarking.

    Args:
        scenario: scenario to test.
            streaming_inner_join - apply inner join over 2 streaming
            dataframes.
            streaming_right_outer_join - apply right outer join over 2
            streaming dataframes.
            streaming_left_outer_join - apply left outer join over 2 streaming
            dataframes.
    """
    scenario_name = scenario["scenario_name"]
    loads = scenario["loads"]

    if scenario_name == "streaming_right_outer_join":
        _drop_and_create_table(
            "streaming_outer_join", f"{TEST_LAKEHOUSE_OUT}/{scenario_name}/data"
        )

    for load in range(1, loads + 1):
        part_file = f"part-0{load}.csv"

        # The inner join scenario only gets a customer file on the first load;
        # the outer join scenarios get a new customer file on every load.
        if scenario_name != "streaming_inner_join" or load == 1:
            LocalStorage.copy_file(
                f"{TEST_RESOURCES}/{scenario_name}/data/source/customer-{part_file}",
                f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/customers/",
            )

        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/{scenario_name}/data/source/sales-{part_file}",
            f"{TEST_LAKEHOUSE_IN}/{scenario_name}/data/sales/",
        )
        LocalStorage.copy_file(
            f"{TEST_RESOURCES}/{scenario_name}/*schema.json",
            f"{TEST_LAKEHOUSE_IN}/{scenario_name}/",
        )
        acon = ConfigUtils.get_acon(
            f"file://{TEST_RESOURCES}/{scenario_name}/{scenario_name}.json"
        )
        load_data(acon=acon)

    result_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_OUT}/{scenario_name}/data",
        file_format=OutputFormat.DELTAFILES.value,
    )

    LocalStorage.copy_file(
        f"{TEST_RESOURCES}/{scenario_name}/data/control/*",
        f"{TEST_LAKEHOUSE_CONTROL}/data/",
    )
    control_df = DataframeHelpers.read_from_file(
        f"{TEST_LAKEHOUSE_CONTROL}/data/{scenario_name}.csv",
        schema=SchemaUtils.from_file_to_dict(
            f"file://{TEST_LAKEHOUSE_IN}/{scenario_name}/"
            f"{scenario_name}_control_schema.json"
        ),
    )

    assert not DataframeHelpers.has_diff(result_df, control_df)


def _drop_and_create_table(table_name: str, location: str) -> None:
    """Drop the test table, if it exists, and create it again.

    The table is created in the ``test_db`` database as a delta table with the
    columns of the joined sales and customer data.

    Args:
        table_name: name of the table.
        location: location of the table.
    """
    ExecEnv.SESSION.sql(f"DROP TABLE IF EXISTS test_db.{table_name}")
    ExecEnv.SESSION.sql(
        f"""
        CREATE TABLE IF NOT EXISTS test_db.{table_name} (
            salesorder int,
            item int,
            date timestamp,
            customer string,
            article string,
            amount int,
            customer_name string
        )
        USING delta
        LOCATION '{location}'
        """
    )
"test_db.failfast_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/append_load/failfast/data" } ] } ================================================ FILE: tests/resources/feature/append_load/failfast/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/append_load/failfast/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 ================================================ FILE: tests/resources/feature/append_load/failfast/data/source/part-03.csv ================================================ salesorder|item|date|customer|article|amount2|onemorecolumn 5|1|20170510|customer4|article6|15000|NA 5|2|20170510|customer4|article3|10000|NA 5|3|20170510|customer4|article5|8000|NA 6|1|20170601|customer2|article4|10000|NA 6|2|20170601|customer2|article1|5000|NA 6|3|20170601|customer2|article2|9000|NA ================================================ FILE: tests/resources/feature/append_load/jdbc_permissive/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/append_load/jdbc_permissive/tests.db", "table": "jdbc_permissive", "properties": { "driver": "org.sqlite.JDBC" } }, 
"options": { "numPartitions": 1 } }, { "spec_id": "sales_bronze", "read_type": "batch", "db_table": "test_db.jdbc_permissive_table" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_date", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "date" } } ] }, { "spec_id": "appended_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "date", "increment_df": "max_sales_bronze_date" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "appended_sales", "write_type": "append", "db_table": "test_db.jdbc_permissive_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/append_load/jdbc_permissive/data" } ] } ================================================ FILE: tests/resources/feature/append_load/jdbc_permissive/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/append_load/jdbc_permissive/tests.db", "table": "jdbc_permissive", "properties": { "driver": "org.sqlite.JDBC" } }, "options": { "numPartitions": 1 } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "db_table": "test_db.jdbc_permissive_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/append_load/jdbc_permissive/data" } ] } ================================================ FILE: tests/resources/feature/append_load/jdbc_permissive/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 
2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 5|1|20170510|customer4|article6|15000 5|2|20170510|customer4|article3|10000 5|3|20170510|customer4|article5|8000 6|1|20170601|customer2|article4|10000 6|2|20170601|customer2|article1|5000 6|3|20170601|customer2|article2|9000 ================================================ FILE: tests/resources/feature/append_load/jdbc_permissive/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/append_load/jdbc_permissive/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 ================================================ FILE: tests/resources/feature/append_load/jdbc_permissive/data/source/part-03.csv ================================================ salesorder|item|date|customer|article|amount 5|1|20170510|customer4|article6|15000 5|2|20170510|customer4|article3|10000 5|3|20170510|customer4|article5|8000 6|1|20170601|customer2|article4|10000 6|2|20170601|customer2|article1|5000 6|3|20170601|customer2|article2|9000 ================================================ FILE: 
tests/resources/feature/append_load/streaming_dropmalformed/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 ================================================ FILE: tests/resources/feature/append_load/streaming_dropmalformed/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/append_load/streaming_dropmalformed/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 ================================================ FILE: tests/resources/feature/append_load/streaming_dropmalformed/data/source/part-03.csv ================================================ salesorder|item|date|customer|article|amount2|onemorecolumn 5|1|20170510|customer4|article6|15000|NA 5|2|20170510|customer4|article3|10000|NA 5|3|20170510|customer4|article5|8000|NA 
6|1|20170601|customer2|article4|10000|NA 6|2|20170601|customer2|article1|5000|NA 6|3|20170601|customer2|article2|9000|NA ================================================ FILE: tests/resources/feature/append_load/streaming_dropmalformed/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|", "mode": "DROPMALFORMED" }, "location": "file:///app/tests/lakehouse/in/feature/append_load/streaming_dropmalformed/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "db_table": "test_db.streaming_dropmalformed_table", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/append_load/streaming_dropmalformed/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/append_load/streaming_dropmalformed/data" } ] } ================================================ FILE: tests/resources/feature/append_load/streaming_with_terminators/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 
3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 ================================================ FILE: tests/resources/feature/append_load/streaming_with_terminators/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 ================================================ FILE: tests/resources/feature/append_load/streaming_with_terminators/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|", "mode": "DROPMALFORMED" }, "location": "file:///app/tests/lakehouse/in/feature/append_load/streaming_with_terminators/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", 
"db_table": "test_db.streaming_with_terminators_table", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/append_load/streaming_with_terminators/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/append_load/streaming_with_terminators/data" } ], "terminate_specs": [ { "function": "optimize_dataset", "args": { "db_table": "test_db.streaming_with_terminators_table", "debug": true } } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "group_article", "type": "string", "nullable": true, "metadata": {} }, { "name": "article_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "db_table": "test_db.dq_sales", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/data" } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/data/control/dq_control_success.csv ================================================ 
checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/data/source/part-01.csv ================================================ salesorder|item|date|group_article|article_number|amount 1|1|20160601|IE4089|IE4019|1000 1|2|20160601|IE4088|IE4018|2000 1|3|20160601|IE4087|IE4017|500 2|1|20170215|IE4086|IE4016|100 2|2|20170215|IE4085|IE4015|500 2|3|20170215|IE4084|IE4014|300 3|1|20170215|IE4083|IE4013|2000 3|2|20170215|IE4082|IE4012|1200 3|3|20170215|IE4081|IE4011|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/data/source/part-02.csv ================================================ salesorder|item|date|group_article|article_number|amount 1|1|20160601|IE4099|IE4039|1000 1|2|20160601|IE4098|IE4038|2000 1|3|20160601|IE4097|IE4037|500 2|1|20170215|IE4096|IE4036|100 2|2|20170215|IE4095|IE4035|500 2|3|20170215|IE4094|IE4034|300 3|1|20170215|IE4093|IE4033|2000 3|2|20170215|IE4092|IE4032|1200 3|3|20170215|IE4091|IE4031|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/dq_sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "group_article", "type": "string", "nullable": true, "metadata": {} }, { "name": "article_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", 
"nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "group_article", "type": "string", "nullable": true, "metadata": {} }, { "name": "article_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/data", "schema": { "type": "struct", 
"fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "db_table": "test_db.dq_sales", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/data" } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/data/control/dq_control_success.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/data/source/part-02.csv 
================================================ salesorder|item|date|customer|article|amount 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/dq_sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": 
"article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/data", "schema": { "type": "struct", "fields": [ { "name": "VBELN", "type": "string", "nullable": true, "metadata": {} }, { "name": "EDATU", "type": "date", "nullable": true, "metadata": {} }, { "name": "MBDAT", "type": "string", "nullable": true, "metadata": {} }, { "name": "ERDAT", "type": "date", "nullable": true, "metadata": {} }, { "name": "ERDATA", "type": "string", "nullable": true, "metadata": {} }, { "name": "BPDAT", "type": "date", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "db_table": "test_db.dq_sales", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/data" } ] } ================================================ FILE: 
tests/resources/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/data/control/dq_control_success.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/data/source/part-01.csv ================================================ VBELN|EDATU|MBDAT|ERDAT|ERDATA|BPDAT 2001|2029-01-12|2023-11-21|2022-08-07|2022-08-07|2023-09-04 2002|2029-01-12|2020-01-01|2020-01-04|2019-08-07|2023-10-14 2003|2019-01-12|2023-03-21|2009-01-14|2012-08-07|2024-12-24 ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/data/source/part-02.csv ================================================ VBELN|EDATU|MBDAT|ERDAT|ERDATA|BPDAT 2004|2029-01-12|2022-04-21|2010-05-04|2020-08-07|2024-11-04 2005|2013-01-12|2022-05-21|2013-01-11|2022-05-21|2024-09-12 ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/dq_sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "VBELN", "type": "string", "nullable": true, "metadata": {} }, { "name": "EDATU", "type": "date", "nullable": true, "metadata": {} }, { "name": "MBDAT", "type": "string", "nullable": true, "metadata": {} }, { "name": "ERDAT", "type": "date", "nullable": true, "metadata": {} }, { "name": "ERDATA", "type": "string", "nullable": true, "metadata": {} }, { "name": "BPDAT", "type": "date", "nullable": true, "metadata": {} } ] } 
================================================ FILE: tests/resources/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/data", "schema": { "type": "struct", "fields": [ { "name": "VBELN", "type": "string", "nullable": true, "metadata": {} }, { "name": "EDATU", "type": "date", "nullable": true, "metadata": {} }, { "name": "MBDAT", "type": "string", "nullable": true, "metadata": {} }, { "name": "ERDAT", "type": "date", "nullable": true, "metadata": {} }, { "name": "ERDATA", "type": "string", "nullable": true, "metadata": {} }, { "name": "BPDAT", "type": "date", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/data", "schema": { "type": 
"struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "db_table": "test_db.dq_sales", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/data" } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/data/control/dq_control_success.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|2016-06-01T12:00:00|customer1|article1|1000 1|2|2016-06-01T12:00:00|customer1|article2|2000 1|3|2016-06-01T12:00:00|customer1|article3|500 2|1|2017-02-15T12:00:00|customer2|article4|100 2|2|2017-02-15T12:00:00|customer2|article6|500 2|3|2017-02-15T12:00:00|customer2|article1|300 3|1|2017-02-15T12:00:00|customer1|article5|2000 3|2|2017-02-15T12:00:00|customer1|article2|1200 3|3|2017-02-15T12:00:00|customer1|article4|900 ================================================ FILE: 
tests/resources/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 4|1|2017-04-30T12:00:00|customer3|article3|800 4|2|2017-04-30T12:00:00|customer3|article7|700 4|3|2017-04-30T12:00:00|customer3|article1|300 4|4|2017-04-30T12:00:00|customer3|article2|500 5|1|2017-05-10T12:00:00|customer4|article6|1500 5|2|2017-05-10T12:00:00|customer4|article3|1000 5|3|2017-05-10T12:00:00|customer4|article5|800 6|1|2017-06-01T12:00:00|customer2|article4|1000 6|2|2017-06-01T12:00:00|customer2|article1|500 6|3|2017-06-01T12:00:00|customer2|article2|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/dq_sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": 
"item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_values_to_be_date_not_older_than/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "number", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "db_table": "test_db.dq_sales", "location": 
"file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/data" } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/data/control/dq_control_success.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/data/source/part-01.csv ================================================ salesorder|item|number|customer|article|amount 1|1|4061622965678|customer1|article1|1000 1|2|4061622965678|customer1|article2|2000 1|3|4061622965678|customer1|article3|500 2|1|4061622965678|customer2|article4|100 2|2|4061622965678|customer2|article6|500 2|3|4061622965678|customer2|article1|300 3|1|4061622965678|customer1|article5|2000 3|2|4061622965678|customer1|article2|1200 3|3|4061622965678|customer1|article4|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 4|1|4061622965678|customer3|article3|800 4|2|4061622965678|customer3|article7|700 4|3|4061622965678|customer3|article1|300 4|4|4061622965678|customer3|article2|500 5|1|4061622965678|customer4|article6|1500 5|2|4061622965678|customer4|article3|1000 5|3|4061622965678|customer4|article5|800 6|1|4061622965678|customer2|article4|1000 6|2|4061622965678|customer2|article1|500 6|3|4061622965678|customer2|article2|900 ================================================ FILE: 
tests/resources/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/dq_sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "number", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "number", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/data", "options": { "checkpointLocation": 
"file:///app/tests/lakehouse/out/feature/custom_expectations/expect_column_values_to_not_be_null_or_empty_string/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "itemcode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "db_table": "test_db.dq_sales", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/data" } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/data/control/dq_control_success.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source 
================================================ FILE: tests/resources/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/data/source/part-01.csv ================================================ salesorder|item|itemcode|date|customer|article|amount 1|1|1|20160601|customer1|article1|1000 1|2|2|20160601|customer1|article2|2000 1|3|3|20160601|customer1|article3|500 2|1|1|20170215|customer2|article4|100 2|2|2|20170215|customer2|article6|500 2|3|3|20170215|customer2|article1|300 3|1|1|20170215|customer1|article5|2000 3|2|2|20170215|customer1|article2|1200 3|3|3|20170215|customer1|article4|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/data/source/part-02.csv ================================================ salesorder|item|itemcode|date|customer|article|amount 4|1|1|20170430|customer3|article3|800 4|2|2|20170430|customer3|article7|700 4|3|3|20170430|customer3|article1|300 4|4|4|20170430|customer3|article2|500 5|1|1|20170510|customer4|article6|1500 5|2|2|20170510|customer4|article3|1000 5|3|3|20170510|customer4|article5|800 6|1|1|20170601|customer2|article4|1000 6|2|2|20170601|customer2|article1|500 6|3|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/dq_sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "itemcode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": 
"string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "itemcode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/custom_expectations/expect_queried_column_agg_value_to_be/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": 
"file:///app/tests/lakehouse/in/feature/custom_expectations/expect_queried_column_agg_value_to_be/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "itemcode", "type": "string", "nullable": true, "metadata": {} }, { "name": "year", "type": "string", "nullable": true, "metadata": {} }, { "name": "month", "type": "string", "nullable": true, "metadata": {} }, { "name": "day", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "db_table": "test_db.dq_sales", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_queried_column_agg_value_to_be/data" } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_queried_column_agg_value_to_be/data/control/dq_control_success.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source ================================================ FILE: tests/resources/feature/custom_expectations/expect_queried_column_agg_value_to_be/data/source/part-01.csv ================================================ salesorder|item|itemcode|year|month|day|customer|article|amount 1|1|1|2016|06|01|customer1|article1|1000 1|2|2|2016|06|01|customer1|article2|2000 1|3|3|2016|06|01|customer1|article3|500 2|1|1|2017|02|15|customer2|article4|100 
2|2|2|2017|02|15|customer2|article6|500 2|3|3|2017|02|15|customer2|article1|300 3|1|1|2015|10|09|customer1|article5|2000 3|2|2|2015|10|09|customer1|article2|1200 3|3|3|2015|10|09|customer1|article4|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_queried_column_agg_value_to_be/data/source/part-02.csv ================================================ salesorder|item|itemcode|year|month|day|customer|article|amount 4|1|1|2020|04|30|customer3|article3|800 4|2|2|2020|04|30|customer3|article7|700 4|3|3|2021|11|31|customer3|article1|300 4|4|4|2021|11|31|customer3|article2|500 5|1|1|2022|01|01|customer4|article6|1500 5|2|2|2022|01|01|customer4|article3|1000 5|3|3|2022|01|01|customer4|article5|800 6|1|1|2010|06|29|customer2|article4|1000 6|2|2|2010|06|29|customer2|article1|500 6|3|3|2010|06|29|customer2|article2|900 ================================================ FILE: tests/resources/feature/custom_expectations/expect_queried_column_agg_value_to_be/dq_sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "itemcode", "type": "string", "nullable": true, "metadata": {} }, { "name": "year", "type": "string", "nullable": true, "metadata": {} }, { "name": "month", "type": "string", "nullable": true, "metadata": {} }, { "name": "day", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/custom_expectations/expect_queried_column_agg_value_to_be/streaming.json ================================================ { 
"input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/custom_expectations/expect_queried_column_agg_value_to_be/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "itemcode", "type": "string", "nullable": true, "metadata": {} }, { "name": "year", "type": "string", "nullable": true, "metadata": {} }, { "name": "month", "type": "string", "nullable": true, "metadata": {} }, { "name": "day", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_queried_column_agg_value_to_be/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/custom_expectations/expect_queried_column_agg_value_to_be/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/calculate_kpi/control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "amount", "type": "long", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/calculate_kpi/data/control/part-01.csv 
================================================ date|amount 20160601|3500 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/calculate_kpi/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/calculate_kpi/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/delta_load/data/control/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|15000 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|20000 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|5000 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|1000 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|5000 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|3000 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|20000 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|7000 
20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|4000 20180110120052t|request1|3|1|1|4|4||20170430|customer3|article2|7000 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|15000 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|10000 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|8000 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|10000 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|5000 00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|9000 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|12000 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/delta_load/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 00000000000000t|0|0|0|0|1|1|N|20160601|customer1|article1|100 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2|N|20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 
00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/delta_load/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/delta_load/data/source/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|2|1|14|4|4||20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/delta_load/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|3|1|1|4|4||20170430|customer3|article2|70 
20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/sql_transformation/control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "amount", "type": "long", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/sql_transformation/data/control/part-01.csv ================================================ date|amount 20160601|3500 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/sql_transformation/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/data_loader_custom_transformer/sql_transformation/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: 
tests/resources/feature/data_quality/build_data_docs/with_data_docs_local_fs/20240410-080323-dq_success-sales_orders-checkpoint/20240410T080323.289170Z/7ba399ea28cc40bf8c79213a440aeb91.json ================================================ { "evaluation_parameters": {}, "meta": { "active_batch_definition": { "batch_identifiers": { "input_id": "sales_orders", "spec_id": "dq_success", "timestamp": "20240410080151" }, "data_asset_name": "dq_success-sales_orders", "data_connector_name": "dq_success-sales_orders-data_connector", "datasource_name": "dq_success-sales_orders-datasource" }, "batch_markers": { "ge_load_time": "20240410T080323.295280Z" }, "batch_spec": { "batch_data": "SparkDataFrame", "data_asset_name": "dq_success-sales_orders" }, "checkpoint_id": null, "checkpoint_name": "dq_success-sales_orders-checkpoint", "expectation_suite_name": "dq_success-sales_orders-validator", "great_expectations_version": "0.18.8", "run_id": { "run_name": "20240410-080323-dq_success-sales_orders-checkpoint", "run_time": "2024-04-10T08:03:23.289170+00:00" }, "validation_id": null, "validation_time": "20240410T080323.296161Z" }, "results": [ { "exception_info": { "exception_message": null, "exception_traceback": null, "raised_exception": false }, "expectation_config": { "expectation_type": "expect_column_to_exist", "kwargs": { "batch_id": "7ba399ea28cc40bf8c79213a440aeb91", "column": "article" }, "meta": {} }, "meta": {}, "result": {}, "success": true }, { "exception_info": { "exception_message": null, "exception_traceback": null, "raised_exception": false }, "expectation_config": { "expectation_type": "expect_table_row_count_to_be_between", "kwargs": { "batch_id": "7ba399ea28cc40bf8c79213a440aeb91", "max_value": 50, "min_value": 0 }, "meta": {} }, "meta": {}, "result": { "observed_value": 34 }, "success": true } ], "statistics": { "evaluated_expectations": 2, "success_percent": 100.0, "successful_expectations": 2, "unsuccessful_expectations": 0 }, "success": true } 
================================================ FILE: tests/resources/feature/data_quality/build_data_docs/without_data_docs_local_fs/20240409-143548-dq_validator-sales_source-checkpoint/20240409T143548.454043Z/f0d7bd293d22bcfd3c1fec5a7d566638.json ================================================ { "evaluation_parameters": {}, "meta": { "active_batch_definition": { "batch_identifiers": { "input_id": "sales_source", "spec_id": "dq_validator", "timestamp": "20240409143443" }, "data_asset_name": "dq_validator-sales_source", "data_connector_name": "dq_validator-sales_source-data_connector", "datasource_name": "dq_validator-sales_source-datasource" }, "batch_markers": { "ge_load_time": "20240409T143548.465215Z" }, "batch_spec": { "batch_data": "SparkDataFrame", "data_asset_name": "dq_validator-sales_source" }, "checkpoint_id": null, "checkpoint_name": "dq_validator-sales_source-checkpoint", "expectation_suite_name": "dq_validator-sales_source-validator", "great_expectations_version": "0.18.8", "run_id": { "run_name": "20240409-143548-dq_validator-sales_source-checkpoint", "run_time": "2024-04-09T14:35:48.454043+00:00" }, "validation_id": null, "validation_time": "20240409T143548.466032Z" }, "results": [ { "exception_info": { "exception_message": null, "exception_traceback": null, "raised_exception": false }, "expectation_config": { "expectation_type": "expect_table_row_count_to_be_between", "kwargs": { "batch_id": "f0d7bd293d22bcfd3c1fec5a7d566638", "max_value": 34, "min_value": 34 }, "meta": {} }, "meta": {}, "result": { "observed_value": 34 }, "success": true }, { "exception_info": { "exception_message": null, "exception_traceback": null, "raised_exception": false }, "expectation_config": { "expectation_type": "expect_table_column_count_to_be_between", "kwargs": { "batch_id": "f0d7bd293d22bcfd3c1fec5a7d566638", "max_value": 12, "min_value": 12 }, "meta": {} }, "meta": {}, "result": { "observed_value": 12 }, "success": true } ], "statistics": { 
"evaluated_expectations": 2, "success_percent": 100.0, "successful_expectations": 2, "unsuccessful_expectations": 0 }, "success": true } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/control/data_validator.json ================================================ {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":19,"min_value":19,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":19,"column":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, null, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} 
{"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":12,"min_value":12,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":12,"column":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, null, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":9,"min_value":9,"evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":9,"column":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":12,"min_value":12,"evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":12,"column":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", 
"customer"], "processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_column_to_exist","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":"fake_column","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/control/data_validator_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, 
"name": "success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "validation_result_identifier", "nullable": true, "type": "string" }, { "metadata": {}, "name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source", "nullable": true, "type": "string" }, { "metadata": {}, "name": "batch_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "max_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "min_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "sum_total", "nullable": true, "type": "float" }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "evaluated_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "success_percent", "nullable": true, "type": "double" }, { "metadata": {}, "name": "successful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unsuccessful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "expectation_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "exception_info", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "exception_message", "nullable": true, "type": "string" }, { "metadata": {}, "name": "exception_traceback", "nullable": true, "type": "string" }, { "metadata": {}, "name": "raised_exception", "nullable": true, "type": "boolean" } ], "type": "struct" } }, { "metadata": {}, "name": "meta", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "column", "nullable": true, "type": "string" 
}, { "metadata": {}, "name": "dq_check_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_rule_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "execution_point", "nullable": true, "type": "string" }, { "metadata": {}, "name": "filters", "nullable": true, "type": "string" }, { "metadata": {}, "name": "schema", "nullable": true, "type": "string" }, { "metadata": {}, "name": "table", "nullable": true, "type": "string" } ], "type": "struct" } }, { "metadata": {}, "name": "observed_value", "nullable": true, "type": "long" }, { "metadata": {}, "name": "run_time_year", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_month", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_day", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/control/sales.json ================================================ {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"1","recordmode":"N","customer":"customer1","article":"article1","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"2","recordmode":"N","date":"20160601","customer":"customer1","article":"article2","amount":"200","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"3","recordmode":"N","date":"20160601","customer":"customer1","article":"article3","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"2","item":"1","recordmode":"N","date":"20170215","customer":"customer2","article":"article4","amount":"10","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"2","item":"3","recordmode":"N","date":"20170215","customer":"customer2","article":"article1","amount":"30","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"3","item":"1","recordmode":"N","date":"20170215","customer":"customer1","article":"article5","amount":"200","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"1","recordmode":"N","date":"20170510","customer":"customer4","article":"article6","amount":"150","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"2","recordmode":"N","date":"20170510","customer":"customer4","article":"article3","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"3","recordmode":"N","date":"20170510","customer":"customer4","article":"article5","amount":"80","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"1","recordmode":"N","date":"20170601","customer":"customer2","article":"article4","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"2","recordmode":"N","date":"20170601","customer":"customer2","article":"article1","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"3","recordmode":"N","date":"20170601","customer":"customer2","article":"article2","amount":"90","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"3","salesorder":"1","item":"1","date":"20160601","customer":"customer1","article":"article1","amount":"150","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"5","salesorder":"2","item":"2","date":"20170215","customer":"customer2","article":"article2","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"2","partno":"1","record":"2","salesorder":"4","item":"4","date":"20170430","customer":"customer3","article":"article2","amount":"70","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"1","salesorder":"7","item":"1","recordmode":"N","date":"20180110","customer":"customer5","article":"article2","amount":"120","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"20180110130103t","request":"request2","datapakid":"1","partno":"1","record":"4","salesorder":"4","item":"1","date":"20170430","customer":"customer3","article":"article3","amount":"70","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110130103t","request":"request2","datapakid":"1","partno":"1","record":"6","salesorder":"4","item":"3","recordmode":"N","date":"20170430","customer":"customer3","article":"article1","amount":"40","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/control/sales_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "actrequest_timestamp", "nullable": true, "type": "string" }, { "metadata": {}, "name": "request", "nullable": true, "type": "string" }, { "metadata": {}, "name": "datapakid", "nullable": true, "type": "string" }, { "metadata": {}, "name": "partno", "nullable": true, "type": "string" }, { "metadata": {}, "name": "record", "nullable": true, "type": "string" }, { "metadata": {}, "name": "salesorder", "nullable": true, "type": "string" }, { "metadata": {}, "name": "item", "nullable": true, "type": "string" }, { "metadata": {}, "name": "recordmode", "nullable": true, "type": "string" }, { "metadata": {}, "name": "date", "nullable": true, "type": "string" }, { "metadata": {}, "name": "customer", "nullable": true, "type": "string" }, { "metadata": {}, "name": "article", "nullable": true, "type": "string" }, { "metadata": {}, "name": "amount", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_validations", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "run_name", 
"nullable": true, "type": "string" }, { "metadata": {}, "name": "run_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "raised_exceptions", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "run_row_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "dq_failure_details", "nullable": true, "type": { "containsNull": true, "elementType": { "fields": [ { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" } ], "type": "struct" }, "type": "array" } } ], "type": "struct" } } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/dq_functions/test_db.dq_functions_source_load_with_dq_table_delta_with_dupl_tag_gen_fail_init.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_table_row_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 19, "max_value": 19} rule_2|expect_table_column_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 12, "max_value": 12} rule_3|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/dq_functions/test_db.dq_functions_source_load_with_dq_table_delta_with_dupl_tag_gen_fail_new.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_table_row_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 9, "max_value": 9} rule_2|expect_table_column_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 12, "max_value": 12} 
rule_3|expect_column_to_exist|in_motion|test_db|dummy_sales|fake_column|{"column": "fake_column"} rule_4|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 
00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/source/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 
20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/streaming_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { 
"function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "prisma", "dq_db_table": "test_db.dq_functions_source_load_with_dq_table_delta_with_dupl_tag_gen_fail_init", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/dq", "result_sink_format": "delta", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "dq_table_table_filter": "dummy_sales", "tag_source_data": true, "source": "condensed_sales", "data_product_name": "delta_with_dupl_tag_gen_fail" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/streaming_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ 
"salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "prisma", "dq_db_table": "test_db.dq_functions_source_load_with_dq_table_delta_with_dupl_tag_gen_fail_new", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/dq", "result_sink_format": "delta", "dq_table_table_filter": "dummy_sales", "tag_source_data": true, "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "source": "condensed_sales", "data_product_name": "delta_with_dupl_tag_gen_fail" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')" }, "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_dupl_tag_gen_fail/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/control/data_validator.json ================================================ 
{"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"19.0","min_value":"19.0","evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":19,"column":null,"column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys":"1, 1, null, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} 
{"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"12.0","min_value":"12.0","evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":12,"column":null,"column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys":"1, 1, null, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} 
{"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_multicolumn_sum_to_equal","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":null,"column_A":null,"column_B":null,"column_list":"[salesorder, request]","sum_total":"5.0", "unexpected_index_list":[{"run_success":false,"customer":"customer1","date":null,"item":"1","request":"0","salesorder":"1"},{"run_success":false,"customer":"customer1","date":20160601,"item":"2","request":"0","salesorder":"1"},{"run_success":false,"customer":"customer1","date":20160601,"item":"3","request":"0","salesorder":"1"},{"run_success":false,"customer":"customer2","date":20170215,"item":"1","request":"0","salesorder":"2"},{"run_success":false,"customer":"customer1","date":20170215,"item":"3","request":"0","salesorder":"3"},{"run_success":false,"customer":"customer3","date":20170430,"item":"4","request":"0","salesorder":"4"},{"run_success":false,"customer":"customer2","date":20170601,"item":"2","request":"0","salesorder":"6"},{"run_success":false,"customer":"customer2","date":20170215,"item":"2","request":"0","salesorder":"2"},{"run_success":false,"customer":"customer2","date":20170215,"item":"3","request":"0","salesorder":"2"},{"run_success":false,"customer":"customer1","date":20170215,"item":"1","request":"0","salesorder":"3"},{"run_success":false,"customer":"customer1","date":20170215,"item":"2","request":"0","salesorder":"3"},{"r
un_success":false,"customer":"customer3","date":20170430,"item":"1","request":"0","salesorder":"4"},{"run_success":false,"customer":"customer3","date":20170430,"item":"2","request":"0","salesorder":"4"},{"run_success":false,"customer":"customer3","date":20170430,"item":"3","request":"0","salesorder":"4"},{"run_success":false,"customer":"customer2","date":20170601,"item":"1","request":"0","salesorder":"6"},{"run_success":false,"customer":"customer2","date":20170601,"item":"3","request":"0","salesorder":"6"}],"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys":"1, 1, null, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"9.0","min_value":"9.0","evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":9,"column":null,"column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"12.0","min_value":"12.0","evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":12,"column":null,"column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_column_values_to_be_in_set","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":"salesorder","column_A":null,"column_B":null, 
"unexpected_index_list":[{"run_success":false,"customer":"customer5","date":"20180110","item":"1","salesorder":"7"}],"value_set":"[1, 2, 3, 4, 5]","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"3.0","min_value":"3.0","evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_column_value_lengths_to_be_between","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":"amount","column_A":null,"column_B":null, "unexpected_index_list":[{"run_success":false,"amount":"70","customer":"customer3","date":20170430,"item":"4","salesorder":"4"},{"run_success":false,"amount":"50","customer":"customer2","date":20170215,"item":2,"salesorder":2},{"run_success":false,"amount":"70","customer":"customer3","date":20170430,"item":"1","salesorder":"4"},{"run_success":false,"amount":"80","customer":"customer3","date":20170430,"item":"2","salesorder":"4"},{"run_success":false,"amount":"40","customer":"customer3","date":20170430,"item":"3","salesorder":"4"}],"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], 
"processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_column_to_exist","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":"fake_column","column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_column_pair_values_to_be_equal","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":null,"column_A":"datapakid","column_B":"partno", "unexpected_index_list":[{"run_success":false,"datapakid":"2","salesorder":"4","customer":"customer3","date":20170430,"item":"4","partno":"1"}],"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys": "1, 1, 20160601, customer1||2, 2, 20170215, customer2||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||7, 1, 20180110, customer5"} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/control/data_validator_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "validation_result_identifier", "nullable": true, "type": "string" }, { "metadata": {}, 
"name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source", "nullable": true, "type": "string" }, { "metadata": {}, "name": "batch_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column_list", "nullable": true, "type": "string" }, { "metadata": {}, "name": "max_value", "nullable": true, "type": "string" }, { "metadata": {}, "name": "min_value", "nullable": true, "type": "string" }, { "metadata": {}, "name": "sum_total", "nullable": true, "type": "string" }, { "metadata": {}, "name": "evaluated_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "success_percent", "nullable": true, "type": "double" }, { "metadata": {}, "name": "successful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unsuccessful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "expectation_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "exception_info", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "exception_message", "nullable": true, "type": "string" }, { "metadata": {}, "name": "exception_traceback", "nullable": true, "type": "string" }, { "metadata": {}, "name": "raised_exception", "nullable": true, "type": "boolean" } ], "type": "struct" } }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": { "fields": [ { "metadata": {}, "name": "customer", "nullable": true, "type": "string" }, { "metadata": {}, "name": "date", "nullable": true, "type": "string" }, { "metadata": {}, "name": "item", "nullable": true, "type": "string" }, { "metadata": {}, "name": "request", "nullable": true, "type": "string" }, { 
"metadata": {}, "name": "salesorder", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "amount", "nullable": true, "type": "string" }, { "metadata": {}, "name": "datapakid", "nullable": true, "type": "string" }, { "metadata": {}, "name": "partno", "nullable": true, "type": "string" } ], "type": "struct" }, "type": "array" } }, { "metadata": {}, "name": "meta", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_check_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_rule_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "execution_point", "nullable": true, "type": "string" }, { "metadata": {}, "name": "filters", "nullable": true, "type": "string" }, { "metadata": {}, "name": "schema", "nullable": true, "type": "string" }, { "metadata": {}, "name": "table", "nullable": true, "type": "string" } ], "type": "struct" } }, { "metadata": {}, "name": "observed_value", "nullable": true, "type": "long" }, { "metadata": {}, "name": "run_time_year", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_month", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_day", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column_A", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column_B", "nullable": true, "type": "string" }, { "metadata": {}, "name": "value_set", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": 
"struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/control/sales.json ================================================ {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"1","recordmode":"N","customer":"customer1","article":"article1","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"2","recordmode":"N","date":"20160601","customer":"customer1","article":"article2","amount":"200","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"3","recordmode":"N","date":"20160601","customer":"customer1","article":"article3","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"2","item":"1","recordmode":"N","date":"20170215","customer":"customer2","article":"article4","amount":"10","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"2","item":"3","recordmode":"N","date":"20170215","customer":"customer2","article":"article1","amount":"30","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"3","item":"1","recordmode":"N","date":"20170215","customer":"customer1","article":"article5","amount":"200","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"1","recordmode":"N","date":"20170510","customer":"customer4","article":"article6","amount":"150","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"2","recordmode":"N","date":"20170510","customer":"customer4","article":"article3","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"3","recordmode":"N","date":"20170510","customer":"customer4","article":"article5","amount":"80","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"1","recordmode":"N","date":"20170601","customer":"customer2","article":"article4","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"2","recordmode":"N","date":"20170601","customer":"customer2","article":"article1","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"3","recordmode":"N","date":"20170601","customer":"customer2","article":"article2","amount":"90","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"3","salesorder":"1","item":"1","date":"20160601","customer":"customer1","article":"article1","amount":"150","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"5","salesorder":"2","item":"2","date":"20170215","customer":"customer2","article":"article2","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_value_lengths_to_be_between","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"amount\",\"max_value\":3.0,\"min_value\":3.0}"}]}} 
{"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"2","partno":"1","record":"2","salesorder":"4","item":"4","date":"20170430","customer":"customer3","article":"article2","amount":"70","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_pair_values_to_be_equal","kwargs":"{\"column_A\":\"datapakid\",\"column_B\":\"partno\"}"},{"expectation_type":"expect_column_value_lengths_to_be_between","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"amount\",\"max_value\":3.0,\"min_value\":3.0}"}]}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"1","salesorder":"7","item":"1","recordmode":"N","date":"20180110","customer":"customer5","article":"article2","amount":"120","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_values_to_be_in_set","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"salesorder\",\"value_set\":[1,2,3,4,5]}"}]}} {"actrequest_timestamp":"20180110130103t","request":"request2","datapakid":"1","partno":"1","record":"4","salesorder":"4","item":"1","date":"20170430","customer":"customer3","article":"article3","amount":"70","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_value_lengths_to_be_between","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"amount\",\"max_value\":3.0,\"min_value\":3.0}"}]}} 
{"actrequest_timestamp":"20180110130103t","request":"request2","datapakid":"1","partno":"1","record":"6","salesorder":"4","item":"3","recordmode":"N","date":"20170430","customer":"customer3","article":"article1","amount":"40","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_value_lengths_to_be_between","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"amount\",\"max_value\":3.0,\"min_value\":3.0}"}]}} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/control/sales_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "actrequest_timestamp", "nullable": true, "type": "string" }, { "metadata": {}, "name": "request", "nullable": true, "type": "string" }, { "metadata": {}, "name": "datapakid", "nullable": true, "type": "string" }, { "metadata": {}, "name": "partno", "nullable": true, "type": "string" }, { "metadata": {}, "name": "record", "nullable": true, "type": "string" }, { "metadata": {}, "name": "salesorder", "nullable": true, "type": "string" }, { "metadata": {}, "name": "item", "nullable": true, "type": "string" }, { "metadata": {}, "name": "recordmode", "nullable": true, "type": "string" }, { "metadata": {}, "name": "date", "nullable": true, "type": "string" }, { "metadata": {}, "name": "customer", "nullable": true, "type": "string" }, { "metadata": {}, "name": "article", "nullable": true, "type": "string" }, { "metadata": {}, "name": "amount", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_validations", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": 
"raised_exceptions", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "run_row_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "dq_failure_details", "nullable": true, "type": { "containsNull": true, "elementType": { "fields": [ { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" } ], "type": "struct" }, "type": "array" } } ], "type": "struct" } } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/dq_functions/test_db.dq_functions_source_load_with_dq_table_delta_with_duplicates_tag_init.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_table_row_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 19, "max_value": 19} rule_2|expect_table_column_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 12, "max_value": 12} rule_3|expect_multicolumn_sum_to_equal|in_motion|test_db|dummy_sales|salesorder,request|{"column_list": ["salesorder", "request"], "sum_total": 5} rule_4|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} rule_5|expect_multicolumn_sum_to_equal|in_motion|test_db|dummy_sales|salesorder,request|{"column_list": ["salesorder", "request"], "sum_total": 5} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/dq_functions/test_db.dq_functions_source_load_with_dq_table_delta_with_duplicates_tag_new.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_table_row_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 9, "max_value": 9} 
rule_2|expect_table_column_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 12, "max_value": 12} rule_3|expect_column_values_to_be_in_set|in_motion|test_db|dummy_sales|salesorder|{"column": "salesorder", "value_set": [1, 2, 3, 4, 5]} rule_4|expect_column_value_lengths_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 3, "max_value": 3} rule_5|expect_column_to_exist|in_motion|test_db|dummy_sales|fake_column|{"column": "fake_column"} rule_6|expect_column_pair_values_to_be_equal|in_motion|test_db|dummy_sales|datapakid, partno|{"column_A": "datapakid", "column_B": "partno"} rule_7|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 
00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 
20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/source/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 ================================================ FILE: 
tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/streaming_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "prisma", "dq_db_table": "test_db.dq_functions_source_load_with_dq_table_delta_with_duplicates_tag_init", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/dq", "result_sink_format": "delta", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "dq_table_table_filter": "dummy_sales", "tag_source_data": true, "source": "condensed_sales", "data_product_name": "delta_with_duplicates_tag" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: 
tests/resources/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/streaming_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "prisma", "dq_db_table": "test_db.dq_functions_source_load_with_dq_table_delta_with_duplicates_tag_new", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/dq", "result_sink_format": "delta", "tag_source_data": true, "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "dq_table_table_filter": "dummy_sales", "source": "condensed_sales", "data_product_name": "delta_with_duplicates_tag" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')" }, "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/delta_with_duplicates_tag/checkpoint" } } ], "exec_env": { 
"spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_table/full_overwrite_tag/data" } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "sales_source", "dq_type": "prisma", "dq_db_table": "test_db.dq_functions_source_load_with_dq_table_full_overwrite_tag_init", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/full_overwrite_tag/dq", "result_sink_format": "delta", "result_sink_extra_columns": ["validation_results.result.*"], "source": "sales", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "dq_table_table_filter": "dummy_sales", "tag_source_data": true, "data_product_name": "full_overwrite_tag" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/full_overwrite_tag/data" } ] } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/batch_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_table/full_overwrite_tag/data" } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "sales_source", "dq_type": "prisma", "dq_db_table": 
"test_db.dq_functions_source_load_with_dq_table_full_overwrite_tag_init", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/full_overwrite_tag/dq", "result_sink_format": "delta", "result_sink_extra_columns": ["validation_results.result.*"], "source": "sales", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "dq_table_table_filter": "dummy_sales", "tag_source_data": true, "data_product_name": "full_overwrite_tag" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_table/full_overwrite_tag/data" } ], "exec_env": { "spark.sql.sources.partitionColumnTypeInference.enabled": false } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/data/control/data_validator.json ================================================ {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_1","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_1","source":"sales","batch_id":"batch_id_1","column":"article","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_column_to_exist","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_1","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys":"1, 1, 20160601, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, 
customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_2","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_2","source":"sales","batch_id":"batch_id_2","max_value":50,"min_value":3,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_2","observed_value":19,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys":"1, 1, 20160601, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_1","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_1","source":"sales","batch_id":"batch_id_1","column":"article","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_column_to_exist","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_1","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys":"1, 1, 20160601, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_2","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_2","source":"sales","batch_id":"batch_id_2","max_value":50,"min_value":3,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_2","observed_value":19,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"], "processed_keys":"1, 1, 20160601, customer1||1, 2, 20160601, customer1||1, 3, 20160601, customer1||2, 1, 20170215, customer2||2, 2, 20170215, customer2||2, 3, 20170215, customer2||3, 1, 20170215, customer1||3, 2, 20170215, customer1||3, 3, 20170215, customer1||4, 1, 20170430, customer3||4, 2, 20170430, customer3||4, 3, 20170430, customer3||4, 4, 20170430, customer3||5, 1, 20170510, customer4||5, 2, 20170510, customer4||5, 3, 20170510, customer4||6, 1, 20170601, customer2||6, 2, 20170601, customer2||6, 3, 20170601, customer2"} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/data/control/data_validator_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, "name": "success", "nullable": true, "type": "boolean" 
}, { "metadata": {}, "name": "validation_result_identifier", "nullable": true, "type": "string" }, { "metadata": {}, "name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source", "nullable": true, "type": "string" }, { "metadata": {}, "name": "batch_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "max_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "min_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "evaluated_expectations", "nullable": true, "type": "float" }, { "metadata": {}, "name": "success_percent", "nullable": true, "type": "double" }, { "metadata": {}, "name": "successful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unsuccessful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "expectation_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "exception_info", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "exception_message", "nullable": true, "type": "string" }, { "metadata": {}, "name": "exception_traceback", "nullable": true, "type": "string" }, { "metadata": {}, "name": "raised_exception", "nullable": true, "type": "boolean" } ], "type": "struct" } }, { "metadata": {}, "name": "meta", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_check_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_rule_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "execution_point", "nullable": true, "type": "string" }, { "metadata": {}, "name": "filters", "nullable": true, "type": "string" }, { "metadata": {}, 
"name": "schema", "nullable": true, "type": "string" }, { "metadata": {}, "name": "table", "nullable": true, "type": "string" } ], "type": "struct" } }, { "metadata": {}, "name": "observed_value", "nullable": true, "type": "long" }, { "metadata": {}, "name": "run_time_year", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_month", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_day", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/data/control/sales.json ================================================ {"salesorder":"1","item":"1","date":"20160601","customer":"customer1","article":"article1","amount":"10000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"1","item":"2","date":"20160601","customer":"customer1","article":"article2","amount":"20000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"1","item":"3","date":"20160601","customer":"customer1","article":"article3","amount":"5000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"salesorder":"2","item":"1","date":"20170215","customer":"customer2","article":"article4","amount":"1000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"2","item":"2","date":"20170215","customer":"customer2","article":"article6","amount":"5000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"2","item":"3","date":"20170215","customer":"customer2","article":"article1","amount":"3000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"3","item":"1","date":"20170215","customer":"customer1","article":"article5","amount":"20000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"3","item":"2","date":"20170215","customer":"customer1","article":"article2","amount":"12000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"3","item":"3","date":"20170215","customer":"customer1","article":"article4","amount":"9000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"4","item":"1","date":"20170430","customer":"customer3","article":"article3","amount":"8000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"4","item":"2","date":"20170430","customer":"customer3","article":"article7","amount":"7000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"salesorder":"4","item":"3","date":"20170430","customer":"customer3","article":"article1","amount":"3000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"4","item":"4","date":"20170430","customer":"customer3","article":"article2","amount":"5000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"5","item":"1","date":"20170510","customer":"customer4","article":"article6","amount":"15000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"5","item":"2","date":"20170510","customer":"customer4","article":"article3","amount":"10000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"5","item":"3","date":"20170510","customer":"customer4","article":"article5","amount":"8000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"6","item":"1","date":"20170601","customer":"customer2","article":"article4","amount":"10000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"6","item":"2","date":"20170601","customer":"customer2","article":"article1","amount":"5000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"6","item":"3","date":"20170601","customer":"customer2","article":"article2","amount":"9000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/data/control/sales_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "salesorder", "nullable": true, "type": "string" }, { "metadata": {}, "name": "item", "nullable": true, "type": "string" }, { "metadata": {}, "name": "date", "nullable": true, "type": "string" }, { "metadata": {}, "name": "customer", "nullable": true, "type": "string" }, { "metadata": {}, "name": "article", "nullable": true, "type": "string" }, { "metadata": {}, "name": "amount", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_validations", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "raised_exceptions", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "run_row_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "dq_failure_details", "nullable": true, "type": { "containsNull": true, "elementType": { "fields": [ { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" } ], "type": "struct" }, "type": "array" } } ], "type": "struct" } } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/data/dq_functions/test_db.dq_functions_source_load_with_dq_table_full_overwrite_tag_init.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_to_exist|in_motion|test_db|dummy_sales|article|{"column": "article"} rule_2|expect_table_row_count_to_be_between|in_motion|test_db|dummy_sales||{"min_value": 3, "max_value": 50} 
rule_3|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/data/dq_functions/test_db.dq_functions_source_load_with_dq_table_full_overwrite_tag_new.csv ================================================ dq_rule_id|dq_check_type|dq_tech_function|execution_point|schema|table|column|filters|arguments rule_1|COLUMN EXISTS|expect_column_to_exist|in_motion|test_db|dummy_sales|article||{"column": "article"} rule_2|ROW COUNT|expect_table_row_count_to_be_between|in_motion|test_db|dummy_sales|||{"min_value": 3, "max_value": 50} rule_3|TABLE STRUCTURE|expect_wrong_expectation|at_rest|test_db|no_table|amount||{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_table/full_overwrite_tag/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 
1|1|20160601|customer1|article1|10000 1|2|20160601|customer1|article2|20000 1|3|20160601|customer1|article3|5000 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 5|1|20170510|customer4|article6|15000 5|2|20170510|customer4|article3|10000 5|3|20170510|customer4|article5|8000 6|1|20170601|customer2|article4|10000 6|2|20170601|customer2|article1|5000 6|3|20170601|customer2|article2|9000 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data/control/data_validator.json ================================================ {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":19,"min_value":19,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":19,"column":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":12,"min_value":12,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":12,"column":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":9,"min_value":9,"evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":9,"column":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":12,"min_value":12,"evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":12,"column":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_column_to_exist","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":"fake_column","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data/control/data_validator_schema.json ================================================ { "fields": [ { 
"metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, "name": "success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "validation_result_identifier", "nullable": true, "type": "string" }, { "metadata": {}, "name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source", "nullable": true, "type": "string" }, { "metadata": {}, "name": "batch_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "max_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "min_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "sum_total", "nullable": true, "type": "float" }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "evaluated_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "success_percent", "nullable": true, "type": "double" }, { "metadata": {}, "name": "successful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unsuccessful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "expectation_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "exception_info", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "exception_message", "nullable": true, "type": "string" }, { 
"metadata": {}, "name": "exception_traceback", "nullable": true, "type": "string" }, { "metadata": {}, "name": "raised_exception", "nullable": true, "type": "boolean" } ], "type": "struct" } }, { "metadata": {}, "name": "meta", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_check_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_rule_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "execution_point", "nullable": true, "type": "string" }, { "metadata": {}, "name": "filters", "nullable": true, "type": "string" }, { "metadata": {}, "name": "schema", "nullable": true, "type": "string" }, { "metadata": {}, "name": "table", "nullable": true, "type": "string" } ], "type": "struct" } }, { "metadata": {}, "name": "observed_value", "nullable": true, "type": "long" }, { "metadata": {}, "name": "run_time_year", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_month", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_day", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data/control/sales.json ================================================ 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"1","recordmode":"N","customer":"customer1","article":"article1","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"2","recordmode":"N","date":"20160601","customer":"customer1","article":"article2","amount":"200","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"3","recordmode":"N","date":"20160601","customer":"customer1","article":"article3","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"2","item":"1","recordmode":"N","date":"20170215","customer":"customer2","article":"article4","amount":"10","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"2","item":"3","recordmode":"N","date":"20170215","customer":"customer2","article":"article1","amount":"30","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"3","item":"1","recordmode":"N","date":"20170215","customer":"customer1","article":"article5","amount":"200","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"1","recordmode":"N","date":"20170510","customer":"customer4","article":"article6","amount":"150","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"2","recordmode":"N","date":"20170510","customer":"customer4","article":"article3","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"3","recordmode":"N","date":"20170510","customer":"customer4","article":"article5","amount":"80","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"1","recordmode":"N","date":"20170601","customer":"customer2","article":"article4","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"2","recordmode":"N","date":"20170601","customer":"customer2","article":"article1","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"3","recordmode":"N","date":"20170601","customer":"customer2","article":"article2","amount":"90","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"3","salesorder":"1","item":"1","date":"20160601","customer":"customer1","article":"article1","amount":"150","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"5","salesorder":"2","item":"2","date":"20170215","customer":"customer2","article":"article2","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"2","partno":"1","record":"2","salesorder":"4","item":"4","date":"20170430","customer":"customer3","article":"article2","amount":"70","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"1","salesorder":"7","item":"1","recordmode":"N","date":"20180110","customer":"customer5","article":"article2","amount":"120","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110130103t","request":"request2","datapakid":"1","partno":"1","record":"4","salesorder":"4","item":"1","date":"20170430","customer":"customer3","article":"article3","amount":"70","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"20180110130103t","request":"request2","datapakid":"1","partno":"1","record":"6","salesorder":"4","item":"3","recordmode":"N","date":"20170430","customer":"customer3","article":"article1","amount":"40","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data/control/sales_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "actrequest_timestamp", "nullable": true, "type": "string" }, { "metadata": {}, "name": "request", "nullable": true, "type": "string" }, { "metadata": {}, "name": "datapakid", "nullable": true, "type": "string" }, { "metadata": {}, "name": "partno", "nullable": true, "type": "string" }, { "metadata": {}, "name": "record", "nullable": true, "type": "string" }, { "metadata": {}, "name": "salesorder", "nullable": true, "type": "string" }, { "metadata": {}, "name": "item", "nullable": true, "type": "string" }, { "metadata": {}, "name": "recordmode", "nullable": true, "type": "string" }, { "metadata": {}, "name": "date", "nullable": 
true, "type": "string" }, { "metadata": {}, "name": "customer", "nullable": true, "type": "string" }, { "metadata": {}, "name": "article", "nullable": true, "type": "string" }, { "metadata": {}, "name": "amount", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_validations", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "raised_exceptions", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "run_row_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "dq_failure_details", "nullable": true, "type": { "containsNull": true, "elementType": { "fields": [ { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" } ], "type": "struct" }, "type": "array" } } ], "type": "struct" } } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 
00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 
20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data/source/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 
20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/streaming_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "validator", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/dq", "result_sink_db_table": "test_db.validator_delta_with_dupl_tag_gen_fail", "result_sink_format": "delta", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "tag_source_data": true, "source": "condensed_sales", "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args":{ "min_value": 19, "max_value": 19 } }, { "function": "expect_table_column_count_to_be_between", "args":{ "min_value": 12, "max_value": 12 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": 
"file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/streaming_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "validator", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/dq", "result_sink_db_table": "test_db.validator_delta_with_dupl_tag_gen_fail", "result_sink_format": "delta", "tag_source_data": true, "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "source": "condensed_sales", "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args":{ "min_value": 9, "max_value": 9 } }, { "function": "expect_table_column_count_to_be_between", "args":{ "min_value": 12, "max_value": 12 } }, { "function": "expect_column_to_exist", "args": { "column": "fake_column" } } ] } ], 
"output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')" }, "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_dupl_tag_gen_fail/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data/control/data_validator.json ================================================ {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"19.0","min_value":"19.0","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":"19","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"12.0","min_value":"12.0","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":"12","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"9.0","min_value":"9.0","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":"9","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"12.0","min_value":"12.0","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":"12","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data/control/data_validator_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "validation_result_identifier", "nullable": true, "type": "string" }, { "metadata": {}, "name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source", "nullable": true, "type": "string" }, { "metadata": {}, "name": 
"batch_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "max_value", "nullable": true, "type": "string" }, { "metadata": {}, "name": "min_value", "nullable": true, "type": "string" }, { "metadata": {}, "name": "evaluated_expectations", "nullable": true, "type": "float" }, { "metadata": {}, "name": "success_percent", "nullable": true, "type": "double" }, { "metadata": {}, "name": "successful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unsuccessful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "expectation_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "exception_info", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "exception_message", "nullable": true, "type": "string" }, { "metadata": {}, "name": "exception_traceback", "nullable": true, "type": "string" }, { "metadata": {}, "name": "raised_exception", "nullable": true, "type": "boolean" } ], "type": "struct" } }, { "metadata": {}, "name": "meta", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_check_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_rule_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "execution_point", "nullable": true, "type": "string" }, { "metadata": {}, "name": "filters", "nullable": true, "type": "string" }, { "metadata": {}, "name": "schema", "nullable": true, "type": "string" }, { "metadata": {}, "name": "table", "nullable": true, "type": "string" } ], "type": "struct" } }, { "metadata": {}, "name": "observed_value", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time_year", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_month", "nullable": true, "type": "integer" }, { "metadata": {}, 
"name": "run_time_day", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 
00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data/source/part-03.csv ================================================ 
actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates/streaming_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data" } ], 
"transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "validator", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates/dq", "result_sink_db_table": "test_db.validator_delta_with_duplicates", "result_sink_format": "json", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "source": "condensed_sales", "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args":{ "min_value": 19, "max_value": 19 } }, { "function": "expect_table_column_count_to_be_between", "args":{ "min_value": 12, "max_value": 12 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates/streaming_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data" } ], 
"transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "validator", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates/dq", "result_sink_db_table": "test_db.validator_delta_with_duplicates", "result_sink_format": "json", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "source": "condensed_sales", "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args":{ "min_value": 9, "max_value": 9 } }, { "function": "expect_table_column_count_to_be_between", "args":{ "min_value": 12, "max_value": 12 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')" }, "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data/control/data_validator.json ================================================ 
{"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"19.0","min_value":"19.0","evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":"19","column":null,"column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"12.0","min_value":"12.0","evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":"12","column":null,"column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":3,"success_percent":66.66666666666666,"successful_expectations":2,"unsuccessful_expectations":1,"expectation_type":"expect_multicolumn_sum_to_equal","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":null,"column_A":null,"column_B":null,"column_list":"[salesorder, request]","sum_total":"5.0", "unexpected_index_list":[{"run_success":false,"customer":"customer1","date":null,"item":"1","request":"0","salesorder":"1"},{"run_success":false,"customer":"customer1","date":20160601,"item":"2","request":"0","salesorder":"1"},{"run_success":false,"customer":"customer1","date":20160601,"item":"3","request":"0","salesorder":"1"},{"run_success":false,"customer":"customer2","date":20170215,"item":"1","request":"0","salesorder":"2"},{"run_success":false,"customer":"customer1","date":20170215,"item":"3","request":"0","salesorder":"3"},{"run_success":false,"customer":"customer3","date":20170430,"item":"4","request":"0","salesorder":"4"},{"run_success":false,"customer":"customer2","date":20170601,"item":"2","request":"0","salesorder":"6"},{"run_success":false,"customer":"customer2","date":20170215,"item":"2","request":"0","salesorder":"2"},{"run_success":false,"customer":"customer2","date":20170215,"item":"3","request":"0","salesorder":"2"},{"run_success":false,"customer":"customer1","date":20170215,"item":"1","request":"0","salesorder":"3"},{"run_success":false,"customer":"customer1","date":20170215,"item":"2","request":"0","salesorder":"3"},{"r
un_success":false,"customer":"customer3","date":20170430,"item":"1","request":"0","salesorder":"4"},{"run_success":false,"customer":"customer3","date":20170430,"item":"2","request":"0","salesorder":"4"},{"run_success":false,"customer":"customer3","date":20170430,"item":"3","request":"0","salesorder":"4"},{"run_success":false,"customer":"customer2","date":20170601,"item":"1","request":"0","salesorder":"6"},{"run_success":false,"customer":"customer2","date":20170601,"item":"3","request":"0","salesorder":"6"}],"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"9.0","min_value":"9.0","evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":"9","column":null,"column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"12.0","min_value":"12.0","evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_table_column_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info","observed_value":"12","column":null,"column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_column_values_to_be_in_set","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":"salesorder","column_A":null,"column_B":null, "unexpected_index_list":[{"run_success":false,"customer":"customer5","date":"20180110","item":"1","salesorder":"7"}],"value_set":"[1, 2, 3, 4, 5]","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", 
"customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":"3.0","min_value":"3.0","evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_column_value_lengths_to_be_between","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":"amount","column_A":null,"column_B":null, "unexpected_index_list":[{"run_success":false,"amount":"70","customer":"customer3","date":20170430,"item":"4","salesorder":"4"},{"run_success":false,"amount":"50","customer":"customer2","date":20170215,"item":2,"salesorder":2},{"run_success":false,"amount":"70","customer":"customer3","date":20170430,"item":"1","salesorder":"4"},{"run_success":false,"amount":"80","customer":"customer3","date":20170430,"item":"2","salesorder":"4"},{"run_success":false,"amount":"40","customer":"customer3","date":20170430,"item":"3","salesorder":"4"}],"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_column_to_exist","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":"fake_column","column_A":null,"column_B":null,"unexpected_index_list":null,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T10:40:13.053632+00:00","run_results":"run_results_for_all_expectations","success":false,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"condensed_sales","validation_results":"validation_results","source":"condensed_sales","batch_id":"batch_id","max_value":null,"min_value":null,"evaluated_expectations":6,"success_percent":33.33333333333333,"successful_expectations":2,"unsuccessful_expectations":4,"expectation_type":"expect_column_pair_values_to_be_equal","expectation_success":false,"kwargs":"kwargs","exception_info":"exception_info","observed_value":null,"column":null,"column_A":"datapakid","column_B":"partno", "unexpected_index_list":[{"run_success":false,"datapakid":"2","salesorder":"4","customer":"customer3","date":20170430,"item":"4","partno":"1"}],"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", 
"customer"]} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data/control/data_validator_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, "name": "success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "validation_result_identifier", "nullable": true, "type": "string" }, { "metadata": {}, "name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source", "nullable": true, "type": "string" }, { "metadata": {}, "name": "batch_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column_list", "nullable": true, "type": "string" }, { "metadata": {}, "name": "max_value", "nullable": true, "type": "string" }, { "metadata": {}, "name": "min_value", "nullable": true, "type": "string" }, { "metadata": {}, "name": "sum_total", "nullable": true, "type": "string" }, { "metadata": {}, "name": "evaluated_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "success_percent", "nullable": true, "type": "double" }, { "metadata": {}, "name": "successful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unsuccessful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "expectation_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "exception_info", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": 
"exception_message", "nullable": true, "type": "string" }, { "metadata": {}, "name": "exception_traceback", "nullable": true, "type": "string" }, { "metadata": {}, "name": "raised_exception", "nullable": true, "type": "boolean" } ], "type": "struct" } }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": { "fields": [ { "metadata": {}, "name": "customer", "nullable": true, "type": "string" }, { "metadata": {}, "name": "date", "nullable": true, "type": "string" }, { "metadata": {}, "name": "item", "nullable": true, "type": "string" }, { "metadata": {}, "name": "request", "nullable": true, "type": "string" }, { "metadata": {}, "name": "salesorder", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "amount", "nullable": true, "type": "string" }, { "metadata": {}, "name": "datapakid", "nullable": true, "type": "string" }, { "metadata": {}, "name": "partno", "nullable": true, "type": "string" } ], "type": "struct" }, "type": "array" } }, { "metadata": {}, "name": "meta", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_check_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_rule_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "execution_point", "nullable": true, "type": "string" }, { "metadata": {}, "name": "filters", "nullable": true, "type": "string" }, { "metadata": {}, "name": "schema", "nullable": true, "type": "string" }, { "metadata": {}, "name": "table", "nullable": true, "type": "string" } ], "type": "struct" } }, { "metadata": {}, "name": "observed_value", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time_year", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_month", "nullable": true, "type": "integer" }, { 
"metadata": {}, "name": "run_time_day", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column_A", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column_B", "nullable": true, "type": "string" }, { "metadata": {}, "name": "value_set", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data/control/sales.json ================================================ {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"1","recordmode":"N","customer":"customer1","article":"article1","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"2","recordmode":"N","date":"20160601","customer":"customer1","article":"article2","amount":"200","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"1","item":"3","recordmode":"N","date":"20160601","customer":"customer1","article":"article3","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"2","item":"1","recordmode":"N","date":"20170215","customer":"customer2","article":"article4","amount":"10","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"2","item":"3","recordmode":"N","date":"20170215","customer":"customer2","article":"article1","amount":"30","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"3","item":"1","recordmode":"N","date":"20170215","customer":"customer1","article":"article5","amount":"200","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"1","recordmode":"N","date":"20170510","customer":"customer4","article":"article6","amount":"150","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"2","recordmode":"N","date":"20170510","customer":"customer4","article":"article3","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"5","item":"3","recordmode":"N","date":"20170510","customer":"customer4","article":"article5","amount":"80","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"1","recordmode":"N","date":"20170601","customer":"customer2","article":"article4","amount":"100","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"2","recordmode":"N","date":"20170601","customer":"customer2","article":"article1","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"00000000000000t","request":"0","datapakid":"0","partno":"0","record":"0","salesorder":"6","item":"3","recordmode":"N","date":"20170601","customer":"customer2","article":"article2","amount":"90","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_multicolumn_sum_to_equal","kwargs":"{\"column_list\":[\"salesorder\",\"request\"],\"sum_total\":5.0}"}]}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"3","salesorder":"1","item":"1","date":"20160601","customer":"customer1","article":"article1","amount":"150","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":true}} 
{"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"5","salesorder":"2","item":"2","date":"20170215","customer":"customer2","article":"article2","amount":"50","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_value_lengths_to_be_between","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"amount\",\"max_value\":3.0,\"min_value\":3.0}"}]}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"2","partno":"1","record":"2","salesorder":"4","item":"4","date":"20170430","customer":"customer3","article":"article2","amount":"70","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_pair_values_to_be_equal","kwargs":"{\"column_A\":\"datapakid\",\"column_B\":\"partno\"}"},{"expectation_type":"expect_column_value_lengths_to_be_between","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"amount\",\"max_value\":3.0,\"min_value\":3.0}"}]}} {"actrequest_timestamp":"20180110120052t","request":"request1","datapakid":"1","partno":"1","record":"1","salesorder":"7","item":"1","recordmode":"N","date":"20180110","customer":"customer5","article":"article2","amount":"120","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_values_to_be_in_set","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"salesorder\",\"value_set\":[1,2,3,4,5]}"}]}} 
{"actrequest_timestamp":"20180110130103t","request":"request2","datapakid":"1","partno":"1","record":"4","salesorder":"4","item":"1","date":"20170430","customer":"customer3","article":"article3","amount":"70","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_value_lengths_to_be_between","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"amount\",\"max_value\":3.0,\"min_value\":3.0}"}]}} {"actrequest_timestamp":"20180110130103t","request":"request2","datapakid":"1","partno":"1","record":"6","salesorder":"4","item":"3","recordmode":"N","date":"20170430","customer":"customer3","article":"article1","amount":"40","dq_validations":{"run_name":"--dq_validator-condensed_sales--checkpoint","run_success":false,"raised_exceptions":false,"run_row_success":false,"dq_failure_details":[{"expectation_type":"expect_column_value_lengths_to_be_between","kwargs":"{\"batch_id\":\"f254637fcd94414aae931f85b2d20d02\",\"column\":\"amount\",\"max_value\":3.0,\"min_value\":3.0}"}]}} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data/control/sales_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "actrequest_timestamp", "nullable": true, "type": "string" }, { "metadata": {}, "name": "request", "nullable": true, "type": "string" }, { "metadata": {}, "name": "datapakid", "nullable": true, "type": "string" }, { "metadata": {}, "name": "partno", "nullable": true, "type": "string" }, { "metadata": {}, "name": "record", "nullable": true, "type": "string" }, { "metadata": {}, "name": "salesorder", "nullable": true, "type": "string" }, { "metadata": {}, "name": "item", "nullable": true, "type": "string" }, { "metadata": {}, "name": "recordmode", "nullable": true, "type": "string" }, { 
"metadata": {}, "name": "date", "nullable": true, "type": "string" }, { "metadata": {}, "name": "customer", "nullable": true, "type": "string" }, { "metadata": {}, "name": "article", "nullable": true, "type": "string" }, { "metadata": {}, "name": "amount", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_validations", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "raised_exceptions", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "run_row_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "dq_failure_details", "nullable": true, "type": { "containsNull": true, "elementType": { "fields": [ { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" } ], "type": "struct" }, "type": "array" } } ], "type": "struct" } } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 
00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 
20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data/source/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 
20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/streaming_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "validator", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/dq", "result_sink_db_table": "test_db.validator_delta_with_duplicates_tag", "result_sink_format": "delta", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "tag_source_data": true, "source": "condensed_sales", "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args":{ "min_value": 19, "max_value": 19 } }, { "function": "expect_table_column_count_to_be_between", "args":{ "min_value": 12, "max_value": 12 } }, { "function": "expect_multicolumn_sum_to_equal", "args":{ "column_list": ["salesorder", "request"], "sum_total": 5 } } ] } ], "output_specs": [ { "spec_id": 
"sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/streaming_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "condensed_sales", "dq_type": "validator", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/dq", "result_sink_db_table": "test_db.validator_delta_with_duplicates_tag", "result_sink_format": "delta", "tag_source_data": true, "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "source": "condensed_sales", "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args":{ "min_value": 9, "max_value": 9 } }, { "function": "expect_table_column_count_to_be_between", "args":{ "min_value": 12, "max_value": 12 } }, { 
"function": "expect_column_values_to_be_in_set", "args": { "column": "salesorder", "value_set": [1, 2, 3, 4, 5] } }, { "function": "expect_column_value_lengths_to_be_between", "args": { "column": "amount", "min_value": 3, "max_value": 3 } }, { "function": "expect_column_to_exist", "args": { "column": "fake_column" } }, { "function": "expect_column_pair_values_to_be_equal", "args": { "column_A": "datapakid", "column_B": "partno" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')" }, "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/delta_with_duplicates_tag/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/full_overwrite/data" } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "sales_source", "dq_type": "validator", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/full_overwrite/dq", "result_sink_db_table": "test_db.validator_full_overwrite", "result_sink_extra_columns": ["validation_results.result.*"], "source": "sales", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], 
"tag_source_data": true, "dq_functions": [ { "function": "expect_column_to_exist", "args": { "column": "article" } }, { "function": "expect_table_row_count_to_be_between", "args": { "min_value": 3, "max_value": 50 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/full_overwrite/data" } ] } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite/batch_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/full_overwrite/data" } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "sales_source", "dq_type": "validator", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/full_overwrite/dq", "result_sink_db_table": "test_db.validator_full_overwrite", "result_sink_extra_columns": ["validation_results.result.*"], "source": "sales", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "tag_source_data": true, "dq_functions": [ { "function": "expect_column_to_exist", "args": { "column": "article" } }, { "function": "expect_table_row_count_to_be_between", "args": { "min_value": 3, "max_value": 50 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/full_overwrite/data" } ], "exec_env": { "spark.sql.sources.partitionColumnTypeInference.enabled": false } } 
================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite/data/control/data_validator.json ================================================ {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_1","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_1","source":"sales","batch_id":"batch_id_1","column":"article","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_column_to_exist","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_1","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_2","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_2","source":"sales","batch_id":"batch_id_2","max_value":50,"min_value":3,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_2","observed_value":19,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_1","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_1","source":"sales","batch_id":"batch_id_1","column":"article","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_column_to_exist","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_1","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_2","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_2","source":"sales","batch_id":"batch_id_2","max_value":50,"min_value":3,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_2","observed_value":19,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite/data/control/data_validator_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", 
"nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, "name": "success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source", "nullable": true, "type": "string" }, { "metadata": {}, "name": "batch_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "max_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "min_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "evaluated_expectations", "nullable": true, "type": "float" }, { "metadata": {}, "name": "success_percent", "nullable": true, "type": "double" }, { "metadata": {}, "name": "successful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unsuccessful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "expectation_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "exception_info", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "exception_message", "nullable": true, "type": "string" }, { "metadata": {}, "name": "exception_traceback", "nullable": true, "type": "string" }, { "metadata": {}, "name": "raised_exception", "nullable": true, "type": "boolean" } ], "type": "struct" } }, { "metadata": {}, "name": "meta", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { 
"metadata": {}, "name": "dq_check_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_rule_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "execution_point", "nullable": true, "type": "string" }, { "metadata": {}, "name": "filters", "nullable": true, "type": "string" }, { "metadata": {}, "name": "schema", "nullable": true, "type": "string" }, { "metadata": {}, "name": "table", "nullable": true, "type": "string" } ], "type": "struct" } }, { "metadata": {}, "name": "observed_value", "nullable": true, "type": "long" }, { "metadata": {}, "name": "run_time_year", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_month", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_day", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 
6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|10000 1|2|20160601|customer1|article2|20000 1|3|20160601|customer1|article3|5000 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 5|1|20170510|customer4|article6|15000 5|2|20170510|customer4|article3|10000 5|3|20170510|customer4|article5|8000 6|1|20170601|customer2|article4|10000 6|2|20170601|customer2|article1|5000 6|3|20170601|customer2|article2|9000 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite_tag/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data" } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "sales_source", "dq_type": "validator", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/full_overwrite_tag/dq", "result_sink_db_table": "test_db.validator_full_overwrite_tag", "result_sink_extra_columns": ["validation_results.result.*"], "source": "sales", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "tag_source_data": true, "dq_functions": [ { "function": 
"expect_column_to_exist", "args": { "column": "article" } }, { "function": "expect_table_row_count_to_be_between", "args": { "min_value": 3, "max_value": 50 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data" } ] } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite_tag/batch_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data" } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "sales_source", "dq_type": "validator", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/full_overwrite_tag/dq", "result_sink_db_table": "test_db.validator_full_overwrite_tag", "result_sink_extra_columns": ["validation_results.result.*"], "source": "sales", "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "tag_source_data": true, "dq_functions": [ { "function": "expect_column_to_exist", "args": { "column": "article" } }, { "function": "expect_table_row_count_to_be_between", "args": { "min_value": 3, "max_value": 50 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data" } ], "exec_env": { "spark.sql.sources.partitionColumnTypeInference.enabled": false } } 
================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data/control/data_validator.json ================================================ {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_1","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_1","source":"sales","batch_id":"batch_id_1","column":"article","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_column_to_exist","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_1","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config_init","run_name":"20221228-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_2","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_2","source":"sales","batch_id":"batch_id_2","max_value":50,"min_value":3,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_2","observed_value":19,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} 
{"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_1","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_1","source":"sales","batch_id":"batch_id_1","column":"article","evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_column_to_exist","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_1","run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20221229-104013-dq_validator-sales_source-checkpoint","run_time":"2022-12-29T21:40:13.053632+00:00","run_results":"run_results_for_all_expectations_2","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","validation_results":"validation_results_2","source":"sales","batch_id":"batch_id_2","max_value":50,"min_value":3,"evaluated_expectations":2,"success_percent":100.0,"successful_expectations":2,"unsuccessful_expectations":0,"expectation_type":"expect_table_row_count_to_be_between","expectation_success":true,"kwargs":"kwargs","exception_info":"exception_info_2","observed_value":19,"run_time_year":2022,"run_time_month":12,"run_time_day":29,"source_primary_key": ["salesorder", "item", "date", "customer"]} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data/control/data_validator_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", 
"nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "validation_result_identifier", "nullable": true, "type": "string" }, { "metadata": {}, "name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source", "nullable": true, "type": "string" }, { "metadata": {}, "name": "batch_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "max_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "min_value", "nullable": true, "type": "float" }, { "metadata": {}, "name": "evaluated_expectations", "nullable": true, "type": "float" }, { "metadata": {}, "name": "success_percent", "nullable": true, "type": "double" }, { "metadata": {}, "name": "successful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "unsuccessful_expectations", "nullable": true, "type": "long" }, { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "expectation_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "exception_info", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "exception_message", "nullable": true, "type": "string" }, { "metadata": {}, "name": "exception_traceback", "nullable": true, "type": "string" }, { "metadata": {}, "name": "raised_exception", "nullable": true, "type": "boolean" } ], "type": "struct" } }, { "metadata": {}, "name": "meta", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "column", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_check_type", "nullable": true, 
"type": "string" }, { "metadata": {}, "name": "dq_rule_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "execution_point", "nullable": true, "type": "string" }, { "metadata": {}, "name": "filters", "nullable": true, "type": "string" }, { "metadata": {}, "name": "schema", "nullable": true, "type": "string" }, { "metadata": {}, "name": "table", "nullable": true, "type": "string" } ], "type": "struct" } }, { "metadata": {}, "name": "observed_value", "nullable": true, "type": "long" }, { "metadata": {}, "name": "run_time_year", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_month", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "run_time_day", "nullable": true, "type": "integer" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data/control/sales.json ================================================ {"salesorder":"1","item":"1","date":"20160601","customer":"customer1","article":"article1","amount":"10000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"1","item":"2","date":"20160601","customer":"customer1","article":"article2","amount":"20000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"salesorder":"1","item":"3","date":"20160601","customer":"customer1","article":"article3","amount":"5000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"2","item":"1","date":"20170215","customer":"customer2","article":"article4","amount":"1000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"2","item":"2","date":"20170215","customer":"customer2","article":"article6","amount":"5000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"2","item":"3","date":"20170215","customer":"customer2","article":"article1","amount":"3000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"3","item":"1","date":"20170215","customer":"customer1","article":"article5","amount":"20000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"3","item":"2","date":"20170215","customer":"customer1","article":"article2","amount":"12000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"3","item":"3","date":"20170215","customer":"customer1","article":"article4","amount":"9000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"4","item":"1","date":"20170430","customer":"customer3","article":"article3","amount":"8000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"salesorder":"4","item":"2","date":"20170430","customer":"customer3","article":"article7","amount":"7000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"4","item":"3","date":"20170430","customer":"customer3","article":"article1","amount":"3000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"4","item":"4","date":"20170430","customer":"customer3","article":"article2","amount":"5000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"5","item":"1","date":"20170510","customer":"customer4","article":"article6","amount":"15000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"5","item":"2","date":"20170510","customer":"customer4","article":"article3","amount":"10000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"5","item":"3","date":"20170510","customer":"customer4","article":"article5","amount":"8000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"6","item":"1","date":"20170601","customer":"customer2","article":"article4","amount":"10000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} {"salesorder":"6","item":"2","date":"20170601","customer":"customer2","article":"article1","amount":"5000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} 
{"salesorder":"6","item":"3","date":"20170601","customer":"customer2","article":"article2","amount":"9000","dq_validations":{"run_name":"--dq_validator-sales_source--checkpoint","run_success":true,"raised_exceptions":false,"run_row_success":true}} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data/control/sales_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "salesorder", "nullable": true, "type": "string" }, { "metadata": {}, "name": "item", "nullable": true, "type": "string" }, { "metadata": {}, "name": "date", "nullable": true, "type": "string" }, { "metadata": {}, "name": "customer", "nullable": true, "type": "string" }, { "metadata": {}, "name": "article", "nullable": true, "type": "string" }, { "metadata": {}, "name": "amount", "nullable": true, "type": "string" }, { "metadata": {}, "name": "dq_validations", "nullable": true, "type": { "fields": [ { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "raised_exceptions", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "run_row_success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "dq_failure_details", "nullable": true, "type": { "containsNull": true, "elementType": { "fields": [ { "metadata": {}, "name": "expectation_type", "nullable": true, "type": "string" }, { "metadata": {}, "name": "kwargs", "nullable": true, "type": "string" } ], "type": "struct" }, "type": "array" } } ], "type": "struct" } } ], "type": "struct" } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 
1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/full_overwrite_tag/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|10000 1|2|20160601|customer1|article2|20000 1|3|20160601|customer1|article3|5000 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 5|1|20170510|customer4|article6|15000 5|2|20170510|customer4|article3|10000 5|3|20170510|customer4|article5|8000 6|1|20170601|customer2|article4|10000 6|2|20170601|customer2|article1|5000 6|3|20170601|customer2|article2|9000 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/no_transformers/data/control/data_validator.json ================================================ 
{"checkpoint_config":"checkpoint_config_init","run_name":"20220611-211348-dq_validator-sales_source-checkpoint","run_time":"2022-06-11T21:13:48.505870+00:00","run_results":"run_results_for_all_expectations_1","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","source_primary_key": ["salesorder", "item", "date", "customer"]} {"checkpoint_config":"checkpoint_config","run_name":"20220612-211348-dq_validator-sales_source-checkpoint","run_time":"2022-06-12T21:13:48.505870+00:00","run_results":"run_results_for_all_expectations_2","success":true,"validation_result_identifier":"validation_result_identifier","spec_id":"dq_validator","input_id":"sales_source","source_primary_key": ["salesorder", "item", "date", "customer"]} ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/no_transformers/data/control/data_validator_schema.json ================================================ { "fields": [ { "metadata": {}, "name": "checkpoint_config", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_name", "nullable": true, "type": "string" }, { "metadata": {}, "name": "run_time", "nullable": true, "type": "string" }, { "metadata": {}, "name": "validation_results", "nullable": true, "type": "string" }, { "metadata": {}, "name": "success", "nullable": true, "type": "boolean" }, { "metadata": {}, "name": "spec_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "input_id", "nullable": true, "type": "string" }, { "metadata": {}, "name": "source_primary_key", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "unexpected_index_list", "nullable": true, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "metadata": {}, "name": "processed_keys", "nullable": true, "type": "string" } ], "type": "struct" } 
================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/no_transformers/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 
00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/no_transformers/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/no_transformers/data/source/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 
20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/no_transformers/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/no_transformers/streaming_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/no_transformers/data" } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "sales_source", "dq_type": "validator", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/no_transformers/dq", "result_sink_db_table": "test_db.validator_no_transformers", "result_sink_format": 
"json", "result_sink_explode": false, "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args":{ "min_value": 34, "max_value": 34 } }, { "function": "expect_table_column_count_to_be_between", "args":{ "min_value": 12, "max_value": 12 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "dq_validator", "write_type": "append", "data_format": "delta", "db_table": "test_db.test_no_transformers", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/no_transformers/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/no_transformers/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/load_with_dq_validator/no_transformers/streaming_new.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/data_quality/load_with_dq_validator/no_transformers/data" } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "sales_source", "dq_type": "validator", "cache_df": true, "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/no_transformers/dq", "result_sink_db_table": "test_db.validator_no_transformers", "result_sink_format": "json", "result_sink_explode": false, "unexpected_rows_pk": ["salesorder", "item", "date", "customer"], "dq_functions": [ { "function": "expect_table_row_count_to_be_between", "args":{ "min_value": 26, "max_value": 26 } }, { "function": "expect_table_column_count_to_be_between", "args":{ "min_value": 12, "max_value": 12 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", 
"input_id": "dq_validator", "write_type": "append", "data_format": "delta", "db_table": "test_db.test_no_transformers", "location": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/no_transformers/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/data_quality/load_with_dq_validator/no_transformers/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/data_quality/validator/data/control/data_validator.csv ================================================ checkpoint_config|run_name|run_time|validation_results|success|validation_result_identifier|spec_id|input_id|source_primary_key|processed_keys checkpoint_config|20220612-221423-validator-sales_orders-checkpoint|2022-06-12T22:14:23.625852+00:00|validation_results_for_all_expectations|true|validation_result_identifier|dq_success|sales_orders|["salesorder", "item", "date", "customer"]| checkpoint_config2|20220613-221423-validator-sales_orders-checkpoint2|2022-06-12T22:14:23.625852+00:00|validation_results_for_all_expectations2|false|validation_result_identifier|dq_failure_error_disabled|sales_orders|["salesorder", "item", "date", "customer"]| ================================================ FILE: tests/resources/feature/data_quality/validator/data/dq_functions/test_db.dq_functions_source_dq_failure.csv ================================================ dq_rule_id|dq_check_type|dq_tech_function|execution_point|schema|table|column|filters|arguments rule_1|COLUMN EXISTS|expect_column_to_exist|at_rest|test_db|dummy_sales|article||{"column": "article"} rule_2|ROW COUNT|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|||{"min_value": 0, "max_value": 1} ================================================ FILE: tests/resources/feature/data_quality/validator/data/dq_functions/test_db.dq_functions_source_dq_failure_error_disabled.csv 
================================================ dq_rule_id|dq_check_type|dq_tech_function|execution_point|schema|table|column|filters|arguments rule_1|ROW COUNT|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|||{"min_value": 0, "max_value": 1} ================================================ FILE: tests/resources/feature/data_quality/validator/data/dq_functions/test_db.dq_functions_source_dq_failure_max_percentage.csv ================================================ dq_rule_id|dq_check_type|dq_tech_function|execution_point|schema|table|column|filters|arguments rule_1|COLUMN EXISTS|expect_column_to_exist|at_rest|test_db|dummy_sales|article||{"column": "article"} rule_2|ROW COUNT|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|||{"min_value": 0, "max_value": 1} ================================================ FILE: tests/resources/feature/data_quality/validator/data/dq_functions/test_db.dq_functions_source_dq_success.csv ================================================ dq_rule_id|dq_check_type|dq_tech_function|execution_point|schema|table|column|filters|arguments rule_1|COLUMN EXISTS|expect_column_to_exist|at_rest|test_db|dummy_sales|article||{"column": "article"} rule_2|ROW COUNT|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|||{"min_value": 0, "max_value": 50} ================================================ FILE: tests/resources/feature/data_quality/validator/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 
00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "csv", 
"schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch/data" }, { "spec_id": "sales_silver", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch/data" } ], "transform_specs": [ { "spec_id": "max_sales_silver_timestamp", "input_id": "sales_silver", "transformers": [ { "function": "get_max_value", "args": { "input_col": "extraction_date" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_bronze", "transformers": [ { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } }, { "function": "incremental_filter", "args": { "input_col": "extraction_date", "increment_df": "max_sales_silver_timestamp" } }, { "function": "with_auto_increment_id" }, { "function": "group_and_rank", "args": { "group_key": [ "salesorder", "item" ], "ranking_key": [ "extraction_date", "changed_on", "lhe_row_id" ] } } ] } ], "output_specs": [ { "spec_id": "sales_silver", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on", "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } 
================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_bronze", "transformers": [ { "function": "with_auto_increment_id" }, { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } }, { "function": "group_and_rank", "args": { "group_key": [ "salesorder", "item" ], "ranking_key": [ "extraction_date", "changed_on", "lhe_row_id" ] } } ] } ], "output_specs": [ { "spec_id": "sales_silver", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on", "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/control_batch_schema.json ================================================ { "type": 
"struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "event", "type": "string", "nullable": true, "metadata": {} }, { "name": "changed_on", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "lhe_row_id", "type": "integer", "nullable": true, "metadata": {} }, { "name": "extraction_date", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/control_streaming_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "event", "type": "string", "nullable": true, "metadata": {} }, { "name": "changed_on", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "extraction_date", "type": "string", "nullable": true, "metadata": {} }, { "name": "lhe_batch_id", "type": "integer", "nullable": true, "metadata": {} }, { "name": "lhe_row_id", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: 
tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/data/control/batch.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount|lhe_row_id|extraction_date 1|1|shipped|20200811|20160601|customer1|article1|150|2|202108111500000000 1|2|created|20200811|20160601|customer1|article2|200|1|202108111400000000 1|3|created|20200811|20160601|customer1|article3|50|2|202108111400000000 2|1|created|20200811|20170215|customer2|article4|10|3|202108111400000000 2|2|shipped|20200811|20170215|customer2|article2|50|0|202108111600000000 2|3|created|20200811|20170215|customer2|article1|30|5|202108111400000000 3|1|created|20200811|20170215|customer1|article5|200|6|202108111400000000 3|2|released|20200811|20170215|customer1|article2|120|4|202108111500000000 3|3|released|20200811|20170215|customer1|article4|90|5|202108111500000000 4|1|cancelled|20200811|20170430|customer3|article3|100|1|202108111600000000 4|2|released|20200811|20170430|customer3|article7|80|2|202108111600000000 4|4|released|20200811|20170430|customer3|article2|60|4|202108111600000000 5|1|created|20200811|20170510|customer4|article6|150|13|202108111400000000 5|2|created|20200811|20170510|customer4|article3|100|14|202108111400000000 5|3|created|20200811|20170510|customer4|article5|80|15|202108111400000000 6|1|created|20200811|20170601|customer2|article4|100|16|202108111400000000 6|2|created|20200811|20170601|customer2|article1|50|17|202108111400000000 6|3|created|20200811|20170601|customer2|article2|90|18|202108111400000000 7|1|cancelled|20200811|20180110|customer5|article2|120|0|202108111500000000 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/data/control/streaming.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount|extraction_date|lhe_batch_id|lhe_row_id 
1|1|shipped|20200811|20160601|customer1|article1|150|202108111500000000|4|2 1|2|created|20200811|20160601|customer1|article2|200|202108111400000000|3|1 1|3|created|20200811|20160601|customer1|article3|50|202108111400000000|3|2 2|1|created|20200811|20170215|customer2|article4|10|202108111400000000|3|3 2|2|shipped|20200811|20170215|customer2|article2|50|202108111600000000|5|0 2|3|created|20200811|20170215|customer2|article1|30|202108111400000000|3|5 3|1|created|20200811|20170215|customer1|article5|200|202108111400000000|3|6 3|2|released|20200811|20170215|customer1|article2|120|202108111500000000|4|4 3|3|released|20200811|20170215|customer1|article4|90|202108111500000000|4|5 4|1|cancelled|20200811|20170430|customer3|article3|100|202108111600000000|5|1 4|2|released|20200811|20170430|customer3|article7|80|202108111600000000|5|2 4|4|released|20200811|20170430|customer3|article2|60|202108111600000000|5|4 5|1|created|20200811|20170510|customer4|article6|150|202108111400000000|3|13 5|2|created|20200811|20170510|customer4|article3|100|202108111400000000|3|14 5|3|created|20200811|20170510|customer4|article5|80|202108111400000000|3|15 6|1|created|20200811|20170601|customer2|article4|100|202108111400000000|3|16 6|2|created|20200811|20170601|customer2|article1|50|202108111400000000|3|17 6|3|created|20200811|20170601|customer2|article2|90|202108111400000000|3|18 7|1|cancelled|20200811|20180110|customer5|article2|120|202108111500000000|4|0 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/data/source/WE_SO_SCL_202108111400000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 1|1|created|20200811|20160601|customer1|article1|100 1|2|created|20200811|20160601|customer1|article2|200 1|3|created|20200811|20160601|customer1|article3|50 2|1|created|20200811|20170215|customer2|article4|10 
2|2|created|20200811|20170215|customer2|article6|50 2|3|created|20200811|20170215|customer2|article1|30 3|1|created|20200811|20170215|customer1|article5|200 3|2|created|20200811|20170215|customer1|article2|120 3|3|created|20200811|20170215|customer1|article4|90 4|1|created|20200811|20170430|customer3|article3|80 4|2|created|20200811|20170430|customer3|article7|70 4|3|created|20200811|20170430|customer3|article1|30 4|4|created|20200811|20170430|customer3|article2|50 5|1|created|20200811|20170510|customer4|article6|150 5|2|created|20200811|20170510|customer4|article3|100 5|3|created|20200811|20170510|customer4|article5|80 6|1|created|20200811|20170601|customer2|article4|100 6|2|created|20200811|20170601|customer2|article1|50 6|3|created|20200811|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/data/source/WE_SO_SCL_202108111500000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 7|1|cancelled|20200811|20180110|customer5|article2|120 7|1|created|20200811|20180110|customer5|article2|120 1|1|shipped|20200811|20160601|customer1|article1|150 2|2|released|20200811|20170215|customer2|article2|50 3|2|released|20200811|20170215|customer1|article2|120 3|3|released|20200811|20170215|customer1|article4|90 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/data/source/WE_SO_SCL_202108111600000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 2|2|shipped|20200811|20170215|customer2|article2|50 4|1|cancelled|20200811|20170430|customer3|article3|100 4|2|released|20200811|20170430|customer3|article7|80 4|3|deleted|20200811|20170430|customer3|article1|30 4|4|released|20200811|20170430|customer3|article2|60 
================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "event", "type": "string", "nullable": true, "metadata": {} }, { "name": "changed_on", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/streaming_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_bronze", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/streaming/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/streaming/data" } ], "transform_specs": [ { "spec_id": "sales_bronze_with_extraction_date", "input_id": "sales_bronze", "transformers": [ { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } }, { "function": "with_auto_increment_id" }, { "function": "group_and_rank", "args": { "group_key": [ "salesorder", "item" ], "ranking_key": [ "extraction_date", "changed_on", 
"lhe_row_id" ] } } ] } ], "output_specs": [ { "spec_id": "sales_silver", "input_id": "sales_bronze_with_extraction_date", "write_type": "merge", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/streaming/checkpoint" }, "with_batch_id": true, "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on", "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch/data" }, { "spec_id": "sales_silver", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch/data" } ], "transform_specs": [ { "spec_id": "max_sales_silver_timestamp", "input_id": "sales_silver", "transformers": [ { "function": "get_max_value", "args": { "input_col": "extraction_date" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_bronze", "transformers": [ { "function": 
"with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } }, { "function": "incremental_filter", "args": { "input_col": "extraction_date", "increment_df": "max_sales_silver_timestamp" } }, { "function": "with_auto_increment_id" }, { "function": "group_and_rank", "args": { "group_key": [ "salesorder", "item" ], "ranking_key": [ "extraction_date", "changed_on", "lhe_row_id" ] } } ] } ], "output_specs": [ { "spec_id": "sales_silver", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on", "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_bronze", "transformers": [ { "function": "with_auto_increment_id" }, { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", 
"drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } }, { "function": "group_and_rank", "args": { "group_key": [ "salesorder", "item" ], "ranking_key": [ "extraction_date", "changed_on", "lhe_row_id" ] } } ] } ], "output_specs": [ { "spec_id": "sales_silver", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/batch/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on", "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/control_batch_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "event", "type": "string", "nullable": true, "metadata": {} }, { "name": "changed_on", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "lhe_row_id", "type": "integer", "nullable": true, "metadata": {} }, { "name": "extraction_date", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/control_streaming_schema.json 
================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "event", "type": "string", "nullable": true, "metadata": {} }, { "name": "changed_on", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "extraction_date", "type": "string", "nullable": true, "metadata": {} }, { "name": "lhe_batch_id", "type": "integer", "nullable": true, "metadata": {} }, { "name": "lhe_row_id", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/data/control/batch.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount|lhe_row_id|extraction_date 1|1|shipped|20200811|20160601|customer1|article1|150|2|202108111500000000 1|2|created|20200811|20160601|customer1|article2|200|1|202108111400000000 1|3|created|20200811|20160601|customer1|article3|50|2|202108111400000000 2|1|created|20200811|20170215|customer2|article4|10|3|202108111400000000 2|2|shipped|20200811|20170215|customer2|article2|50|0|202108111600000000 2|3|created|20200811|20170215|customer2|article1|30|5|202108111400000000 3|1|created|20200811|20170215|customer1|article5|200|6|202108111400000000 3|2|released|20200811|20170215|customer1|article2|120|4|202108111500000000 3|3|released|20200811|20170215|customer1|article4|90|5|202108111500000000 4|1|cancelled|20200811|20170430|customer3|article3|100|1|202108111600000000 
4|2|released|20200811|20170430|customer3|article7|80|2|202108111600000000 4|4|released|20200811|20170430|customer3|article2|60|4|202108111600000000 5|1|created|20200811|20170510|customer4|article6|150|13|202108111400000000 5|2|created|20200811|20170510|customer4|article3|100|14|202108111400000000 5|3|created|20200811|20170510|customer4|article5|80|15|202108111400000000 6|1|created|20200811|20170601|customer2|article4|100|16|202108111400000000 6|2|created|20200811|20170601|customer2|article1|50|17|202108111400000000 6|3|created|20200811|20170601|customer2|article2|90|18|202108111400000000 7|1|cancelled|20200811|20180110|customer5|article2|120|1|202108111500000000 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/data/control/streaming.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount|extraction_date|lhe_batch_id|lhe_row_id 1|1|shipped|20200811|20160601|customer1|article1|150|202108111500000000|4|2 1|2|created|20200811|20160601|customer1|article2|200|202108111400000000|3|1 1|3|created|20200811|20160601|customer1|article3|50|202108111400000000|3|2 2|1|created|20200811|20170215|customer2|article4|10|202108111400000000|3|3 2|2|shipped|20200811|20170215|customer2|article2|50|202108111600000000|5|0 2|3|created|20200811|20170215|customer2|article1|30|202108111400000000|3|5 3|1|created|20200811|20170215|customer1|article5|200|202108111400000000|3|6 3|2|released|20200811|20170215|customer1|article2|120|202108111500000000|4|4 3|3|released|20200811|20170215|customer1|article4|90|202108111500000000|4|5 4|1|cancelled|20200811|20170430|customer3|article3|100|202108111600000000|5|1 4|2|released|20200811|20170430|customer3|article7|80|202108111600000000|5|2 4|4|released|20200811|20170430|customer3|article2|60|202108111600000000|5|4 5|1|created|20200811|20170510|customer4|article6|150|202108111400000000|3|13 
5|2|created|20200811|20170510|customer4|article3|100|202108111400000000|3|14 5|3|created|20200811|20170510|customer4|article5|80|202108111400000000|3|15 6|1|created|20200811|20170601|customer2|article4|100|202108111400000000|3|16 6|2|created|20200811|20170601|customer2|article1|50|202108111400000000|3|17 6|3|created|20200811|20170601|customer2|article2|90|202108111400000000|3|18 7|1|cancelled|20200811|20180110|customer5|article2|120|202108111500000000|4|1 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/data/source/WE_SO_SCL_202108111400000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 1|1|created|20200811|20160601|customer1|article1|100 1|2|created|20200811|20160601|customer1|article2|200 1|3|created|20200811|20160601|customer1|article3|50 2|1|created|20200811|20170215|customer2|article4|10 2|2|created|20200811|20170215|customer2|article6|50 2|3|created|20200811|20170215|customer2|article1|30 3|1|created|20200811|20170215|customer1|article5|200 3|2|created|20200811|20170215|customer1|article2|120 3|3|created|20200811|20170215|customer1|article4|90 4|1|created|20200811|20170430|customer3|article3|80 4|2|created|20200811|20170430|customer3|article7|70 4|3|created|20200811|20170430|customer3|article1|30 4|4|created|20200811|20170430|customer3|article2|50 5|1|created|20200811|20170510|customer4|article6|150 5|2|created|20200811|20170510|customer4|article3|100 5|3|created|20200811|20170510|customer4|article5|80 6|1|created|20200811|20170601|customer2|article4|100 6|2|created|20200811|20170601|customer2|article1|50 6|3|created|20200811|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/data/source/WE_SO_SCL_202108111500000000.csv ================================================ 
salesorder|item|event|changed_on|date|customer|article|amount 7|1|created|20200811|20180110|customer5|article2|120 7|1|cancelled|20200811|20180110|customer5|article2|120 1|1|shipped|20200811|20160601|customer1|article1|150 2|2|released|20200811|20170215|customer2|article2|50 3|2|released|20200811|20170215|customer1|article2|120 3|3|released|20200811|20170215|customer1|article4|90 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/data/source/WE_SO_SCL_202108111600000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 2|2|shipped|20200811|20170215|customer2|article2|50 4|1|cancelled|20200811|20170430|customer3|article3|100 4|2|released|20200811|20170430|customer3|article7|80 4|3|deleted|20200811|20170430|customer3|article1|30 4|4|released|20200811|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "event", "type": "string", "nullable": true, "metadata": {} }, { "name": "changed_on", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming_delta.json ================================================ { "input_specs": 
[ { "spec_id": "sales_bronze", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data" } ], "transform_specs": [ { "spec_id": "sales_bronze_with_extraction_date", "input_id": "sales_bronze", "transformers": [ { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } }, { "function": "with_auto_increment_id" }, { "function": "group_and_rank", "args": { "group_key": [ "salesorder", "item" ], "ranking_key": [ "extraction_date", "changed_on", "lhe_row_id" ] } }, { "function": "repartition", "args": { "num_partitions": 1 } } ] } ], "output_specs": [ { "spec_id": "sales_silver", "input_id": "sales_bronze_with_extraction_date", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/checkpoint" }, "with_batch_id": true, "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on", "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/control_batch_schema.json ================================================ { "type": 
"struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "event", "type": "string", "nullable": true, "metadata": {} }, { "name": "changed_on", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "lhe_row_id", "type": "integer", "nullable": true, "metadata": {} }, { "name": "extraction_date", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/insert_column_set/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "example_bronze", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/insert_column_set/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/insert_column_set/data" }, { "spec_id": "example_silver", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/insert_column_set/data" } ], "transform_specs": [ { "spec_id": "max_sales_silver_timestamp", "input_id": "example_silver", "transformers": [ { "function": "get_max_value", "args": { "input_col": "extraction_date" } } ] }, { "spec_id": "example_transform", "input_id": "example_bronze", "transformers": [ { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": 
".*WE_SO_SCL_(\\d+).csv" } }, { "function": "incremental_filter", "args": { "input_col": "extraction_date", "increment_df": "max_sales_silver_timestamp" } }, { "function": "with_auto_increment_id" } ] } ], "output_specs": [ { "spec_id": "example_output", "input_id": "example_transform", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/insert_column_set/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "insert_predicate": "new.event in ('shipped','cancelled')", "insert_column_set": {"salesorder": "new.salesorder", "item": "new.item", "event": "new.event","changed_on": "new.changed_on", "amount": "new.amount + 101", "lhe_row_id": "new.lhe_row_id", "extraction_date": "new.extraction_date"}, "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/insert_column_set/batch_init.json ================================================ { "input_specs": [ { "spec_id": "example_input", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/insert_column_set/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/insert_column_set/data" } ], "transform_specs": [ { "spec_id": "example_transform", "input_id": "example_input", "transformers": [ { "function": "with_auto_increment_id" }, { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } } ] } ], "output_specs": [ { "spec_id": "example_bronze", "input_id": "example_transform", "write_type": 
"overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/insert_column_set/data" } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/insert_column_set/data/control/batch.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount|lhe_row_id|extraction_date 1|1|shipped|20200811|20160601|customer1|article1|150|2|202108111500000000 1|3|created|20200811|20160601|customer1|article3|50|2|202108111400000000 2|1|created|20200811|20170215|customer2|article4|10|3|202108111400000000 3|1|created|20200811|20170215|customer1|article5|200|6|202108111400000000 4|2|created|20200811|20170430|customer3|article7|70|10|202108111400000000 4|3|created|20200811|20170430|customer3|article1|30|11|202108111400000000 5|1|created|20200811|20170510|customer4|article6|150|13|202108111400000000 5|2|created|20200811|20170510|customer4|article3|100|14|202108111400000000 5|3|created|20200811|20170510|customer4|article5|80|15|202108111400000000 6|1|created|20200811|20170601|customer2|article4|100|16|202108111400000000 6|2|created|20200811|20170601|customer2|article1|50|17|202108111400000000 7|1|cancelled|20200811||||221|1|202108111500000000 1|2|created|20200811|20160601|customer1|article2|200|1|202108111400000000 2|2|released|20200811|20170215|customer2|article2|50|3|202108111500000000 2|3|created|20200811|20170215|customer2|article1|30|5|202108111400000000 3|2|released|20200811|20170215|customer1|article2|120|4|202108111500000000 3|3|released|20200811|20170215|customer1|article4|90|5|202108111500000000 4|1|created|20200811|20170430|customer3|article3|80|9|202108111400000000 4|4|created|20200811|20170430|customer3|article2|50|12|202108111400000000 6|3|created|20200811|20170601|customer2|article2|90|18|202108111400000000 ================================================ FILE: 
tests/resources/feature/delta_load/merge_options/insert_column_set/data/source/WE_SO_SCL_202108111400000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 1|1|created|20200811|20160601|customer1|article1|100 1|2|created|20200811|20160601|customer1|article2|200 1|3|created|20200811|20160601|customer1|article3|50 2|1|created|20200811|20170215|customer2|article4|10 2|2|created|20200811|20170215|customer2|article6|50 2|3|created|20200811|20170215|customer2|article1|30 3|1|created|20200811|20170215|customer1|article5|200 3|2|created|20200811|20170215|customer1|article2|120 3|3|created|20200811|20170215|customer1|article4|90 4|1|created|20200811|20170430|customer3|article3|80 4|2|created|20200811|20170430|customer3|article7|70 4|3|created|20200811|20170430|customer3|article1|30 4|4|created|20200811|20170430|customer3|article2|50 5|1|created|20200811|20170510|customer4|article6|150 5|2|created|20200811|20170510|customer4|article3|100 5|3|created|20200811|20170510|customer4|article5|80 6|1|created|20200811|20170601|customer2|article4|100 6|2|created|20200811|20170601|customer2|article1|50 6|3|created|20200811|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/merge_options/insert_column_set/data/source/WE_SO_SCL_202108111500000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 7|1|created|20200811|20180110|customer5|article2|120 7|1|cancelled|20200811|20180110|customer5|article2|120 1|1|shipped|20200811|20160601|customer1|article1|150 2|2|released|20200811|20170215|customer2|article2|50 3|2|released|20200811|20170215|customer1|article2|120 3|3|released|20200811|20170215|customer1|article4|90 ================================================ FILE: tests/resources/feature/delta_load/merge_options/source_schema.json ================================================ { "type": 
"struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "event", "type": "string", "nullable": true, "metadata": {} }, { "name": "changed_on", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_all/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "example_bronze", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/update_all/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/update_all/data" }, { "spec_id": "example_silver", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/update_all/data" } ], "transform_specs": [ { "spec_id": "max_sales_silver_timestamp", "input_id": "example_silver", "transformers": [ { "function": "get_max_value", "args": { "input_col": "extraction_date" } } ] }, { "spec_id": "example_transform", "input_id": "example_bronze", "transformers": [ { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } }, { "function": "incremental_filter", "args": { "input_col": "extraction_date", "increment_df": "max_sales_silver_timestamp" } }, { "function": 
"with_auto_increment_id" } ] } ], "output_specs": [ { "spec_id": "example_output", "input_id": "example_transform", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/update_all/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on", "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_all/batch_init.json ================================================ { "input_specs": [ { "spec_id": "example_input", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/update_all/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/update_all/data" } ], "transform_specs": [ { "spec_id": "example_transform", "input_id": "example_input", "transformers": [ { "function": "with_auto_increment_id" }, { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } } ] } ], "output_specs": [ { "spec_id": "example_bronze", "input_id": "example_transform", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/update_all/data" } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_all/data/control/batch.csv ================================================ 
salesorder|item|event|changed_on|date|customer|article|amount|lhe_row_id|extraction_date 1|1|shipped|20200811|20160601|customer1|article1|150|2|202108111500000000 1|3|created|20200811|20160601|customer1|article3|50|2|202108111400000000 2|1|created|20200811|20170215|customer2|article4|10|3|202108111400000000 3|1|created|20200811|20170215|customer1|article5|200|6|202108111400000000 4|2|created|20200811|20170430|customer3|article7|70|10|202108111400000000 4|3|created|20200811|20170430|customer3|article1|30|11|202108111400000000 5|1|created|20200811|20170510|customer4|article6|150|13|202108111400000000 5|2|created|20200811|20170510|customer4|article3|100|14|202108111400000000 5|3|created|20200811|20170510|customer4|article5|80|15|202108111400000000 6|1|created|20200811|20170601|customer2|article4|100|16|202108111400000000 6|2|created|20200811|20170601|customer2|article1|50|17|202108111400000000 7|1|created|20200811|20180110|customer5|article2|120|0|202108111500000000 7|1|cancelled|20200811|20180110|customer5|article2|120|1|202108111500000000 1|2|created|20200811|20160601|customer1|article2|200|1|202108111400000000 2|2|released|20200811|20170215|customer2|article2|50|3|202108111500000000 2|3|created|20200811|20170215|customer2|article1|30|5|202108111400000000 3|2|released|20200811|20170215|customer1|article2|120|4|202108111500000000 3|3|released|20200811|20170215|customer1|article4|90|5|202108111500000000 4|1|created|20200811|20170430|customer3|article3|80|9|202108111400000000 4|4|created|20200811|20170430|customer3|article2|50|12|202108111400000000 6|3|created|20200811|20170601|customer2|article2|90|18|202108111400000000 ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_all/data/source/WE_SO_SCL_202108111400000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 1|1|created|20200811|20160601|customer1|article1|100 
1|2|created|20200811|20160601|customer1|article2|200 1|3|created|20200811|20160601|customer1|article3|50 2|1|created|20200811|20170215|customer2|article4|10 2|2|created|20200811|20170215|customer2|article6|50 2|3|created|20200811|20170215|customer2|article1|30 3|1|created|20200811|20170215|customer1|article5|200 3|2|created|20200811|20170215|customer1|article2|120 3|3|created|20200811|20170215|customer1|article4|90 4|1|created|20200811|20170430|customer3|article3|80 4|2|created|20200811|20170430|customer3|article7|70 4|3|created|20200811|20170430|customer3|article1|30 4|4|created|20200811|20170430|customer3|article2|50 5|1|created|20200811|20170510|customer4|article6|150 5|2|created|20200811|20170510|customer4|article3|100 5|3|created|20200811|20170510|customer4|article5|80 6|1|created|20200811|20170601|customer2|article4|100 6|2|created|20200811|20170601|customer2|article1|50 6|3|created|20200811|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_all/data/source/WE_SO_SCL_202108111500000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 7|1|created|20200811|20180110|customer5|article2|120 7|1|cancelled|20200811|20180110|customer5|article2|120 1|1|shipped|20200811|20160601|customer1|article1|150 2|2|released|20200811|20170215|customer2|article2|50 3|2|released|20200811|20170215|customer1|article2|120 3|3|released|20200811|20170215|customer1|article4|90 ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_column_set/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "example_bronze", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/update_column_set/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", 
"header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/update_column_set/data" }, { "spec_id": "example_silver", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/update_column_set/data" } ], "transform_specs": [ { "spec_id": "max_sales_silver_timestamp", "input_id": "example_silver", "transformers": [ { "function": "get_max_value", "args": { "input_col": "extraction_date" } } ] }, { "spec_id": "example_transform", "input_id": "example_bronze", "transformers": [ { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } }, { "function": "incremental_filter", "args": { "input_col": "extraction_date", "increment_df": "max_sales_silver_timestamp" } }, { "function": "with_auto_increment_id" } ] } ], "output_specs": [ { "spec_id": "example_output", "input_id": "example_transform", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/update_column_set/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item", "update_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on", "update_column_set": {"event": "current.event", "lhe_row_id": "new.lhe_row_id + 100" }, "delete_predicate": "new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'" } } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_column_set/batch_init.json ================================================ { "input_specs": [ { "spec_id": "example_input", "read_type": "batch", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/delta_load/merge_options/update_column_set/source_schema.json", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/merge_options/update_column_set/data" } ], "transform_specs": [ { "spec_id": "example_transform", "input_id": "example_input", "transformers": [ { "function": "with_auto_increment_id" }, { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } } ] } ], "output_specs": [ { "spec_id": "example_bronze", "input_id": "example_transform", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/merge_options/update_column_set/data" } ] } ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_column_set/data/control/batch.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount|lhe_row_id|extraction_date 1|2|created|20200811|20160601|customer1|article2|200|1|202108111400000000 2|2|created|20200811|20170215|customer2|article6|50|103|202108111400000000 2|3|created|20200811|20170215|customer2|article1|30|5|202108111400000000 3|2|created|20200811|20170215|customer1|article2|120|104|202108111400000000 3|3|created|20200811|20170215|customer1|article4|90|105|202108111400000000 4|1|created|20200811|20170430|customer3|article3|80|9|202108111400000000 4|4|created|20200811|20170430|customer3|article2|50|12|202108111400000000 6|3|created|20200811|20170601|customer2|article2|90|18|202108111400000000 1|1|created|20200811|20160601|customer1|article1|100|102|202108111400000000 1|3|created|20200811|20160601|customer1|article3|50|2|202108111400000000 2|1|created|20200811|20170215|customer2|article4|10|3|202108111400000000 
3|1|created|20200811|20170215|customer1|article5|200|6|202108111400000000 4|2|created|20200811|20170430|customer3|article7|70|10|202108111400000000 4|3|created|20200811|20170430|customer3|article1|30|11|202108111400000000 5|1|created|20200811|20170510|customer4|article6|150|13|202108111400000000 5|2|created|20200811|20170510|customer4|article3|100|14|202108111400000000 5|3|created|20200811|20170510|customer4|article5|80|15|202108111400000000 6|1|created|20200811|20170601|customer2|article4|100|16|202108111400000000 6|2|created|20200811|20170601|customer2|article1|50|17|202108111400000000 7|1|created|20200811|20180110|customer5|article2|120|0|202108111500000000 7|1|cancelled|20200811|20180110|customer5|article2|120|1|202108111500000000 ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_column_set/data/source/WE_SO_SCL_202108111400000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 1|1|created|20200811|20160601|customer1|article1|100 1|2|created|20200811|20160601|customer1|article2|200 1|3|created|20200811|20160601|customer1|article3|50 2|1|created|20200811|20170215|customer2|article4|10 2|2|created|20200811|20170215|customer2|article6|50 2|3|created|20200811|20170215|customer2|article1|30 3|1|created|20200811|20170215|customer1|article5|200 3|2|created|20200811|20170215|customer1|article2|120 3|3|created|20200811|20170215|customer1|article4|90 4|1|created|20200811|20170430|customer3|article3|80 4|2|created|20200811|20170430|customer3|article7|70 4|3|created|20200811|20170430|customer3|article1|30 4|4|created|20200811|20170430|customer3|article2|50 5|1|created|20200811|20170510|customer4|article6|150 5|2|created|20200811|20170510|customer4|article3|100 5|3|created|20200811|20170510|customer4|article5|80 6|1|created|20200811|20170601|customer2|article4|100 6|2|created|20200811|20170601|customer2|article1|50 
6|3|created|20200811|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/merge_options/update_column_set/data/source/WE_SO_SCL_202108111500000000.csv ================================================ salesorder|item|event|changed_on|date|customer|article|amount 7|1|created|20200811|20180110|customer5|article2|120 7|1|cancelled|20200811|20180110|customer5|article2|120 1|1|shipped|20200811|20160601|customer1|article1|150 2|2|released|20200811|20170215|customer2|article2|50 3|2|released|20200811|20170215|customer1|article2|120 3|3|released|20200811|20170215|customer1|article4|90 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_backfill.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_value": "20180110120052t", "greater_or_equal": true } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], 
"output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp" } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": 
"file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/data/control/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|1500 
20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|2000 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|500 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|100 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|500 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|300 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|2000 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|700 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|400 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|700 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|1500 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|1000 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|800 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|1000 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|500 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|900 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|1200 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/data/source/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|00000000000000t|0|0|0|0|1|1|N|20160601|customer1|article1|1000 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|2000 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|500 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|2|2|N|20170215|customer2|article6|500 
20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|300 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|2000 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|1200 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|900 20211227175200t|00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|800 20211227175200t|00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|700 20211227175200t|00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|300 20211227175200t|00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|500 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|1500 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|1000 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|800 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|1000 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|500 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/data/source/part-02.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20211227175200t|20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 
20211227175200t|20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20211227175200t|20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/data/source/part-03.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20211227175200t|20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20211227175200t|20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20211227175200t|20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/data/source/part-04.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/backfill/data/source/part-05.csv 
================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|1200 20211227175200t|20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|1000 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|1500 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|500 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|500 20211227175200t|20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|1200 20211227175200t|20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-900 20211227175200t|20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|800 20211227175200t|20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|1000 20211227175200t|20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|700 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|800 20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|300 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|500 20211227175200t|20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|600 20211227175200t|20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|600 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|700 20211227175200t|20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|1000 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|700 20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|800 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|400 ================================================ FILE: 
tests/resources/feature/delta_load/record_mode_cdc/direct_silver_load/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/direct_silver_load/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/direct_silver_load/bronze/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp", "greater_or_equal": true } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/direct_silver_load/bronze/data", "merge_opts": { "merge_predicate": "current.actrequest_timestamp = new.actrequest_timestamp and current.datapakid = new.datapakid and current.partno = new.partno and current.record = new.record and current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } }, { "spec_id": "sales_silver", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": 
"file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/direct_silver_load/silver/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/direct_silver_load/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/direct_silver_load/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/direct_silver_load/bronze/data", "merge_opts": { "merge_predicate": "current.actrequest_timestamp = new.actrequest_timestamp and current.datapakid = new.datapakid and current.partno = new.partno and current.record = new.record and current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } }, { "spec_id": "sales_silver", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/direct_silver_load/silver/data", "merge_opts": 
{ "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/direct_silver_load/data/control/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|00000000000000t|0|0|0|0|1|1|N|20160601|customer1|article1|100 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|00000000000000t|0|0|0|0|2|2|N|20170215|customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 20211227175200t|00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 20211227175200t|00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 20211227175200t|00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 20211227175200t|00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 
20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20211227175200t|20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20211227175200t|20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20211227175200t|20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 20211227175200t|20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20211227175200t|20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20211227175200t|20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20211227175200t|20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/direct_silver_load/data/control/part-02.csv ================================================ 
extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/direct_silver_load/data/source/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|00000000000000t|0|0|0|0|1|1|N|20160601|customer1|article1|100 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 
20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|00000000000000t|0|0|0|0|2|2|N|20170215|customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 20211227175200t|00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 20211227175200t|00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 20211227175200t|00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 20211227175200t|00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/direct_silver_load/data/source/part-02.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20211227175200t|20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 
20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20211227175200t|20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20211227175200t|20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/direct_silver_load/data/source/part-03.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20211227175200t|20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20211227175200t|20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20211227175200t|20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/direct_silver_load/data/source/part-04.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 
20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/late_arriving_changes/batch/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/batch/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp", "greater_or_equal": true } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/batch/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "update_predicate": "new.extraction_timestamp > current.extraction_timestamp or new.actrequest_timestamp > current.actrequest_timestamp or ( 
new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid > current.datapakid) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno > current.partno) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno = current.partno and new.record >= current.record)", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/late_arriving_changes/batch/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/batch/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/data/control/part-01.csv ================================================ 
extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/data/source/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|00000000000000t|0|0|0|0|1|1|N|20160601|customer1|article1|100 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 
20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|00000000000000t|0|0|0|0|2|2|N|20170215|customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 20211227175200t|00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 20211227175200t|00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 20211227175200t|00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 20211227175200t|00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/data/source/part-02.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20211227175200t|20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 
20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20211227175200t|20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20211227175200t|20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/data/source/part-03.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20211227175200t|20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20211227175200t|20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20211227175200t|20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/data/source/part-04.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 
20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data" } ], "transform_specs": [ { "spec_id": "transformed_sales_source", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "transformed_sales_source", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/checkpoint" }, "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "update_predicate": "new.extraction_timestamp > current.extraction_timestamp or new.actrequest_timestamp > current.actrequest_timestamp or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid > current.datapakid) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno > current.partno) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno = current.partno and new.record 
>= current.record)", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/out_of_order_changes/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/out_of_order_changes/batch/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/out_of_order_changes/batch/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp", "greater_or_equal": true } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/out_of_order_changes/batch/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", 
"update_predicate": "new.extraction_timestamp > current.extraction_timestamp or new.actrequest_timestamp > current.actrequest_timestamp or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid > current.datapakid) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno > current.partno) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno = current.partno and new.record >= current.record)", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/out_of_order_changes/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/out_of_order_changes/batch/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/out_of_order_changes/batch/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ] } ================================================ FILE: 
tests/resources/feature/delta_load/record_mode_cdc/out_of_order_changes/data/control/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20211227175200t|20180110120052t|request1|3|1|1|4|4||20170430|customer3|article2|70 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/out_of_order_changes/data/source/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 
20211227175200t|00000000000000t|0|0|0|0|1|1|N|20160601|customer1|article1|100 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|00000000000000t|0|0|0|0|2|2|N|20170215|customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 20211227175200t|00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 20211227175200t|00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 20211227175200t|00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 20211227175200t|00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/out_of_order_changes/data/source/part-02.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20211227175200t|20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 
20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20211227175200t|20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20211227175200t|20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/out_of_order_changes/data/source/part-03.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20211227175200t|20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20211227175200t|20180110120052t|request1|2|1|14|4|4||20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/out_of_order_changes/data/source/part-04.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|3|1|1|4|4||20170430|customer3|article2|70 20211227175200t|20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 
20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/out_of_order_changes/streaming_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/out_of_order_changes/streaming/data" } ], "transform_specs": [ { "spec_id": "transformed_sales_source", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "transformed_sales_source", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/out_of_order_changes/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/out_of_order_changes/streaming/checkpoint" }, "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "update_predicate": "new.extraction_timestamp > current.extraction_timestamp or new.actrequest_timestamp > current.actrequest_timestamp or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid > current.datapakid) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno > current.partno) or ( new.actrequest_timestamp = current.actrequest_timestamp and 
new.datapakid = current.datapakid and new.partno = current.partno and new.record >= current.record)", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp" } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data", "merge_opts": { "merge_predicate": "current.salesorder = 
new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "ranking_key_asc": [ "recordmode" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data/control/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 
20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data/source/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 20211227175200t|00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 
20211227175200t|00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 20211227175200t|00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 20211227175200t|00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 20211227175200t|00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 20211227175200t|00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data/source/part-02.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount|discount|uninteresting_column 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|0.0|10.0 20211227175200t|20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|10.0| 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|10.0| 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|10.0| 
20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|10.0| 20211227175200t|20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|10.0| 20211227175200t|20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|10.0| 20211227175200t|20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|10.0| ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data/source/part-03.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount|discount|uninteresting_column 20211227175200t|20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100|10.0| 20211227175200t|20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70|10.0| 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80|10.0| 20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30|10.0| 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50|10.0| 20211227175200t|20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60|10.0| 20211227175200t|20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60|10.0| ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_deletes_additional_columns/data/source/part-04.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount|discount|uninteresting_column 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70|10.0| 20211227175200t|20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100|10.0| 
20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70|10.0| 20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80|10.0| 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40|10.0| ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_duplicates/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/with_duplicates/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_duplicates/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp" } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_duplicates/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode 
in ('R','D','X')" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_duplicates/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/with_duplicates/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "ranking_key_asc": [ "recordmode" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_duplicates/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_duplicates/data/control/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 
20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_duplicates/data/source/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|00000000000000t|0|0|0|0|1|1|N||customer1|article1|100 20211227175200t|00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 
20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 20211227175200t|00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 20211227175200t|00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 20211227175200t|00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 20211227175200t|00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|2|N||customer2|article6|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 20211227175200t|00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80 20211227175200t|00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70 20211227175200t|00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30 20211227175200t|00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 
20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_duplicates/data/source/part-02.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 20211227175200t|20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120 20211227175200t|20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90 20211227175200t|20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80 20211227175200t|20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_duplicates/data/source/part-03.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|9|4|1||20170430|customer3|article3|100 20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20211227175200t|20180110120052t|request1|1|1|10|4|2|X|20170430|customer3|article7|70 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 
20211227175200t|20180110120052t|request1|1|1|12|4|3|D|20170430|customer3|article1|30 20211227175200t|20180110120052t|request1|1|1|13|4|4|X|20170430|customer3|article2|50 20211227175200t|20180110120052t|request1|1|1|14|4|4||20170430|customer3|article2|60 20211227175200t|20180110120052t|request1|2|1|1|4|4|X|20170430|customer3|article2|60 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_duplicates/data/source/part-04.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3|article2|70 20211227175200t|20180110130103t|request2|1|1|3|4|1|X|20170430|customer3|article3|100 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 20211227175200t|20180110130103t|request2|1|1|6|4|3|N|20170430|customer3|article1|40 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3|article3|70 20211227175200t|20180110130103t|request2|1|1|5|4|2|D|20170430|customer3|article7|80 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/batch_delta.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "json", "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": 
"get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp" } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')", "insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "json", "location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "extraction_timestamp", "actrequest_timestamp", "datapakid", "partno", "record" ], "ranking_key_asc": [ "recordmode" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", 
"input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ] } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data/control/part-01.csv ================================================ extraction_timestamp|actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount 20211227175200t|20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150 20211227175200t|00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200 20211227175200t|00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50 20211227175200t|00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10 20211227175200t|20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50 20211227175200t|00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30 20211227175200t|00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200 20211227175200t|00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120 20211227175200t|00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90 20211227175200t|20180110130103t|request2|1|1|4|4|1||20170430|customer3||70 20211227175200t|20180110120052t|request1|1|1|11|4|2||20170430|customer3|article7|80 20211227175200t|20180110130103t|request2|1|1|6|4|3||20170430|customer3||40 20211227175200t|20180110120052t|request1|2|1|2|4|4||20170430|customer3||70 20211227175200t|00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150 20211227175200t|00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100 20211227175200t|00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80 
20211227175200t|00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100 20211227175200t|00000000000000t|0|0|0|0|6|2|N|20170601|customer2|article1|50 20211227175200t|00000000000000t|0|0|0|0|6|3|N|20170601|customer2|article2|90 20211227175200t|20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120 ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data/source/part-01.json ================================================ { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 1, "item": 1, "recordmode": "N", "date": "20160601", "customer": "customer1", "article": "article1", "amount": 100 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 1, "item": 2, "recordmode": "N", "date": "20160601", "customer": "customer1", "article": "article2", "amount": 200 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 1, "item": 3, "recordmode": "N", "date": "20160601", "customer": "customer1", "article": "article3", "amount": 50 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 2, "item": 1, "recordmode": "N", "date": "20170215", "customer": "customer2", "article": "article4", "amount": 10 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 2, "item": 2, "recordmode": "N", "date": "20170215", "customer": "customer2", "article": "article6", "amount": 50 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": 
"00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 2, "item": 3, "recordmode": "N", "date": "20170215", "customer": "customer2", "article": "article1", "amount": 30 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 3, "item": 1, "recordmode": "N", "date": "20170215", "customer": "customer1", "article": "article5", "amount": 200 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 3, "item": 2, "recordmode": "N", "date": "20170215", "customer": "customer1", "article": "article2", "amount": 120 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 3, "item": 3, "recordmode": "N", "date": "20170215", "customer": "customer1", "article": "article4", "amount": 90 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 4, "item": 1, "recordmode": "N", "date": "20170430", "customer": "customer3", "article": "article3", "amount": 80 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 4, "item": 2, "recordmode": "N", "date": "20170430", "customer": "customer3", "article": "article7", "amount": 70 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 4, "item": 3, "recordmode": "N", "date": "20170430", "customer": "customer3", "article": "article1", "amount": 30 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, 
"record": 0, "salesorder": 4, "item": 4, "recordmode": "N", "date": "20170430", "customer": "customer3", "article": "article2", "amount": 50 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 5, "item": 1, "recordmode": "N", "date": "20170510", "customer": "customer4", "article": "article6", "amount": 150 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 5, "item": 2, "recordmode": "N", "date": "20170510", "customer": "customer4", "article": "article3", "amount": 100 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 5, "item": 3, "recordmode": "N", "date": "20170510", "customer": "customer4", "article": "article5", "amount": 80 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 6, "item": 1, "recordmode": "N", "date": "20170601", "customer": "customer2", "article": "article4", "amount": 100 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 6, "item": 2, "recordmode": "N", "date": "20170601", "customer": "customer2", "article": "article1", "amount": 50 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "00000000000000t", "request": "0", "datapakid": 0, "partno": 0, "record": 0, "salesorder": 6, "item": 3, "recordmode": "N", "date": "20170601", "customer": "customer2", "article": "article2", "amount": 90 } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data/source/part-02.json 
================================================ { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 1, "salesorder": 7, "item": 1, "recordmode": "N", "date": "20180110", "customer": "customer5", "article": "article2", "amount": 120 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 2, "salesorder": 1, "item": 1, "recordmode": "X", "date": "20160601", "customer": "customer1", "article": "article1", "amount": 100 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 3, "salesorder": 1, "item": 1, "recordmode": null, "date": "20160601", "customer": "customer1", "article": "article1", "amount": 150 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 4, "salesorder": 2, "item": 2, "recordmode": "X", "date": "20170215", "customer": "customer2", "article": "article6", "amount": 50 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 5, "salesorder": 2, "item": 2, "recordmode": null, "date": "20170215", "customer": "customer2", "article": "article2", "amount": 50 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 8, "salesorder": 4, "item": 1, "recordmode": "X", "date": "20170430", "customer": "customer3", "article": "article3", "amount": 80 } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data/source/part-03.json ================================================ { "extraction_timestamp": 
"20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 9, "salesorder": 4, "item": 1, "recordmode": null, "date": "20170430", "customer": "customer3", "article": "article3", "amount": 100 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 10, "salesorder": 4, "item": 2, "recordmode": "X", "date": "20170430", "customer": "customer3", "article": "article7", "amount": 70 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 11, "salesorder": 4, "item": 2, "recordmode": null, "date": "20170430", "customer": "customer3", "article": "article7", "amount": 80 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 13, "salesorder": 4, "item": 4, "recordmode": "X", "date": "20170430", "customer": "customer3", "article": "article2", "amount": 50 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 1, "partno": 1, "record": 14, "salesorder": 4, "item": 4, "recordmode": null, "date": "20170430", "customer": "customer3", "article": "article2", "amount": 60 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": "request1", "datapakid": 2, "partno": 1, "record": 1, "salesorder": 4, "item": 4, "recordmode": "X", "date": "20170430", "customer": "customer3", "article": "article2", "amount": 60 } ================================================ FILE: tests/resources/feature/delta_load/record_mode_cdc/with_upserts_only_removed_columns/data/source/part-04.json ================================================ { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110120052t", "request": 
"request1", "datapakid": 2, "partno": 1, "record": 2, "salesorder": 4, "item": 4, "recordmode": null, "date": "20170430", "customer": "customer3", "amount": 70 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110130103t", "request": "request2", "datapakid": 1, "partno": 1, "record": 3, "salesorder": 4, "item": 1, "recordmode": "X", "date": "20170430", "customer": "customer3", "amount": 100 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110130103t", "request": "request2", "datapakid": 1, "partno": 1, "record": 4, "salesorder": 4, "item": 1, "recordmode": null, "date": "20170430", "customer": "customer3", "amount": 70 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110130103t", "request": "request2", "datapakid": 1, "partno": 1, "record": 5, "salesorder": 4, "item": 3, "recordmode": "X", "date": "20170430", "customer": "customer3", "amount": 30 } { "extraction_timestamp": "20211227175200t", "actrequest_timestamp": "20180110130103t", "request": "request2", "datapakid": 1, "partno": 1, "record": 6, "salesorder": 4, "item": 3, "recordmode": null, "date": "20170430", "customer": "customer3", "amount": 40 } ================================================ FILE: tests/resources/feature/dq_validator/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/dq_validator/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": 
"amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "db_table": "test_db.dq_sales", "location": "file:///app/tests/lakehouse/out/feature/dq_validator/data" } ] } ================================================ FILE: tests/resources/feature/dq_validator/data/control/data_restore_control.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 ================================================ FILE: tests/resources/feature/dq_validator/data/control/dq_control_failure.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|false|dq_sales|sales_source ================================================ FILE: tests/resources/feature/dq_validator/data/control/dq_control_failure_disabled.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|false|dq_sales|sales_source ================================================ FILE: tests/resources/feature/dq_validator/data/control/dq_control_success.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 
2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source ================================================ FILE: tests/resources/feature/dq_validator/data/control/dq_control_success_explode.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|true|dq_sales|sales_source ================================================ FILE: tests/resources/feature/dq_validator/data/control/dq_control_success_explode_disabled.csv ================================================ checkpoint_config|run_id|run_results|success|spec_id|input_id checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|false|dq_sales|sales_source checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|false|dq_sales|sales_source checkpoint configs|{20220729-143444-dq_sales-sales_source-checkpoint, 2022-07-29T14:34:44.852796+00:00}|run_results_for_all_expectations|false|dq_sales|sales_source ================================================ FILE: tests/resources/feature/dq_validator/data/dq_functions/test_db.dq_functions_source_table_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} 
rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|amount|{"min_value": 3, "max_value": 11} rule_6|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/data/dq_functions/test_db.dq_functions_source_table_success.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 ================================================ FILE: tests/resources/feature/dq_validator/data/source/part-02.csv ================================================ 
salesorder|item|date|customer|article|amount 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/dq_validator/dq_sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/dq_validator/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/dq_validator/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "string", "nullable": true, "metadata": {} }, { "name": "item", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "string", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", 
"write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/dq_validator/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/dq_validator/checkpoint" } } ], "exec_env": { "spark.sql.streaming.schemaInference": true } } ================================================ FILE: tests/resources/feature/dq_validator/streaming_dataframe_two_runs/data/dq_functions/test_db.dq_functions_streaming_dataframe_two_runs_first_run.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} ================================================ FILE: tests/resources/feature/dq_validator/streaming_dataframe_two_runs/data/dq_functions/test_db.dq_functions_streaming_dataframe_two_runs_second_run.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} ================================================ FILE: tests/resources/feature/dq_validator/table_batch_dataframe_failure_disabled/data/dq_functions/test_db.dq_functions_source_table_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|amount|{"min_value": 3, "max_value": 
11} rule_6|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_batch_dataframe_failure_disabled/data/dq_functions/test_db.dq_functions_source_table_success.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_batch_dataframe_success/data/dq_functions/test_db.dq_functions_source_table_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|amount|{"min_value": 3, "max_value": 11} rule_6|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ 
FILE: tests/resources/feature/dq_validator/table_batch_dataframe_success/data/dq_functions/test_db.dq_functions_source_table_success.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_batch_dq_rule/data/dq_functions/test_db.dq_table_rule_id_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_3|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "article", "min_value": 3} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_4|expect_wrong_expectation|at_rest|test_db|dummy_invoice|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "article", "column_B": "amount"} ================================================ FILE: 
tests/resources/feature/dq_validator/table_batch_dq_rule/data/dq_functions/test_db.dq_table_rule_id_success.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_batch_failure_disabled/data/dq_functions/test_db.dq_functions_source_table_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|amount|{"min_value": 3, "max_value": 11} rule_6|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_batch_failure_disabled/data/dq_functions/test_db.dq_functions_source_table_success.csv ================================================ 
dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_batch_success/data/dq_functions/test_db.dq_functions_source_table_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|amount|{"min_value": 3, "max_value": 11} rule_6|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_batch_success/data/dq_functions/test_db.dq_functions_source_table_success.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", 
"min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_streaming_dq_rule/data/dq_functions/test_db.dq_table_rule_id_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article", "min_value": 0} rule_3|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "article", "min_value": 3} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|dummy_invoice|amount|{"min_value": 3, "max_value": 11} rule_5|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "article", "column_B": "amount"} ================================================ FILE: tests/resources/feature/dq_validator/table_streaming_dq_rule/data/dq_functions/test_db.dq_table_rule_id_success.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} 
rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article", "min_value": 0} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_streaming_failure_disabled/data/dq_functions/test_db.dq_functions_source_table_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|amount|{"min_value": 3, "max_value": 11} rule_6|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_streaming_failure_disabled/data/dq_functions/test_db.dq_functions_source_table_success.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} 
rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_streaming_success/data/dq_functions/test_db.dq_functions_source_table_failure.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_table_row_count_to_be_between|at_rest|test_db|dummy_sales|amount|{"min_value": 3, "max_value": 11} rule_6|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/dq_validator/table_streaming_success/data/dq_functions/test_db.dq_functions_source_table_success.csv ================================================ dq_rule_id|dq_tech_function|execution_point|schema|table|column|arguments rule_1|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_2|expect_column_min_to_be_between|in_motion|test_db|dummy_sales|amount|{"column": "amount", "min_value": 0} rule_3|expect_column_to_exist|at_rest|test_db|dummy_sales|amount|{"column": "article"} 
rule_4|expect_column_pair_a_to_be_smaller_or_equal_than_b|at_rest|test_db|dummy_sales|amount|{"column_A": "salesorder", "column_B": "amount"} rule_5|expect_wrong_expectation|at_rest|test_db|no_table|amount|{"min_value": 3, "max_value": 11} ================================================ FILE: tests/resources/feature/engine_usage_stats/dq_validator/data/control.json ================================================ {"acon": {"input_spec": {"spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": {"mode": "FAILFAST", "header": true, "delimiter": "|"}, "location": "/app/tests/lakehouse/in/feature/engine_usage_stats/dq_validator/data/"}, "dq_spec": {"spec_id": "dq_sales", "input_id": "sales_source", "dq_type": "validator", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/engine_usage_stats/dq", "result_sink_db_table": "test_db.dq_validator", "result_sink_format": "json", "result_sink_explode": false, "dq_functions": [{"function": "expect_column_to_exist", "args": {"column": "article"}}, {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 3, "max_value": 11}}, {"function": "expect_column_pair_a_to_be_smaller_or_equal_than_b", "args": {"column_A": "salesorder", "column_B": "amount"}}]}, "exec_env": {"dp_name": "dq_validator"}}, "dp_name": "dq_validator", "environment": "", "workspace_id": "", "job_id": "", "job_name": "", "run_id": "", "function": "execute_dq_validation", "engine_version": "1.17.0", "start_timestamp": "2024-01-03 15:05:58.808058", "year": 2024, "month": 1} ================================================ FILE: tests/resources/feature/engine_usage_stats/dq_validator/data/source.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: 
tests/resources/feature/engine_usage_stats/load_custom_transf_and_df/data/control.json ================================================ {"acon": {"input_specs": [{"spec_id": "sales_source", "read_type": "batch", "data_format": "dataframe", "df_name": "DataFrame[salesorder: int, item: int, date: int, customer: string, article: string, amount: int]"}], "transform_specs": [{"spec_id": "renamed_kpi", "input_id": "sales_source", "transformers": [{"function": "rename", "args": {"cols": {"salesorder": "salesorder1"}}}, {"function": "custom_transformation", "args": {"custom_transformer": ""}}]}], "output_specs": [{"spec_id": "sales_bronze", "input_id": "renamed_kpi", "write_type": "overwrite", "data_format": "delta", "location": "/app/tests/lakehouse/out/feature/engine_usage_stats/load_custom_transf_and_df/data/"}], "exec_env": {"dp_name": "load_custom_transf_and_df"}}, "dp_name": "load_custom_transf_and_df", "environment": "", "workspace_id": "", "job_id": "", "job_name": "", "run_id": "", "function": "load_data", "engine_version": "1.17.0", "start_timestamp": "2023-12-29 18:24:55.282039", "year": 2023, "month": 12} ================================================ FILE: tests/resources/feature/engine_usage_stats/load_custom_transf_and_df/data/source.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/engine_usage_stats/load_simple_acon/data/control.json ================================================ {"acon": {"input_specs": [{"spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": {"mode": "FAILFAST", "header": true, "delimiter": "|", "password": "******"}, "location": "/app/tests/lakehouse/in/feature/engine_usage_stats/load_simple_acon/data/"}], "transform_specs": [{"spec_id": "renamed_kpi", "input_id": 
"sales_source", "transformers": [{"function": "rename", "args": {"cols": {"salesorder": "salesorder1"}}}]}], "output_specs": [{"spec_id": "sales_bronze", "input_id": "renamed_kpi", "write_type": "overwrite", "data_format": "delta", "location": "/app/tests/lakehouse/out/feature/engine_usage_stats/load_simple_acon/data/"}], "exec_env": {"dp_name": "load_simple_acon"}}, "dp_name": "load_simple_acon", "environment": "", "workspace_id": "", "job_id": "", "job_name": "", "run_id": "", "function": "load_data", "engine_version": "1.17.0", "start_timestamp": "2023-12-29 22:43:27.654809", "year": 2023, "month": 12} ================================================ FILE: tests/resources/feature/engine_usage_stats/load_simple_acon/data/source.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/engine_usage_stats/table_manager/data/control.json ================================================ {"acon": {"function": "execute_sql", "sql": "select 1", "exec_env": {"dp_name": "table_manager"}}, "dp_name": "table_manager", "environment": "", "workspace_id": "", "job_id": "", "job_name": "", "run_id": "", "function": "manage_table", "engine_version": "1.17.0", "start_timestamp": "2024-01-03 00:00:00.000000", "year": 2024, "month": 1} ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/data/control/dummy_table.csv ================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20210812171010000000000|94|100|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20210812171010000000000|95|101|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20210812171010000000000|96|102|1|4|20160601|2016-06-01 
10:01:12.000|customer99||3000| 20210812181010000000000|97|103|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20210812181010000000000|98|104|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20210812181010000000000|99|105|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| 20211112171010000000000|1|1|2|1|20170215|2017-02-15 10:01:12.000|customer2|article4|1000| 20211112171010000000000|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| 20211112171010000000000|1|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| 20211112171010000000000|1|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| 20211112171010000000000|2|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| 20211112171010000000000|2|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| 20211112171010000000000|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| 20211112171010000000000|2|8|4|2|20170430|2017-04-30 10:01:12.000|customer3|article7|7000| 20211112171010000000000|3|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| 20211112171010000000000|3|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| 20211113121010000000000|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| 20211113121010000000000|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| 20211113121010000000000|1|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| 20211113121010000000000|1|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| 20211113121010000000000|1|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| 20211113121010000000000|2|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| 20211117111010000000000|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| 20211117111010000000000|3|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| 20211118111010000000000|3|9|6|2|20170603|2017-06-03 
01:01:01.000|customer6|article1|5010| 20211118111010000000000|4|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/data/control/dummy_table_join_condition.csv ================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20210812171010000000000|94|100|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20210812171010000000000|95|101|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20210812171010000000000|96|102|1|4|20160601|2016-06-01 10:01:12.000|customer99||3000| 20210812181010000000000|97|103|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20210812181010000000000|98|104|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20210812181010000000000|99|105|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/data/control/dummy_table_schema.json ================================================ { "type": "struct", "fields": [ { "name": "reqtsn", "type": "decimal(23,0)", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", 
"type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/data/source/dummy_table.csv ================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20210812171010000000000|94|100|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20210812171010000000000|95|101|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20210812171010000000000|96|102|1|4|20160601|2016-06-01 10:01:12.000|customer99||3000| 20210812181010000000000|97|103|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20210812181010000000000|98|104|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20210812181010000000000|99|105|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/data/source/dummy_table_1.csv ================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20210712171010000000000|1|1|3|1|20170510|2017-05-10 01:01:01.000|customer40|article60|15| 20211112171010000000000|1|1|2|1|20170215|2017-02-15 10:01:12.000|customer2|article4|1000| 20211112171010000000000|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| 20211112171010000000000|1|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| 20211112171010000000000|1|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| 20211112171010000000000|2|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| 20211112171010000000000|2|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| 20211112171010000000000|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| 20211112171010000000000|2|8|4|2|20170430|2017-04-30 10:01:12.000|customer3|article7|7000| 
20211112171010000000000|3|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| 20211112171010000000000|3|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/data/source/dummy_table_2.csv ================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20211113121010000000000|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| 20211113121010000000000|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| 20211113121010000000000|1|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| 20211113121010000000000|1|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| 20211113121010000000000|1|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| 20211113121010000000000|2|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| 20211117111010000000000|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| 20211117111010000000000|3|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| 20211118111010000000000|3|9|6|2|20170603|2017-06-03 01:01:01.000|customer6|article1|5010| 20211118111010000000000|4|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/data/source/rspmrequest.csv ================================================ request_tsn|storage|last_operation_type|last_process_tsn|last_time_stamp|records|records_read|records_updated|creation_end_time|uname|source|request_status|request_status_before_deletion|last_request_status|request_is_in_process|tlogo|datatarget|syst_date|syst_time|housekeeping_status 
20210712171010000000000|AQ|C|20210712171010000000000|20211006073103000116000|643705|0|0|20211006073103000116000|UNAME|SOURCE|GG|||N|ADSO|DUMMY_TABLE|20211006|073100|00 20210812181010000000000|AQ|C|20211006073059000008000|20211006073103000116000|643705|0|0|20211006073103000116000|UNAME|SOURCE|GG|||N|ADSO|DUMMY_TABLE|20211006|073100|00 20210912171010000000000|AQ|C|20211006073059000008000|20211006073103000116000|643705|0|0|20211006073103000116000|UNAME|SOURCE|GG|||N|ADSO|DUMMY_TABLE|20211006|073100|00 20211112171010000000000|AQ|C|20211206073059000008000|20211206073103000116000|643705|0|0|20211206073103000116000|UNAME|SOURCE|GG|||N|ADSO|DUMMY_TABLE|20211206|073100|00 20211113121010000000000|AQ|C|20211206073059000008000|20211206073103000116000|643705|0|0|20211206073103000116000|UNAME|SOURCE|GG|||N|ADSO|DUMMY_TABLE|20211206|073100|00 20211115111010000000000|AQ|D|20211020123121000011000|20211020123121000097000|381824|0|0|20211020113419000145000|UNAME|SOURCE|D|GG||N|ADSO|DUMMY_TABLE|20211020|113416|00 20211116111010000000000|CL|D|20211020123121000011000|20211020123121000097000|381824|0|0|20211020113419000145000|UNAME|SOURCE|D|GG||N|ADSO|DUMMY_TABLE|20211020|113416|00 20211117111010000000000|AQ|C|20211020123734000053000|20211020123735000009000|431528|0|0|20211020123240000008000|UNAME|SOURCE|GR|GR||N|ADSO|DUMMY_TABLE|20211020|123236|00 20211118111010000000000|AQ|C|20211020223734000053000|20211020223735000009000|431528|0|0|20211020223240000008000|UNAME|SOURCE|GR|GR||N|ADSO|DUMMY_TABLE|20211020|223236|00 20211118111010000000000|CL|D|20211020123734000053000|20211020123735000009000|431528|0|0|20211020123240000008000|UNAME|SOURCE|D|GG||N|ADSO|DUMMY_TABLE|20211020|123236|00 ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/dummy_table_schema.json ================================================ { "type": "struct", "fields": [ { "name": "reqtsn", "type": "decimal(23,0)", "nullable": true, "metadata": {} }, { "name": 
"datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_aq_dso/rspmrequest_schema.json ================================================ { "type": "struct", "fields": [ { "name": "request_tsn", "type": "decimal(23,0)", "nullable": true, "metadata": {} }, { "name": "storage", "type": "string", "nullable": true, "metadata": {} }, { "name": "last_operation_type", "type": "string", "nullable": true, "metadata": {} }, { "name": "last_process_tsn", "type": "string", "nullable": true, "metadata": {} }, { "name": "last_time_stamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "records", "type": "integer", "nullable": true, "metadata": {} }, { "name": "records_read", "type": "integer", "nullable": true, "metadata": {} }, { "name": "records_updated", "type": "integer", "nullable": true, "metadata": {} }, { "name": "creation_end_time", "type": "string", "nullable": true, "metadata": {} }, { "name": "uname", "type": "string", "nullable": true, "metadata": {} }, { "name": "source", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_status", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_status_before_deletion", "type": 
"string", "nullable": true, "metadata": {} }, { "name": "last_request_status", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_is_in_process", "type": "string", "nullable": true, "metadata": {} }, { "name": "tlogo", "type": "string", "nullable": true, "metadata": {} }, { "name": "datatarget", "type": "string", "nullable": true, "metadata": {} }, { "name": "syst_date", "type": "string", "nullable": true, "metadata": {} }, { "name": "syst_time", "type": "string", "nullable": true, "metadata": {} }, { "name": "housekeeping_status", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/data/control/dummy_table.csv ================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20210713151010000000000|0|0|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20210713151010000000000|0|0|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20210713151010000000000|0|0|1|4|20160601|2016-06-01 10:01:12.000|customer99||3000| 20210713151010000000000|0|0|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20210713151010000000000|0|0|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20210713151010000000000|0|0|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| 20211112171010000000000|1|1|2|1|20170215|2017-02-15 10:01:12.000|customer2|article4|1000| 20211112171010000000000|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| 20211112171010000000000|1|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| 20211112171010000000000|1|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| 20211112171010000000000|2|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| 20211112171010000000000|2|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| 
20211112171010000000000|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| 20211112171010000000000|2|8|4|2|20170430|2017-04-30 10:01:12.000|customer3|article7|7000| 20211112171010000000000|3|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| 20211112171010000000000|3|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| 20211113121010000000000|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| 20211113121010000000000|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| 20211113121010000000000|1|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| 20211113121010000000000|1|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| 20211113121010000000000|1|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| 20211113121010000000000|2|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| 20211117111010000000000|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| 20211117111010000000000|3|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| 20211118111010000000000|3|9|6|2|20170603|2017-06-03 01:01:01.000|customer6|article1|5010| 20211118111010000000000|4|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/data/control/dummy_table_join_condition.csv ================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20210713151010000000000|0|0|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20210713151010000000000|0|0|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20210713151010000000000|0|0|1|4|20160601|2016-06-01 10:01:12.000|customer99||3000| 20210713151010000000000|0|0|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20210713151010000000000|0|0|2|1|20160701|2016-07-01 
10:01:12.000|customer11|article33|500| 20210713151010000000000|0|0|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/data/control/dummy_table_schema.json ================================================ { "type": "struct", "fields": [ { "name": "reqtsn", "type": "decimal(23,0)", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/data/source/dummy_table.csv ================================================ salesorder|item|date|time|customer|/bic/article|amount|order_date 1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 1|4|20160601|2016-06-01 10:01:12.000|customer99||3000| 1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/data/source/dummy_table_cl_1.csv 
================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20210712171010000000000|1|1|3|1|20170510|2017-05-10 01:01:01.000|customer40|article60|15| 20211112171010000000000|1|1|2|1|20170215|2017-02-15 10:01:12.000|customer2|article4|1000| 20211112171010000000000|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| 20211112171010000000000|1|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| 20211112171010000000000|1|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| 20211112171010000000000|2|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| 20211112171010000000000|2|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| 20211112171010000000000|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| 20211112171010000000000|2|8|4|2|20170430|2017-04-30 10:01:12.000|customer3|article7|7000| 20211112171010000000000|3|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| 20211112171010000000000|3|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/data/source/dummy_table_cl_2.csv ================================================ reqtsn|datapakid|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20211113121010000000000|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| 20211113121010000000000|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| 20211113121010000000000|1|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| 20211113121010000000000|1|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| 20211113121010000000000|1|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| 20211113121010000000000|2|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| 
20211117111010000000000|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| 20211117111010000000000|3|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| 20211118111010000000000|3|9|6|2|20170603|2017-06-03 01:01:01.000|customer6|article1|5010| 20211118111010000000000|4|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/data/source/rspmrequest.csv ================================================ REQUEST_TSN|STORAGE|LAST_OPERATION_TYPE|LAST_PROCESS_TSN|LAST_TIME_STAMP|RECORDS|RECORDS_READ|RECORDS_UPDATED|CREATION_END_TIME|UNAME|SOURCE|REQUEST_STATUS|REQUEST_STATUS_BEFORE_DELETION|LAST_REQUEST_STATUS|REQUEST_IS_IN_PROCESS|TLOGO|DATATARGET|SYST_DATE|SYST_TIME|HOUSEKEEPING_STATUS 20210712171010000000000|AT|C|20211006073059000008000|20211006073103000116000|643705|0|0|20211006073103000116000|UNAME|SOURCE|GG|||N|ADSO|DUMMY_TABLE|20211006|073100|00 20211112171010000000000|AT|C|20211206073059000008000|20211206073103000116000|643705|0|0|20211206073103000116000|UNAME|SOURCE|GG|||N|ADSO|DUMMY_TABLE|20211206|073100|00 20211113121010000000000|AT|C|20211206073059000008000|20211206073103000116000|643705|0|0|20211206073103000116000|UNAME|SOURCE|GG|||N|ADSO|DUMMY_TABLE|20211206|073100|00 20211115111010000000000|AT|D|20211020123121000011000|20211020123121000097000|381824|0|0|20211020113419000145000|UNAME|SOURCE|D|GG||N|ADSO|DUMMY_TABLE|20211020|113416|00 20211116111010000000000|CL|D|20211020123121000011000|20211020123121000097000|381824|0|0|20211020113419000145000|UNAME|SOURCE|D|GG||N|ADSO|DUMMY_TABLE|20211020|113416|00 20211117111010000000000|AT|C|20211020123734000053000|20211020123735000009000|431528|0|0|20211020123240000008000|UNAME|SOURCE|GG|GG||N|ADSO|DUMMY_TABLE|20211020|123236|00 
20211118111010000000000|AT|C|20211020223734000053000|20211020223735000009000|431528|0|0|20211020223240000008000|UNAME|SOURCE|GG|GG||N|ADSO|DUMMY_TABLE|20211020|223236|00 20211118111010000000000|CL|D|20211020123734000053000|20211020123735000009000|431528|0|0|20211020123240000008000|UNAME|SOURCE|D|GG||N|ADSO|DUMMY_TABLE|20211020|123236|00 ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/dummy_table_cl_schema.json ================================================ { "type": "struct", "fields": [ { "name": "reqtsn", "type": "decimal(23,0)", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/dummy_table_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { 
"name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_b4/extract_cl_dso/rspmrequest_schema.json ================================================ { "type": "struct", "fields": [ { "name": "request_tsn", "type": "decimal(23,0)", "nullable": true, "metadata": {} }, { "name": "storage", "type": "string", "nullable": true, "metadata": {} }, { "name": "last_operation_type", "type": "string", "nullable": true, "metadata": {} }, { "name": "last_process_tsn", "type": "string", "nullable": true, "metadata": {} }, { "name": "last_time_stamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "records", "type": "integer", "nullable": true, "metadata": {} }, { "name": "records_read", "type": "integer", "nullable": true, "metadata": {} }, { "name": "records_updated", "type": "integer", "nullable": true, "metadata": {} }, { "name": "creation_end_time", "type": "string", "nullable": true, "metadata": {} }, { "name": "uname", "type": "string", "nullable": true, "metadata": {} }, { "name": "source", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_status", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_status_before_deletion", "type": "string", "nullable": true, "metadata": {} }, { "name": "last_request_status", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_is_in_process", "type": "string", "nullable": true, "metadata": {} }, { "name": "tlogo", "type": "string", "nullable": true, "metadata": {} }, { "name": "datatarget", "type": "string", "nullable": true, "metadata": {} }, { "name": "syst_date", "type": "string", "nullable": true, "metadata": {} }, { "name": "syst_time", "type": "string", "nullable": true, "metadata": {} 
}, { "name": "housekeeping_status", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/derive_changelog_table_name/RSBASIDOC_schema.json ================================================ { "type": "struct", "fields": [ { "name": "slogsys", "type": "string", "nullable": true, "metadata": {} }, { "name": "rlogsys", "type": "string", "nullable": true, "metadata": {} }, { "name": "tsprefix", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/derive_changelog_table_name/RSTSODS_schema.json ================================================ { "type": "struct", "fields": [ { "name": "odsname_tech", "type": "string", "nullable": true, "metadata": {} }, { "name": "odsname", "type": "string", "nullable": true, "metadata": {} }, { "name": "userapp", "type": "string", "nullable": true, "metadata": {} }, { "name": "version", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/derive_changelog_table_name/data/source/RSBASIDOC.csv ================================================ slogsys|rlogsys|tsprefix DHACLNT003|DHACLNT003|OA FFEWFEWCLN|FFEWFEWCLN|CA PHACLNT003|DHACLNT003|CA PHACLNT003|DHACLNT003|CB AHACLNT003|DHACLNT001|CT AHACLNT003|DHACLNT002|CD ================================================ FILE: tests/resources/feature/extract_from_sap_bw/derive_changelog_table_name/data/source/RSTSODS.csv ================================================ odsname_tech|odsname|userapp|version test_table_OA|8test_table_OA|CHANGELOG|000 testchartable_OA|8testchartable_OA|CHANGELOG|000 testrtable_OA|8testrtable_OA|CHANGELOG|000 test_test_table_OA|8test_test_table_OA|CHANGELOG|000 test_table_OA|8test_table_OA|CHANGELOG|001 test_table_OA|8test_table_OA|NOTCHANGELOG|000 
testtable_OA|8testtable_OA|CHANGELOG|000 testtable_OA|8testtable_OA|CHANGELOG|001 testtable_OA|8testtable_OA|NOTCHANGELOG|000 ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/data/control/dummy_table.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20211004151010|0|0|0|0|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20211004151010|0|0|0|0|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20211004151010|0|0|0|0|1|4|20160601|2016-06-01 10:01:12.000|customer99||3000| 20211004151010|0|0|0|0|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20211004151010|0|0|0|0|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20211004151010|0|0|0|0|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|1|1|1|2|1|20170215|2017-02-15 10:01:12.000|customer2|article4|1000| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|1|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|1|2|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|1|2|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|2|1|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|2|1|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|2|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|2|2|8|4|2|20170430|2017-04-30 10:01:12.000|customer3|article7|7000| 20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|3|1|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| 
20211104151010|ODSR_1C6Q7CHLJJ08WG131T491L1ZF|3|1|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| 20211112171010|ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| 20211112171010|ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| 20211112171010|ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|2|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| 20211112171010|ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|2|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| 20211112171010|ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|3|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| 20211112171010|ODSR_2C6Q7CHLJJ08WG131T491L1ZF|2|1|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| 20211113121010|ODSR_3C6Q7CHLJJ08WG131T491L1ZF|2|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| 20211113121010|ODSR_3C6Q7CHLJJ08WG131T491L1ZF|3|1|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| 20211114111010|ODSR_4C6Q7CHLJJ08WG131T491L1ZA|4|1|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| 20211114111010|ODSR_4C6Q7CHLJJ08WG131T491L1ZF|3|2|9|6|2|20170603|2017-06-03 01:01:01.000|customer6|article1|5010| 20211114111010|ODSR_4C6Q7CHLJJ08WG131T491L1ZF|4|1|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/data/control/dummy_table_join_condition.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20211004151010|0|0|0|0|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20211004151010|0|0|0|0|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20211004151010|0|0|0|0|1|4|20160601|2016-06-01 10:01:12.000|customer99||3000| 20211004151010|0|0|0|0|1|3|20160601|2016-06-01 
10:01:12.000|customer1|article3|500| 20211004151010|0|0|0|0|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20211004151010|0|0|0|0|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| 20211114111010|ODSR_4C6Q7CHLJJ08WG131T491L1ZA|4|1|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/data/control/dummy_table_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "decimal(15,0)", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "date", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/data/source/dummy_table.csv ================================================ salesorder|item|date|time|customer|/bic/article|amount|order_date 1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 1|4|20160601|2016-06-01 10:01:12.000|customer99||3000| 
1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/data/source/dummy_table_cl_1.csv ================================================ request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date DTPR_OLD_REQUEST_TO_IGNORE_444|1|1|1|3|1|20170510|2017-05-10 01:01:01.000|customer40|article60|15| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|1|1|1|2|1|20170215|2017-02-15 10:01:12.000|customer2|article4|1000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|1|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|1|2|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|1|2|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|2|1|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|2|1|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|2|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|2|2|8|4|2|20170430|2017-04-30 10:01:12.000|customer3|article7|7000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|3|1|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| ODSR_1C6Q7CHLJJ08WG131T491L1ZF|3|1|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| ODSR_1C6Q7CHLJJ08WG131T491L1ZA|3|1|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/data/source/dummy_table_cl_2.csv ================================================ request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 
ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|2|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|2|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| ODSR_2C6Q7CHLJJ08WG131T491L1ZF|1|3|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| ODSR_2C6Q7CHLJJ08WG131T491L1ZF|2|1|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| ODSR_3C6Q7CHLJJ08WG131T491L1ZF|2|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| ODSR_3C6Q7CHLJJ08WG131T491L1ZF|3|1|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| ODSR_4C6Q7CHLJJ08WG131T491L1ZF|3|2|9|6|2|20170603|2017-06-03 01:01:01.000|customer6|article1|5010| ODSR_4C6Q7CHLJJ08WG131T491L1ZF|4|1|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ODSR_4C6Q7CHLJJ08WG131T491L1ZA|4|1|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/data/source/rsodsactreq.csv ================================================ odsobject|request|datapakid|activate|sidconversion|actrequest|operation|status|paketsize|timestamp dummy_table|DTPR_OLD_REQUEST_TO_IGNORE_444|0|||DTPR_OLD_REQUEST_TO_IGNORE_444|A|0|0000020000|20211004151010 dummy_table|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|0|||ODSR_1C6Q7CHLJJ08WG131T491L1ZF|A|0|0000020000|20211104151010 dummy_table|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|0|||ODSR_2C6Q7CHLJJ08WG131T491L1ZF|A|0|0000020000|20211112171010 dummy_table|DTPR_F89Y1VBE6JO079PMFTL1X8BPY|0|||ODSR_3C6Q7CHLJJ08WG131T491L1ZF|A|0|0000020000|20211113121010 dummy_table|DTPR_F99Y1VBE6JO079PMFTL1X8BPY|0|||ODSR_4C6Q7CHLJJ08WG131T491L1ZF|A|0|0000020000|20211114111010 
dummy_table|ODSR_4C6Q7CHLJJ08WG131T491L1ZA|0|||ODSR_4C6Q7CHLJJ08WG131T491L1ZA|A|0|0000020000|20211114111010 ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/dummy_table_cl_schema.json ================================================ { "type": "struct", "fields": [ { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "date", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/dummy_table_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": 
"order_date", "type": "date", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_dso/rsodsactreq_schema.json ================================================ { "type": "struct", "fields": [ { "name": "odsobject", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "activate", "type": "string", "nullable": true, "metadata": {} }, { "name": "sidconversion", "type": "string", "nullable": true, "metadata": {} }, { "name": "actrequest", "type": "string", "nullable": true, "metadata": {} }, { "name": "operation", "type": "string", "nullable": true, "metadata": {} }, { "name": "status", "type": "string", "nullable": true, "metadata": {} }, { "name": "paketsize", "type": "string", "nullable": true, "metadata": {} }, { "name": "timestamp", "type": "decimal(15,0)", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/data/control/dummy_table.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20211004151010|DTPR_INIT_REQUEST_123|1|1|1|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20211004151010|DTPR_INIT_REQUEST_123|1|1|3|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20211004151010|DTPR_INIT_REQUEST_123|1|1|3|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20211004151010|DTPR_INIT_REQUEST_123|2|2|1|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20211004151010|DTPR_INIT_REQUEST_123|2|3|1|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|1|1|2|1|20170215|2017-02-15 
10:01:12.000|customer2|article4|1000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|2|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|2|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|1|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|1|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|2|8|4|2|20170430|2017-04-30 10:01:12.000|customer3|article7|7000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|3|1|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|3|1|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|2|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|2|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|3|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|2|1|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| 20211113121010|DTPR_F89Y1VBE6JO079PMFTL1X8BPY|2|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| 20211113121010|DTPR_F89Y1VBE6JO079PMFTL1X8BPY|3|1|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| 
20211114111010|DTPR_F99Y1VBE6JO079PMFTL1X8BPY|3|2|9|6|2|20170603|2017-06-03 01:01:01.000|customer6|article1|5010| 20211114111010|DTPR_F99Y1VBE6JO079PMFTL1X8BPY|4|1|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/data/control/dummy_table_actreq_timestamp.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20211003161010|DTPR_INIT_REQUEST_123|1|1|1|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20211003161010|DTPR_INIT_REQUEST_123|1|1|3|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 20211003161010|DTPR_INIT_REQUEST_123|1|1|3|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20211003161010|DTPR_INIT_REQUEST_123|2|2|1|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20211003161010|DTPR_INIT_REQUEST_123|2|3|1|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|1|1|2|1|20170215|2017-02-15 10:01:12.000|customer2|article4|1000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|2|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|2|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|1|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|1|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|2|8|4|2|20170430|2017-04-30 
10:01:12.000|customer3|article7|7000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|3|1|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| 20211104151010|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|3|1|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|2|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|2|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|3|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| 20211112171010|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|2|1|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| 20211113121010|DTPR_F89Y1VBE6JO079PMFTL1X8BPY|2|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| 20211113121010|DTPR_F89Y1VBE6JO079PMFTL1X8BPY|3|1|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| 20211114111010|DTPR_F99Y1VBE6JO079PMFTL1X8BPY|3|2|9|6|2|20170603|2017-06-03 01:01:01.000|customer6|article1|5010| 20211114111010|DTPR_F99Y1VBE6JO079PMFTL1X8BPY|4|1|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/data/control/dummy_table_join_condition.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date 20211004151010|DTPR_INIT_REQUEST_123|1|1|1|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 20211004151010|DTPR_INIT_REQUEST_123|1|1|3|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| 
20211004151010|DTPR_INIT_REQUEST_123|1|1|3|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| 20211004151010|DTPR_INIT_REQUEST_123|2|2|1|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| 20211004151010|DTPR_INIT_REQUEST_123|2|3|1|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/data/control/dummy_table_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "decimal(15,0)", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "date", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/data/source/dummy_table.csv ================================================ request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date DTPR_INIT_REQUEST_123|1|1|1|1|1|20160601|2016-06-01 10:01:12.000|customer1|article1|1000| 
DTPR_INIT_REQUEST_123|1|1|3|1|2|20160601|2016-06-01 10:01:12.000|customer1|article2|2000| DTPR_INIT_REQUEST_123|1|1|3|1|3|20160601|2016-06-01 10:01:12.000|customer1|article3|500| DTPR_INIT_REQUEST_123|2|2|1|2|1|20160701|2016-07-01 10:01:12.000|customer11|article33|500| DTPR_INIT_REQUEST_123|2|3|1|3|1|20160701|2016-07-01 10:01:13.000|customer11|article33|500| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/data/source/dummy_table_1.csv ================================================ request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date DTPR_OLD_REQUEST_TO_IGNORE_444|1|1|1|3|1|20170510|2017-05-10 01:01:01.000|customer40|article60|15| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|1|1|2|1|20170215|2017-02-15 10:01:12.000|customer2|article4|1000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|1|2|2|2|20170215|2017-02-15 10:01:12.000|customer2|article6|5000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|2|3|2|3|20170215|2017-02-15 10:01:12.000|customer2|article1|3000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|1|2|4|3|1|20170215|2017-02-15 10:01:12.000|customer1|article5|20000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|1|5|3|2|20170215|2017-02-15 10:01:12.000|customer1|article2|12000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|1|6|3|3|20170215|2017-02-15 10:01:12.000|customer1|article4|9000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|2|7|4|1|20170430|2017-04-30 10:01:12.000|customer3|article3|8000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|2|2|8|4|2|20170430|2017-04-30 10:01:12.000|customer3|article7|7000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|3|1|9|4|3|20170430|2017-04-30 10:01:12.000|customer3|article1|3000| DTPR_F49Y1VBE6JO079PMFTL1X8BPY|3|1|10|4|4|20170430|2017-04-30 10:01:12.000|customer3|article2|5000| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/data/source/dummy_table_2.csv ================================================ 
request|datapakid|partno|record|salesorder|item|date|time|customer|/bic/article|amount|order_date DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|1|1|5|1|20170510|2017-05-10 01:01:01.000|customer4|article6|15000| DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|1|2|5|2|20170510|2017-05-10 01:01:01.000|customer4|article3|10000| DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|2|3|5|3|20170510|2017-05-10 01:01:01.000|customer4|article5|8000| DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|2|4|6|1|20170601|2017-06-01 01:01:01.000|customer2|article4|10000| DTPR_F69Y1VBE6JO079PMFTL1X8BPY|1|3|5|6|2|20170601|2017-06-01 01:01:01.000|customer2|article1|5000| DTPR_F69Y1VBE6JO079PMFTL1X8BPY|2|1|6|6|3|20170601|2017-06-01 01:01:01.000|customer2|article2|9000| DTPR_F89Y1VBE6JO079PMFTL1X8BPY|2|2|7|6|2|20170602|2017-06-02 01:01:01.000|customer5|article1|5320| DTPR_F89Y1VBE6JO079PMFTL1X8BPY|3|1|8|6|3|20170602|2017-06-02 01:01:01.000|customer5|article2|9320| DTPR_F99Y1VBE6JO079PMFTL1X8BPY|3|2|9|6|2|20170603|2017-06-03 01:01:01.000|customer6|article1|5010| DTPR_F99Y1VBE6JO079PMFTL1X8BPY|4|1|10|6|3|20170603|2017-06-03 01:01:01.000|customer6|article2|50| ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/data/source/rsodsactreq.csv ================================================ odsobject|request|datapakid|activate|sidconversion|actrequest|operation|status|paketsize|timestamp dummy_table|DTPR_OLD_REQUEST_TO_IGNORE_444|0|||DTPR_OLD_REQUEST_TO_IGNORE_444|A|0|0000020000|20211003151010 dummy_table|DTPR_F49Y1VBE6JO079PMFTL1X8BPY|0|||ODSR_1C6Q7CHLJJ08WG131T491L1ZF|A|0|0000020000|20211104151010 dummy_table|DTPR_F69Y1VBE6JO079PMFTL1X8BPY|0|||ODSR_2C6Q7CHLJJ08WG131T491L1ZF|A|0|0000020000|20211112171010 dummy_table|DTPR_F89Y1VBE6JO079PMFTL1X8BPY|0|||ODSR_3C6Q7CHLJJ08WG131T491L1ZF|A|0|0000020000|20211113121010 dummy_table|DTPR_F99Y1VBE6JO079PMFTL1X8BPY|0|||ODSR_4C6Q7CHLJJ08WG131T491L1ZF|A|0|0000020000|20211114111010 
dummy_table|DTPR_INIT_REQUEST_123|0|||INIT_RECORD_L23F|A|0|0000010000|20211003161010 ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/dummy_table_schema.json ================================================ { "type": "struct", "fields": [ { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "date", "nullable": true, "metadata": {} }, { "name": "time", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "/bic/article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "date", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/extract_from_sap_bw/extract_write_optimised_dso/rsodsactreq_schema.json ================================================ { "type": "struct", "fields": [ { "name": "odsobject", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "string", "nullable": true, "metadata": {} }, { "name": "activate", "type": "string", "nullable": true, "metadata": {} }, { "name": "sidconversion", "type": "string", "nullable": true, "metadata": {} }, { "name": "actrequest", "type": "string", "nullable": true, "metadata": {} }, { "name": "operation", "type": "string", "nullable": true, "metadata": {} }, 
{ "name": "status", "type": "string", "nullable": true, "metadata": {} }, { "name": "paketsize", "type": "string", "nullable": true, "metadata": {} }, { "name": "timestamp", "type": "decimal(15,0)", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/file_manager/check_restore_status/acon_check_restore_status_directory.json ================================================ { "function": "check_restore_status", "bucket": "test_bucket", "source_object": "test_directory" } ================================================ FILE: tests/resources/feature/file_manager/check_restore_status/acon_check_restore_status_single_object.json ================================================ { "function": "check_restore_status", "bucket": "test_bucket", "source_object": "test_single_file.json" } ================================================ FILE: tests/resources/feature/file_manager/copy_object/acon_copy_directory.json ================================================ { "function": "copy_objects", "bucket": "test_bucket", "source_object": "test_directory", "destination_bucket": "destination_bucket", "destination_object": "destination_directory", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager/copy_object/acon_copy_directory_dry_run.json ================================================ { "function": "copy_objects", "bucket": "test_bucket", "source_object": "test_directory", "destination_bucket": "destination_bucket", "destination_object": "destination_directory", "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager/copy_object/acon_copy_single_object.json ================================================ { "function": "copy_objects", "bucket": "test_bucket", "source_object": "test_single_file.json", "destination_bucket": "destination_bucket", "destination_object": "destination_single_file", "dry_run": 
false } ================================================ FILE: tests/resources/feature/file_manager/copy_object/acon_copy_single_object_dry_run.json ================================================ { "function": "copy_objects", "bucket": "test_bucket", "source_object": "test_single_file.json", "destination_bucket": "destination_bucket", "destination_object": "destination_single_file", "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager/delete_objects/acon_delete_objects.json ================================================ { "function": "delete_objects", "bucket": "test_bucket", "object_paths": ["test_single_file.json", "test_directory"], "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager/delete_objects/acon_delete_objects_dry_run.json ================================================ { "function": "delete_objects", "bucket": "test_bucket", "object_paths": ["test_single_file.json", "test_directory"], "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager/request_restore/acon_request_restore_directory.json ================================================ { "function": "request_restore", "bucket": "test_bucket", "source_object": "test_directory", "restore_expiration": 1, "retrieval_tier": "Bulk", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager/request_restore/acon_request_restore_single_object.json ================================================ { "function": "request_restore", "bucket": "test_bucket", "source_object": "test_single_file.json", "restore_expiration": 1, "retrieval_tier": "Bulk", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager/request_restore_to_destination_and_wait/acon_request_restore_to_destination_and_wait_directory.json 
================================================ { "function": "request_restore_to_destination_and_wait", "bucket": "test_bucket", "source_object": "test_directory", "destination_bucket": "destination_bucket", "destination_object": "destination_directory", "restore_expiration": 1, "retrieval_tier": "Expedited", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager/request_restore_to_destination_and_wait/acon_request_restore_to_destination_and_wait_single_object.json ================================================ { "function": "request_restore_to_destination_and_wait", "bucket": "test_bucket", "source_object": "test_single_file.json", "destination_bucket": "destination_bucket", "destination_object": "destination_single_file", "restore_expiration": 1, "retrieval_tier": "Expedited", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager/request_restore_to_destination_and_wait/acon_request_restore_to_destination_and_wait_single_object_raise_error.json ================================================ { "function": "request_restore_to_destination_and_wait", "bucket": "test_bucket", "source_object": "test_single_file.json", "destination_bucket": "destination_bucket", "destination_object": "destination_single_file", "restore_expiration": 1, "retrieval_tier": "Bulk", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_dbfs/copy_objects/acon_copy_directory.json ================================================ { "function": "copy_objects", "bucket": "", "source_object": "tests/lakehouse/dbfs/test_directory", "destination_bucket": "", "destination_object": "tests/lakehouse/dbfs/destination_directory", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_dbfs/copy_objects/acon_copy_directory_dry_run.json 
================================================ { "function": "copy_objects", "bucket": "", "source_object": "tests/lakehouse/dbfs/test_directory", "destination_bucket": "", "destination_object": "tests/lakehouse/dbfs/destination_directory", "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager_dbfs/copy_objects/acon_copy_single_object.json ================================================ { "function": "copy_objects", "bucket": "", "source_object": "tests/lakehouse/dbfs/test_single_file.json", "destination_bucket": "", "destination_object": "tests/lakehouse/dbfs/destination_single_file.json", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_dbfs/delete_objects/acon_delete_objects.json ================================================ { "function": "delete_objects", "bucket": "", "object_paths": ["tests/lakehouse/dbfs/destination_directory"], "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_dbfs/delete_objects/acon_delete_objects_dry_run.json ================================================ { "function": "delete_objects", "bucket": "", "object_paths": ["tests/lakehouse/dbfs/test_directory", "tests/lakehouse/dbfs/destination_directory"], "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager_dbfs/move_objects/acon_move_objects.json ================================================ { "function": "move_objects", "bucket": "", "source_object": "tests/lakehouse/dbfs/test_directory", "destination_bucket": "", "destination_object": "tests/lakehouse/dbfs/test_mv_directory", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_dbfs/move_objects/acon_move_objects_dry_run.json ================================================ { "function": "move_objects", "bucket": "", "source_object": 
"tests/lakehouse/dbfs/test_directory", "destination_bucket": "", "destination_object": "tests/lakehouse/dbfs/test_mv_directory", "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager_s3/check_restore_status/acon_check_restore_status_directory.json ================================================ { "function": "check_restore_status", "bucket": "test_bucket", "source_object": "test_directory" } ================================================ FILE: tests/resources/feature/file_manager_s3/check_restore_status/acon_check_restore_status_single_object.json ================================================ { "function": "check_restore_status", "bucket": "test_bucket", "source_object": "test_single_file.json" } ================================================ FILE: tests/resources/feature/file_manager_s3/copy_objects/acon_copy_directory.json ================================================ { "function": "copy_objects", "bucket": "test_bucket", "source_object": "test_directory", "destination_bucket": "destination_bucket", "destination_object": "destination_directory", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_s3/copy_objects/acon_copy_directory_dry_run.json ================================================ { "function": "copy_objects", "bucket": "test_bucket", "source_object": "test_directory", "destination_bucket": "destination_bucket", "destination_object": "destination_directory", "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager_s3/copy_objects/acon_copy_single_object.json ================================================ { "function": "copy_objects", "bucket": "test_bucket", "source_object": "test_single_file.json", "destination_bucket": "destination_bucket", "destination_object": "destination_single_file", "dry_run": false } ================================================ FILE: 
tests/resources/feature/file_manager_s3/copy_objects/acon_copy_single_object_dry_run.json ================================================ { "function": "copy_objects", "bucket": "test_bucket", "source_object": "test_single_file.json", "destination_bucket": "destination_bucket", "destination_object": "destination_single_file", "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager_s3/delete_objects/acon_delete_objects.json ================================================ { "function": "delete_objects", "bucket": "test_bucket", "object_paths": ["test_single_file.json", "test_directory"], "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_s3/delete_objects/acon_delete_objects_dry_run.json ================================================ { "function": "delete_objects", "bucket": "test_bucket", "object_paths": ["test_single_file.json", "test_directory"], "dry_run": true } ================================================ FILE: tests/resources/feature/file_manager_s3/request_restore/acon_request_restore_directory.json ================================================ { "function": "request_restore", "bucket": "test_bucket", "source_object": "test_directory", "restore_expiration": 1, "retrieval_tier": "Bulk", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_s3/request_restore/acon_request_restore_single_object.json ================================================ { "function": "request_restore", "bucket": "test_bucket", "source_object": "test_single_file.json", "restore_expiration": 1, "retrieval_tier": "Bulk", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_s3/request_restore_to_destination_and_wait/acon_request_restore_to_destination_and_wait_directory.json ================================================ { "function": 
"request_restore_to_destination_and_wait", "bucket": "test_bucket", "source_object": "test_directory", "destination_bucket": "destination_bucket", "destination_object": "destination_directory", "restore_expiration": 1, "retrieval_tier": "Expedited", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_s3/request_restore_to_destination_and_wait/acon_request_restore_to_destination_and_wait_single_object.json ================================================ { "function": "request_restore_to_destination_and_wait", "bucket": "test_bucket", "source_object": "test_single_file.json", "destination_bucket": "destination_bucket", "destination_object": "destination_single_file", "restore_expiration": 1, "retrieval_tier": "Expedited", "dry_run": false } ================================================ FILE: tests/resources/feature/file_manager_s3/request_restore_to_destination_and_wait/acon_request_restore_to_destination_and_wait_single_object_raise_error.json ================================================ { "function": "request_restore_to_destination_and_wait", "bucket": "test_bucket", "source_object": "test_single_file.json", "destination_bucket": "destination_bucket", "destination_object": "destination_single_file", "restore_expiration": 1, "retrieval_tier": "Bulk", "dry_run": false } ================================================ FILE: tests/resources/feature/full_load/full_overwrite/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/full_load/full_overwrite/data" } ], "transform_specs": [ { "spec_id": "repartitioned_sales", "input_id": "sales_source", "transformers": [ { "function": "repartition", "args": { "num_partitions": 1, "cols": ["date", "customer"] } } ] } ], "output_specs": [ { 
"spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/full_load/full_overwrite/data" } ] } ================================================ FILE: tests/resources/feature/full_load/full_overwrite/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/full_load/full_overwrite/data" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/full_load/full_overwrite/data" } ] } ================================================ FILE: tests/resources/feature/full_load/full_overwrite/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|10000 1|2|20160601|customer1|article2|20000 1|3|20160601|customer1|article3|5000 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 5|1|20170510|customer4|article6|15000 5|2|20170510|customer4|article3|10000 5|3|20170510|customer4|article5|8000 6|1|20170601|customer2|article4|10000 6|2|20170601|customer2|article1|5000 6|3|20170601|customer2|article2|9000 ================================================ FILE: tests/resources/feature/full_load/full_overwrite/data/source/part-01.csv 
================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/full_load/full_overwrite/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|10000 1|2|20160601|customer1|article2|20000 1|3|20160601|customer1|article3|5000 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 5|1|20170510|customer4|article6|15000 5|2|20170510|customer4|article3|10000 5|3|20170510|customer4|article5|8000 6|1|20170601|customer2|article4|10000 6|2|20170601|customer2|article1|5000 6|3|20170601|customer2|article2|9000 ================================================ FILE: tests/resources/feature/full_load/with_filter/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, 
"delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/full_load/with_filter/data" } ], "transform_specs": [ { "spec_id": "filtered_sales", "input_id": "sales_source", "transformers": [ { "function": "expression_filter", "args": { "exp": "date like '2016%'" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "filtered_sales", "write_type": "overwrite", "data_format": "parquet", "location": "file:///app/tests/lakehouse/out/feature/full_load/with_filter/data" } ] } ================================================ FILE: tests/resources/feature/full_load/with_filter/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/full_load/with_filter/data" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "data_format": "parquet", "location": "file:///app/tests/lakehouse/out/feature/full_load/with_filter/data" } ] } ================================================ FILE: tests/resources/feature/full_load/with_filter/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/full_load/with_filter/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 
3|3|20170215|customer1|article4|900 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/full_load/with_filter/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/full_load/with_filter_partition_overwrite/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/full_load/with_filter_partition_overwrite/data" } ], "transform_specs": [ { "spec_id": "filtered_sales", "input_id": "sales_source", "transformers": [ { "function": "expression_filter", "args": { "exp": "date like '2016%'" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": 
"filtered_sales", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/full_load/with_filter_partition_overwrite/data", "options": { "replaceWhere": "date like '2016%'" } } ] } ================================================ FILE: tests/resources/feature/full_load/with_filter_partition_overwrite/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|", "inferSchema": true }, "location": "file:///app/tests/lakehouse/in/feature/full_load/with_filter_partition_overwrite/data" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "data_format": "delta", "partitions": [ "date", "customer" ], "location": "file:///app/tests/lakehouse/out/feature/full_load/with_filter_partition_overwrite/data" } ] } ================================================ FILE: tests/resources/feature/full_load/with_filter_partition_overwrite/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|10000 1|2|20160601|customer1|article2|20000 1|3|20160601|customer1|article3|5000 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: 
tests/resources/feature/full_load/with_filter_partition_overwrite/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|100 2|2|20170215|customer2|article6|500 2|3|20170215|customer2|article1|300 3|1|20170215|customer1|article5|2000 3|2|20170215|customer1|article2|1200 3|3|20170215|customer1|article4|900 4|1|20170430|customer3|article3|800 4|2|20170430|customer3|article7|700 4|3|20170430|customer3|article1|300 4|4|20170430|customer3|article2|500 5|1|20170510|customer4|article6|1500 5|2|20170510|customer4|article3|1000 5|3|20170510|customer4|article5|800 6|1|20170601|customer2|article4|1000 6|2|20170601|customer2|article1|500 6|3|20170601|customer2|article2|900 ================================================ FILE: tests/resources/feature/full_load/with_filter_partition_overwrite/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|10000 1|2|20160601|customer1|article2|20000 1|3|20160601|customer1|article3|5000 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 5|1|20170510|customer4|article6|15000 5|2|20170510|customer4|article3|10000 5|3|20170510|customer4|article5|8000 6|1|20170601|customer2|article4|10000 6|2|20170601|customer2|article1|5000 6|3|20170601|customer2|article2|9000 ================================================ FILE: tests/resources/feature/gab/control/data/vw_dummy_sales_kpi.csv ================================================ 
cadence|order_date|to_date|category_name|qty_articles|total_amount|total_amount_last_year|avg_total_amount_last_2_years|discounted_total_amount YEAR|2016-01-01|2016-12-31|category_a|3|7000|0|0|3920.0000000000005 YEAR|2017-01-01|2017-12-31|category_a|10|15000|7000|7000|8400 YEAR|2018-01-01|2018-12-31|category_a|4|36|15000|11000|20.160000000000004 YEAR|2017-01-01|2017-12-31|category_b|5|11000|0|0|6160.000000000001 ================================================ FILE: tests/resources/feature/gab/control/data/vw_nam_orders_all_snapshot.csv ================================================ cadence|order_date|to_date|sales_order_schedule|delivery_country_cod|orders|total_sales|orders_last_cad|orders_last_year|orders_avg_last_3_1|orders_derived MONTH|2022-01-01|2022-01-31|10102417|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY6|8|808|0|0|0|4 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY2|10|1010|0|0|0|5 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY2|4|404|0|0|0|2 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY6|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY7|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY11|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY3|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY1|24|2424|0|0|0|12 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY2|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY5|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY1|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY9|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY4|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY3|6|606|0|0|0|3 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY3|2|202|0|0|0|1 
QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY4|9|909|0|0|0|4.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY8|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY9|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY6|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY1|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY5|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY8|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY4|9|909|0|0|0|4.5 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY1|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY10|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY1|12|1213|0|0|0|6 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY3|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY5|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY2|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY1|78|7878|0|0|0|39 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY2|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY3|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY5|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY2|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY6|8|808|0|0|0|4 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY6|4|404|0|0|0|2 QUARTER|2022-01-01|2022-03-31|10102419|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY1|78|7878|0|0|0|39 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY6|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY2|10|1010|0|0|0|5 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY1|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY1|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY4|1|101|0|0|0|0.5 
QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY3|6|606|0|0|0|3 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY2|4|404|0|0|0|2 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY2|5|505|0|0|0|2.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY6|4|404|0|0|0|2 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY3|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY7|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102419|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY1|24|2424|0|0|0|12 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY1|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY6|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY11|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY1|12|1213|0|0|0|6 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY10|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY2|5|505|0|0|0|2.5 ================================================ FILE: tests/resources/feature/gab/control/data/vw_nam_orders_filtered_snapshot.csv ================================================ cadence|order_date|to_date|sales_order_schedule|delivery_country_cod|orders|total_sales|orders_last_cad|orders_last_year|orders_avg_last_3_1|orders_derived MONTH|2022-01-01|2022-01-31|10102417|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY6|8|808|0|0|0|4 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY6|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY3|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY3|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY3|6|606|0|0|0|3 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY3|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY6|1|101|0|0|0|0.5 
QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY6|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY3|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY6|8|808|0|0|0|4 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY6|4|404|0|0|0|2 QUARTER|2022-01-01|2022-03-31|10102419|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY6|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY3|6|606|0|0|0|3 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY6|4|404|0|0|0|2 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY3|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102419|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY6|2|202|0|0|0|1 ================================================ FILE: tests/resources/feature/gab/control/data/vw_negative_offset_orders_all.csv ================================================ cadence|order_date|to_date|sales_order_schedule|delivery_country_cod|orders|total_sales|orders_last_cad|orders_last_year|orders_avg_last_3_1|orders_derived WEEK|2022-01-02|2022-01-07|10102413|COUNTRY1|24|2424|21|21|21|12 WEEK|2022-01-02|2022-01-09|10102413|COUNTRY3|3|303|3|3|9|1.5 WEEK|2022-01-02|2022-01-09|10102417|COUNTRY3|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-09|10102413|COUNTRY2|5|505|5|5|13|2.5 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY5|3|303|3|3|6|1.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY8|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-09|10102415|COUNTRY6|3|303|3|3|9|1.5 WEEK|2022-01-02|2022-01-08|10102416|COUNTRY3|2|202|2|2|4|1 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY11|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY4|9|909|9|9|9|4.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY7|3|303|3|3|3|1.5 
WEEK|2022-01-02|2022-01-06|10102412|COUNTRY1|71|7171|0|0|0|35.5 WEEK|2022-01-02|2022-01-07|10102415|COUNTRY3|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-09|10102417|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY7|3|303|3|3|9|1.5 WEEK|2022-01-02|2022-01-07|10102417|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY8|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY10|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102418|COUNTRY1|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-09|10102416|COUNTRY3|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-08|10102413|COUNTRY4|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY1|78|7878|78|78|227|39 WEEK|2022-01-02|2022-01-09|10102418|COUNTRY3|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY11|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-09|10102413|COUNTRY6|4|404|4|4|12|2 WEEK|2022-01-02|2022-01-06|10102418|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102418|COUNTRY3|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY6|8|808|8|8|24|4 WEEK|2022-01-02|2022-01-08|10102413|COUNTRY3|3|303|3|3|6|1.5 WEEK|2022-01-02|2022-01-06|10102417|COUNTRY2|2|202|0|0|0|1 WEEK|2022-01-02|2022-01-08|10102415|COUNTRY1|12|1213|12|12|22|6 WEEK|2022-01-02|2022-01-08|10102415|COUNTRY3|2|202|2|2|4|1 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY8|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-07|10102413|COUNTRY5|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY11|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY2|10|1010|10|10|27|5 WEEK|2022-01-02|2022-01-09|10102416|COUNTRY2|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-08|10102416|COUNTRY2|2|202|2|2|4|1 WEEK|2022-01-02|2022-01-06|10102413|COUNTRY1|21|2121|0|0|0|10.5 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY6|8|808|8|8|16|4 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY9|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-09|10102418|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-06|10102415|COUNTRY6|3|303|0|0|0|1.5 
WEEK|2022-01-02|2022-01-07|10102415|COUNTRY6|3|303|3|3|3|1.5 WEEK|2022-01-02|2022-01-06|10102417|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-08|10102413|COUNTRY6|4|404|4|4|8|2 WEEK|2022-01-02|2022-01-06|10102418|COUNTRY1|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-09|10102413|COUNTRY4|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-08|10102418|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-07|10102417|COUNTRY1|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY3|6|606|6|6|17|3 WEEK|2022-01-02|2022-01-08|10102415|COUNTRY2|4|404|3|3|6|2 WEEK|2022-01-02|2022-01-07|10102413|COUNTRY4|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-09|10102417|COUNTRY1|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-08|10102418|COUNTRY3|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY4|9|909|0|0|0|4.5 WEEK|2022-01-02|2022-01-09|10102417|COUNTRY2|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-09|10102416|COUNTRY1|3|303|3|3|9|1.5 WEEK|2022-01-02|2022-01-08|10102417|COUNTRY2|2|202|2|2|4|1 WEEK|2022-01-02|2022-01-08|10102417|COUNTRY3|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-09|10102415|COUNTRY2|4|404|4|4|10|2 WEEK|2022-01-02|2022-01-06|10102416|COUNTRY1|3|303|0|0|0|1.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY3|5|505|0|0|0|2.5 WEEK|2022-01-02|2022-01-09|10102413|COUNTRY1|24|2424|24|24|69|12 WEEK|2022-01-02|2022-01-06|10102419|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-06|10102413|COUNTRY5|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY4|9|909|9|9|18|4.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY2|8|808|0|0|0|4 WEEK|2022-01-02|2022-01-07|10102418|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY9|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-06|10102413|COUNTRY3|3|303|0|0|0|1.5 WEEK|2022-01-02|2022-01-07|10102417|COUNTRY3|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-06|10102413|COUNTRY2|4|404|0|0|0|2 WEEK|2022-01-02|2022-01-08|10102413|COUNTRY2|5|505|4|4|8|2.5 WEEK|2022-01-02|2022-01-09|10102418|COUNTRY1|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-06|10102413|COUNTRY6|4|404|0|0|0|2 
WEEK|2022-01-02|2022-01-07|10102413|COUNTRY6|4|404|4|4|4|2 WEEK|2022-01-02|2022-01-06|10102415|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY4|9|909|9|9|27|4.5 WEEK|2022-01-02|2022-01-07|10102416|COUNTRY2|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY1|78|7878|71|71|71|39 WEEK|2022-01-02|2022-01-06|10102417|COUNTRY1|2|202|0|0|0|1 WEEK|2022-01-02|2022-01-07|10102415|COUNTRY2|3|303|3|3|3|1.5 WEEK|2022-01-02|2022-01-06|10102418|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY6|8|808|8|8|8|4 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY7|3|303|3|3|6|1.5 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY9|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-06|10102416|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-02|2022-01-08|10102418|COUNTRY1|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY3|6|606|5|5|5|3 WEEK|2022-01-02|2022-01-06|10102416|COUNTRY2|2|202|0|0|0|1 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY5|3|303|0|0|0|1.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY6|8|808|0|0|0|4 WEEK|2022-01-02|2022-01-09|10102415|COUNTRY3|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-07|10102416|COUNTRY3|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-07|10102416|COUNTRY1|3|303|3|3|3|1.5 WEEK|2022-01-02|2022-01-07|10102415|COUNTRY1|12|1213|10|10|10|6 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY9|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-07|10102419|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-06|10102415|COUNTRY2|3|303|0|0|0|1.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY5|3|303|3|3|3|1.5 WEEK|2022-01-02|2022-01-08|10102413|COUNTRY5|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-08|10102413|COUNTRY1|24|2424|24|24|45|12 WEEK|2022-01-02|2022-01-08|10102416|COUNTRY6|2|202|2|2|4|1 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY3|6|606|6|6|11|3 WEEK|2022-01-02|2022-01-09|10102415|COUNTRY1|12|1213|12|12|34|6 WEEK|2022-01-02|2022-01-08|10102419|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY2|10|1010|9|9|17|5 
WEEK|2022-01-02|2022-01-06|10102415|COUNTRY1|10|1011|0|0|0|5 WEEK|2022-01-02|2022-01-07|10102417|COUNTRY2|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY11|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY1|78|7878|78|78|149|39 WEEK|2022-01-02|2022-01-09|10102419|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-07|10102416|COUNTRY6|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-08|10102415|COUNTRY6|3|303|3|3|6|1.5 WEEK|2022-01-02|2022-01-09|10102416|COUNTRY6|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-06|10102417|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY8|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-06|10102413|COUNTRY4|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY2|9|909|8|8|8|4.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY10|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-08|10102417|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-07|10102413|COUNTRY3|3|303|3|3|3|1.5 WEEK|2022-01-02|2022-01-08|10102417|COUNTRY1|2|202|2|2|4|1 WEEK|2022-01-02|2022-01-07|10102413|COUNTRY2|4|404|4|4|4|2 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY5|3|303|3|3|9|1.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY7|3|303|0|0|0|1.5 WEEK|2022-01-02|2022-01-06|10102416|COUNTRY6|2|202|0|0|0|1 WEEK|2022-01-02|2022-01-09|10102413|COUNTRY5|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY10|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-08|10102416|COUNTRY1|3|303|3|3|6|1.5 ================================================ FILE: tests/resources/feature/gab/control/data/vw_negative_offset_orders_filtered.csv ================================================ cadence|order_date|to_date|sales_order_schedule|delivery_country_cod|orders|total_sales|orders_last_cad|orders_last_year|orders_avg_last_3_1|orders_derived WEEK|2022-01-02|2022-01-09|10102413|COUNTRY3|3|303|3|3|9|1.5 WEEK|2022-01-02|2022-01-09|10102417|COUNTRY3|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-09|10102415|COUNTRY6|3|303|3|3|9|1.5 WEEK|2022-01-02|2022-01-08|10102416|COUNTRY3|2|202|2|2|4|1 
WEEK|2022-01-02|2022-01-07|10102415|COUNTRY3|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-09|10102417|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-07|10102417|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-09|10102416|COUNTRY3|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-09|10102418|COUNTRY3|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-09|10102413|COUNTRY6|4|404|4|4|12|2 WEEK|2022-01-02|2022-01-06|10102418|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102418|COUNTRY3|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY6|8|808|8|8|24|4 WEEK|2022-01-02|2022-01-08|10102413|COUNTRY3|3|303|3|3|6|1.5 WEEK|2022-01-02|2022-01-08|10102415|COUNTRY3|2|202|2|2|4|1 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY6|8|808|8|8|16|4 WEEK|2022-01-02|2022-01-09|10102418|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-06|10102415|COUNTRY6|3|303|0|0|0|1.5 WEEK|2022-01-02|2022-01-07|10102415|COUNTRY6|3|303|3|3|3|1.5 WEEK|2022-01-02|2022-01-06|10102417|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-08|10102413|COUNTRY6|4|404|4|4|8|2 WEEK|2022-01-02|2022-01-08|10102418|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-09|10102412|COUNTRY3|6|606|6|6|17|3 WEEK|2022-01-02|2022-01-08|10102418|COUNTRY3|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-08|10102417|COUNTRY3|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-06|10102412|COUNTRY3|5|505|0|0|0|2.5 WEEK|2022-01-02|2022-01-06|10102419|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102418|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-06|10102413|COUNTRY3|3|303|0|0|0|1.5 WEEK|2022-01-02|2022-01-07|10102417|COUNTRY3|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-06|10102413|COUNTRY6|4|404|0|0|0|2 WEEK|2022-01-02|2022-01-07|10102413|COUNTRY6|4|404|4|4|4|2 WEEK|2022-01-02|2022-01-06|10102415|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-02|2022-01-06|10102418|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY6|8|808|8|8|8|4 WEEK|2022-01-02|2022-01-06|10102416|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-02|2022-01-07|10102412|COUNTRY3|6|606|5|5|5|3 
WEEK|2022-01-02|2022-01-06|10102412|COUNTRY6|8|808|0|0|0|4 WEEK|2022-01-02|2022-01-09|10102415|COUNTRY3|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-07|10102416|COUNTRY3|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-07|10102419|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-02|2022-01-08|10102416|COUNTRY6|2|202|2|2|4|1 WEEK|2022-01-02|2022-01-08|10102412|COUNTRY3|6|606|6|6|11|3 WEEK|2022-01-02|2022-01-08|10102419|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-09|10102419|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-02|2022-01-07|10102416|COUNTRY6|2|202|2|2|2|1 WEEK|2022-01-02|2022-01-08|10102415|COUNTRY6|3|303|3|3|6|1.5 WEEK|2022-01-02|2022-01-09|10102416|COUNTRY6|2|202|2|2|6|1 WEEK|2022-01-02|2022-01-06|10102417|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-02|2022-01-08|10102417|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-02|2022-01-07|10102413|COUNTRY3|3|303|3|3|3|1.5 WEEK|2022-01-02|2022-01-06|10102416|COUNTRY6|2|202|0|0|0|1 ================================================ FILE: tests/resources/feature/gab/control/data/vw_orders_all.csv ================================================ cadence|order_date|to_date|sales_order_schedule|delivery_country_cod|orders|total_sales|orders_last_cad|orders_last_year|orders_avg_last_3_1|orders_derived DAY|2022-01-06|2022-01-06|10102412|COUNTRY1|71|7171|0|0|0|35.5 DAY|2022-01-07|2022-01-07|10102412|COUNTRY1|7|707|71|0|71|3.5 DAY|2022-01-07|2022-01-07|10102412|COUNTRY10|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY11|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY2|8|808|0|0|0|4 DAY|2022-01-07|2022-01-07|10102412|COUNTRY2|1|101|8|0|8|0.5 DAY|2022-01-08|2022-01-08|10102412|COUNTRY2|1|101|1|0|9|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY3|5|505|0|0|0|2.5 DAY|2022-01-07|2022-01-07|10102412|COUNTRY3|1|101|5|0|5|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY4|9|909|0|0|0|4.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY5|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY6|8|808|0|0|0|4 
DAY|2022-01-06|2022-01-06|10102412|COUNTRY7|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY8|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY9|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY1|21|2121|0|0|0|10.5 DAY|2022-01-07|2022-01-07|10102413|COUNTRY1|3|303|21|0|21|1.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY2|4|404|0|0|0|2 DAY|2022-01-08|2022-01-08|10102413|COUNTRY2|1|101|4|0|4|0.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY3|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY4|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY5|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY6|4|404|0|0|0|2 DAY|2022-01-06|2022-01-06|10102415|COUNTRY1|10|1011|0|0|0|5 DAY|2022-01-07|2022-01-07|10102415|COUNTRY1|2|202|10|0|10|1 DAY|2022-01-06|2022-01-06|10102415|COUNTRY2|3|303|0|0|0|1.5 DAY|2022-01-08|2022-01-08|10102415|COUNTRY2|1|101|3|0|3|0.5 DAY|2022-01-06|2022-01-06|10102415|COUNTRY3|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102415|COUNTRY6|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102416|COUNTRY1|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102416|COUNTRY2|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102416|COUNTRY3|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102416|COUNTRY6|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102417|COUNTRY1|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102417|COUNTRY2|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102417|COUNTRY3|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102417|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102418|COUNTRY1|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102418|COUNTRY3|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102418|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102419|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY1|78|7878|0|0|0|39 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY10|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY11|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY2|10|1010|0|0|0|5 
MONTH|2022-01-01|2022-01-31|10102412|COUNTRY3|6|606|0|0|0|3 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY4|9|909|0|0|0|4.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY5|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY6|8|808|0|0|0|4 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY7|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY8|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY9|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY1|24|2424|0|0|0|12 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY2|5|505|0|0|0|2.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY3|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY4|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY5|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY6|4|404|0|0|0|2 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY1|12|1213|0|0|0|6 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY2|4|404|0|0|0|2 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY6|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY1|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY2|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY6|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY1|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY2|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY3|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY1|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY3|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102419|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY1|78|7878|0|0|0|39 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY10|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY11|1|101|0|0|0|0.5 
QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY2|10|1010|0|0|0|5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY3|6|606|0|0|0|3 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY4|9|909|0|0|0|4.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY5|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY6|8|808|0|0|0|4 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY7|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY8|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY9|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY1|24|2424|0|0|0|12 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY2|5|505|0|0|0|2.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY3|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY4|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY5|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY6|4|404|0|0|0|2 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY1|12|1213|0|0|0|6 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY2|4|404|0|0|0|2 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY3|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY6|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY1|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY2|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY3|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY6|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY1|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY2|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY1|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102419|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY1|78|7878|0|0|0|39 
WEEK|2022-01-03|2022-01-09|10102412|COUNTRY10|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY11|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY2|10|1010|0|0|0|5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY3|6|606|0|0|0|3 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY4|9|909|0|0|0|4.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY5|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY6|8|808|0|0|0|4 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY7|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY8|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY9|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY1|24|2424|0|0|0|12 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY2|5|505|0|0|0|2.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY3|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY4|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY5|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY6|4|404|0|0|0|2 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY1|12|1213|0|0|0|6 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY2|4|404|0|0|0|2 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY6|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY1|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY2|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY6|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY1|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY2|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY1|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102419|COUNTRY6|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY1|78|7878|0|0|0|39 
YEAR|2022-01-01|2022-12-31|10102412|COUNTRY10|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY11|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY2|10|1010|0|0|0|5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY3|6|606|0|0|0|3 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY4|9|909|0|0|0|4.5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY5|3|303|0|0|0|1.5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY6|8|808|0|0|0|4 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY7|3|303|0|0|0|1.5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY8|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY9|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102413|COUNTRY1|24|2424|0|0|0|12 YEAR|2022-01-01|2022-12-31|10102413|COUNTRY2|5|505|0|0|0|2.5 YEAR|2022-01-01|2022-12-31|10102413|COUNTRY3|3|303|0|0|0|1.5 YEAR|2022-01-01|2022-12-31|10102413|COUNTRY4|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102413|COUNTRY5|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102413|COUNTRY6|4|404|0|0|0|2 YEAR|2022-01-01|2022-12-31|10102415|COUNTRY1|12|1213|0|0|0|6 YEAR|2022-01-01|2022-12-31|10102415|COUNTRY2|4|404|0|0|0|2 YEAR|2022-01-01|2022-12-31|10102415|COUNTRY3|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102415|COUNTRY6|3|303|0|0|0|1.5 YEAR|2022-01-01|2022-12-31|10102416|COUNTRY1|3|303|0|0|0|1.5 YEAR|2022-01-01|2022-12-31|10102416|COUNTRY2|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102416|COUNTRY3|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102416|COUNTRY6|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102417|COUNTRY1|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102417|COUNTRY2|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102417|COUNTRY3|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102417|COUNTRY6|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102418|COUNTRY1|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102418|COUNTRY3|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102418|COUNTRY6|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102419|COUNTRY6|1|101|0|0|0|0.5 ================================================ FILE: 
tests/resources/feature/gab/control/data/vw_orders_all_snapshot.csv ================================================ cadence|order_date|to_date|sales_order_schedule|delivery_country_cod|orders|total_sales|orders_last_cad|orders_last_year|orders_avg_last_3_1|orders_derived DAY|2022-01-06|2022-01-06|10102412|COUNTRY1|71|7171|0|0|0|35.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY6|4|404|0|0|0|2 DAY|2022-01-06|2022-01-06|10102412|COUNTRY8|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY2|4|404|0|0|0|2 DAY|2022-01-08|2022-01-08|10102415|COUNTRY2|1|101|3|0|3|0.5 DAY|2022-01-06|2022-01-06|10102419|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102416|COUNTRY3|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102412|COUNTRY4|9|909|0|0|0|4.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY7|3|303|0|0|0|1.5 DAY|2022-01-07|2022-01-07|10102415|COUNTRY1|2|202|10|0|10|1 DAY|2022-01-06|2022-01-06|10102413|COUNTRY3|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY4|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY5|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102417|COUNTRY2|2|202|0|0|0|1 DAY|2022-01-07|2022-01-07|10102412|COUNTRY1|7|707|71|0|71|3.5 DAY|2022-01-07|2022-01-07|10102412|COUNTRY2|1|101|8|0|8|0.5 DAY|2022-01-06|2022-01-06|10102418|COUNTRY1|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY5|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102418|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102417|COUNTRY1|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102416|COUNTRY1|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102418|COUNTRY3|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102417|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY9|1|101|0|0|0|0.5 DAY|2022-01-07|2022-01-07|10102412|COUNTRY3|1|101|5|0|5|0.5 DAY|2022-01-06|2022-01-06|10102415|COUNTRY3|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102417|COUNTRY3|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY6|8|808|0|0|0|4 
DAY|2022-01-06|2022-01-06|10102416|COUNTRY2|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102413|COUNTRY1|21|2121|0|0|0|10.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY2|8|808|0|0|0|4 DAY|2022-01-07|2022-01-07|10102413|COUNTRY1|3|303|21|0|21|1.5 DAY|2022-01-08|2022-01-08|10102413|COUNTRY2|1|101|4|0|4|0.5 DAY|2022-01-06|2022-01-06|10102415|COUNTRY1|10|1011|0|0|0|5 DAY|2022-01-06|2022-01-06|10102416|COUNTRY6|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102412|COUNTRY11|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102415|COUNTRY6|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102415|COUNTRY2|3|303|0|0|0|1.5 DAY|2022-01-07|2022-01-07|10102412|COUNTRY10|1|101|0|0|0|0.5 DAY|2022-01-08|2022-01-08|10102412|COUNTRY2|1|101|1|0|9|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY3|5|505|0|0|0|2.5 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY2|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY9|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102418|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY7|3|303|3|3|3|1.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY3|3|303|3|3|9|1.5 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY2|4|404|4|4|10|2 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY8|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-08|10102413|COUNTRY4|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY6|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-07|10102415|COUNTRY1|12|1213|10|10|10|6 WEEK|2022-01-03|2022-01-06|10102417|COUNTRY1|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY8|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-08|10102415|COUNTRY6|3|303|3|3|6|1.5 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY2|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY6|8|808|8|8|8|4 WEEK|2022-01-03|2022-01-07|10102415|COUNTRY2|3|303|3|3|3|1.5 WEEK|2022-01-03|2022-01-07|10102417|COUNTRY3|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-08|10102418|COUNTRY3|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY7|3|303|3|3|9|1.5 
WEEK|2022-01-03|2022-01-06|10102419|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-07|10102418|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-08|10102413|COUNTRY2|5|505|4|4|8|2.5 WEEK|2022-01-03|2022-01-07|10102417|COUNTRY2|2|202|2|2|2|1 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY11|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-06|10102415|COUNTRY2|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-06|10102413|COUNTRY1|21|2121|0|0|0|10.5 WEEK|2022-01-03|2022-01-07|10102413|COUNTRY4|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-07|10102418|COUNTRY3|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY2|10|1010|10|10|27|5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY6|4|404|4|4|12|2 WEEK|2022-01-03|2022-01-06|10102413|COUNTRY4|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-08|10102418|COUNTRY1|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY3|6|606|6|6|17|3 WEEK|2022-01-03|2022-01-06|10102415|COUNTRY6|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-06|10102416|COUNTRY1|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-08|10102417|COUNTRY2|2|202|2|2|4|1 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY8|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-06|10102418|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY1|71|7171|0|0|0|35.5 WEEK|2022-01-03|2022-01-06|10102418|COUNTRY1|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-06|10102417|COUNTRY2|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY9|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102416|COUNTRY2|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-06|10102415|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-06|10102413|COUNTRY6|4|404|0|0|0|2 WEEK|2022-01-03|2022-01-06|10102415|COUNTRY1|10|1011|0|0|0|5 WEEK|2022-01-03|2022-01-08|10102415|COUNTRY3|2|202|2|2|4|1 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY4|9|909|0|0|0|4.5 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY10|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-07|10102416|COUNTRY1|3|303|3|3|3|1.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY4|9|909|9|9|27|4.5 
WEEK|2022-01-03|2022-01-08|10102412|COUNTRY7|3|303|3|3|6|1.5 WEEK|2022-01-03|2022-01-08|10102416|COUNTRY3|2|202|2|2|4|1 WEEK|2022-01-03|2022-01-07|10102418|COUNTRY1|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY3|5|505|0|0|0|2.5 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY6|3|303|3|3|9|1.5 WEEK|2022-01-03|2022-01-08|10102416|COUNTRY1|3|303|3|3|6|1.5 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY5|3|303|3|3|6|1.5 WEEK|2022-01-03|2022-01-07|10102413|COUNTRY6|4|404|4|4|4|2 WEEK|2022-01-03|2022-01-06|10102417|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-08|10102419|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY3|6|606|5|5|5|3 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY2|9|909|8|8|8|4.5 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY2|10|1010|9|9|17|5 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY3|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY3|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY5|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-07|10102415|COUNTRY6|3|303|3|3|3|1.5 WEEK|2022-01-03|2022-01-06|10102413|COUNTRY5|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY9|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-08|10102413|COUNTRY6|4|404|4|4|8|2 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY3|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102417|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-08|10102415|COUNTRY2|4|404|3|3|6|2 WEEK|2022-01-03|2022-01-08|10102417|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-08|10102415|COUNTRY1|12|1213|12|12|22|6 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY3|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-07|10102413|COUNTRY5|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-06|10102416|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY5|3|303|3|3|3|1.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY2|5|505|5|5|13|2.5 WEEK|2022-01-03|2022-01-07|10102413|COUNTRY2|4|404|4|4|4|2 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY1|1|101|1|1|3|0.5 
WEEK|2022-01-03|2022-01-07|10102412|COUNTRY10|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY8|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-07|10102413|COUNTRY3|3|303|3|3|3|1.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY6|8|808|8|8|24|4 WEEK|2022-01-03|2022-01-08|10102416|COUNTRY6|2|202|2|2|4|1 WEEK|2022-01-03|2022-01-06|10102413|COUNTRY2|4|404|0|0|0|2 WEEK|2022-01-03|2022-01-07|10102419|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY7|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-07|10102417|COUNTRY1|2|202|2|2|2|1 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY1|78|7878|78|78|227|39 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY5|3|303|3|3|9|1.5 WEEK|2022-01-03|2022-01-09|10102419|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102413|COUNTRY3|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY1|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY4|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102416|COUNTRY6|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-07|10102415|COUNTRY3|2|202|2|2|2|1 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY4|9|909|9|9|9|4.5 WEEK|2022-01-03|2022-01-08|10102413|COUNTRY1|24|2424|24|24|45|12 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY1|78|7878|78|78|149|39 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY9|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-08|10102413|COUNTRY5|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-07|10102417|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY3|6|606|6|6|11|3 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY11|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY11|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY10|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY1|24|2424|24|24|69|12 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY6|8|808|0|0|0|4 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY5|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-08|10102417|COUNTRY1|2|202|2|2|4|1 
WEEK|2022-01-03|2022-01-07|10102412|COUNTRY1|78|7878|71|71|71|39 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY6|8|808|8|8|16|4 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY1|12|1213|12|12|34|6 WEEK|2022-01-03|2022-01-07|10102416|COUNTRY6|2|202|2|2|2|1 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY11|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-08|10102417|COUNTRY3|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-08|10102418|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-07|10102413|COUNTRY1|24|2424|21|21|21|12 WEEK|2022-01-03|2022-01-08|10102413|COUNTRY3|3|303|3|3|6|1.5 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY2|8|808|0|0|0|4 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY4|9|909|9|9|18|4.5 WEEK|2022-01-03|2022-01-07|10102416|COUNTRY3|2|202|2|2|2|1 WEEK|2022-01-03|2022-01-08|10102416|COUNTRY2|2|202|2|2|4|1 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY1|3|303|3|3|9|1.5 WEEK|2022-01-03|2022-01-07|10102416|COUNTRY2|2|202|2|2|2|1 ================================================ FILE: tests/resources/feature/gab/control/data/vw_orders_filtered.csv ================================================ cadence|order_date|to_date|sales_order_schedule|delivery_country_cod|orders|total_sales|orders_last_cad|orders_last_year|orders_avg_last_3_1|orders_derived DAY|2022-01-06|2022-01-06|10102412|COUNTRY3|5|505|0|0|0|2.5 DAY|2022-01-07|2022-01-07|10102412|COUNTRY3|1|101|5|0|5|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY6|8|808|0|0|0|4 DAY|2022-01-06|2022-01-06|10102413|COUNTRY3|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102413|COUNTRY6|4|404|0|0|0|2 DAY|2022-01-06|2022-01-06|10102415|COUNTRY3|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102415|COUNTRY6|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102416|COUNTRY3|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102416|COUNTRY6|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102417|COUNTRY3|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102417|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102418|COUNTRY3|1|101|0|0|0|0.5 
DAY|2022-01-06|2022-01-06|10102418|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102419|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY3|6|606|0|0|0|3 MONTH|2022-01-01|2022-01-31|10102412|COUNTRY6|8|808|0|0|0|4 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY3|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102413|COUNTRY6|4|404|0|0|0|2 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102415|COUNTRY6|3|303|0|0|0|1.5 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY3|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102416|COUNTRY6|2|202|0|0|0|1 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY3|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102417|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY3|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102418|COUNTRY6|1|101|0|0|0|0.5 MONTH|2022-01-01|2022-01-31|10102419|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY3|6|606|0|0|0|3 QUARTER|2022-01-01|2022-03-31|10102412|COUNTRY6|8|808|0|0|0|4 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY3|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102413|COUNTRY6|4|404|0|0|0|2 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY3|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102415|COUNTRY6|3|303|0|0|0|1.5 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY3|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102416|COUNTRY6|2|202|0|0|0|1 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102417|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY3|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102418|COUNTRY6|1|101|0|0|0|0.5 QUARTER|2022-01-01|2022-03-31|10102419|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY3|6|606|0|0|0|3 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY6|8|808|0|0|0|4 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY3|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY6|4|404|0|0|0|2 
WEEK|2022-01-03|2022-01-09|10102415|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY6|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY6|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102419|COUNTRY6|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY3|6|606|0|0|0|3 YEAR|2022-01-01|2022-12-31|10102412|COUNTRY6|8|808|0|0|0|4 YEAR|2022-01-01|2022-12-31|10102413|COUNTRY3|3|303|0|0|0|1.5 YEAR|2022-01-01|2022-12-31|10102413|COUNTRY6|4|404|0|0|0|2 YEAR|2022-01-01|2022-12-31|10102415|COUNTRY3|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102415|COUNTRY6|3|303|0|0|0|1.5 YEAR|2022-01-01|2022-12-31|10102416|COUNTRY3|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102416|COUNTRY6|2|202|0|0|0|1 YEAR|2022-01-01|2022-12-31|10102417|COUNTRY3|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102417|COUNTRY6|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102418|COUNTRY3|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102418|COUNTRY6|1|101|0|0|0|0.5 YEAR|2022-01-01|2022-12-31|10102419|COUNTRY6|1|101|0|0|0|0.5 ================================================ FILE: tests/resources/feature/gab/control/data/vw_orders_filtered_snapshot.csv ================================================ cadence|order_date|to_date|sales_order_schedule|delivery_country_cod|orders|total_sales|orders_last_cad|orders_last_year|orders_avg_last_3_1|orders_derived DAY|2022-01-06|2022-01-06|10102413|COUNTRY6|4|404|0|0|0|2 DAY|2022-01-06|2022-01-06|10102419|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102416|COUNTRY3|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102413|COUNTRY3|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102418|COUNTRY6|1|101|0|0|0|0.5 
DAY|2022-01-06|2022-01-06|10102418|COUNTRY3|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102417|COUNTRY6|1|101|0|0|0|0.5 DAY|2022-01-07|2022-01-07|10102412|COUNTRY3|1|101|5|0|5|0.5 DAY|2022-01-06|2022-01-06|10102415|COUNTRY3|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102417|COUNTRY3|1|101|0|0|0|0.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY6|8|808|0|0|0|4 DAY|2022-01-06|2022-01-06|10102416|COUNTRY6|2|202|0|0|0|1 DAY|2022-01-06|2022-01-06|10102415|COUNTRY6|3|303|0|0|0|1.5 DAY|2022-01-06|2022-01-06|10102412|COUNTRY3|5|505|0|0|0|2.5 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102418|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY3|3|303|3|3|9|1.5 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY6|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-08|10102415|COUNTRY6|3|303|3|3|6|1.5 WEEK|2022-01-03|2022-01-07|10102412|COUNTRY6|8|808|8|8|8|4 WEEK|2022-01-03|2022-01-07|10102417|COUNTRY3|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-08|10102418|COUNTRY3|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-06|10102419|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-07|10102418|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-07|10102418|COUNTRY3|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-09|10102413|COUNTRY6|4|404|4|4|12|2 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY3|6|606|6|6|17|3 WEEK|2022-01-03|2022-01-06|10102415|COUNTRY6|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-06|10102418|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-06|10102415|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-06|10102413|COUNTRY6|4|404|0|0|0|2 WEEK|2022-01-03|2022-01-08|10102415|COUNTRY3|2|202|2|2|4|1 WEEK|2022-01-03|2022-01-08|10102416|COUNTRY3|2|202|2|2|4|1 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY3|5|505|0|0|0|2.5 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY6|3|303|3|3|9|1.5 WEEK|2022-01-03|2022-01-07|10102413|COUNTRY6|4|404|4|4|4|2 WEEK|2022-01-03|2022-01-06|10102417|COUNTRY6|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-08|10102419|COUNTRY6|1|101|1|1|2|0.5 
WEEK|2022-01-03|2022-01-07|10102412|COUNTRY3|6|606|5|5|5|3 WEEK|2022-01-03|2022-01-09|10102415|COUNTRY3|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-09|10102418|COUNTRY3|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-07|10102415|COUNTRY6|3|303|3|3|3|1.5 WEEK|2022-01-03|2022-01-08|10102413|COUNTRY6|4|404|4|4|8|2 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY3|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102417|COUNTRY3|1|101|0|0|0|0.5 WEEK|2022-01-03|2022-01-08|10102417|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-09|10102416|COUNTRY3|2|202|2|2|6|1 WEEK|2022-01-03|2022-01-06|10102416|COUNTRY3|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-07|10102413|COUNTRY3|3|303|3|3|3|1.5 WEEK|2022-01-03|2022-01-09|10102412|COUNTRY6|8|808|8|8|24|4 WEEK|2022-01-03|2022-01-08|10102416|COUNTRY6|2|202|2|2|4|1 WEEK|2022-01-03|2022-01-07|10102419|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-09|10102419|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102413|COUNTRY3|3|303|0|0|0|1.5 WEEK|2022-01-03|2022-01-06|10102416|COUNTRY6|2|202|0|0|0|1 WEEK|2022-01-03|2022-01-07|10102415|COUNTRY3|2|202|2|2|2|1 WEEK|2022-01-03|2022-01-07|10102417|COUNTRY6|1|101|1|1|1|0.5 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY3|6|606|6|6|11|3 WEEK|2022-01-03|2022-01-09|10102417|COUNTRY6|1|101|1|1|3|0.5 WEEK|2022-01-03|2022-01-06|10102412|COUNTRY6|8|808|0|0|0|4 WEEK|2022-01-03|2022-01-08|10102412|COUNTRY6|8|808|8|8|16|4 WEEK|2022-01-03|2022-01-07|10102416|COUNTRY6|2|202|2|2|2|1 WEEK|2022-01-03|2022-01-08|10102417|COUNTRY3|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-08|10102418|COUNTRY6|1|101|1|1|2|0.5 WEEK|2022-01-03|2022-01-08|10102413|COUNTRY3|3|303|3|3|6|1.5 WEEK|2022-01-03|2022-01-07|10102416|COUNTRY3|2|202|2|2|2|1 ================================================ FILE: tests/resources/feature/gab/control/schema/vw_dummy_sales_kpi.json ================================================ { "type": "struct", "fields": [ { "name":"cadence", "type":"string", "nullable": true, "metadata": {} }, { "name":"order_date", "type":"date", 
"nullable": true, "metadata": {} }, { "name":"to_date", "type":"date", "nullable": true, "metadata": {} }, { "name":"category_name", "type":"string", "nullable": true, "metadata": {} }, { "name":"qty_articles", "type":"double", "nullable": true, "metadata": {} }, { "name":"total_amount", "type":"double", "nullable": true, "metadata": {} }, { "name":"total_amount_last_year", "type":"double", "nullable": true, "metadata": {} }, { "name":"avg_total_amount_last_2_years", "type":"double", "nullable": true, "metadata": {} }, { "name":"discounted_total_amount", "type":"double", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/gab/control/schema/vw_orders.json ================================================ { "type": "struct", "fields": [ { "name": "cadence", "type": "string", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "date", "nullable": true, "metadata": {} }, { "name": "to_date", "type": "date", "nullable": true, "metadata": {} }, { "name": "sales_order_schedule", "type":"string", "nullable": true, "metadata": {} }, { "name": "delivery_country_cod", "type":"string", "nullable": true, "metadata": {} }, { "name": "orders", "type":"double", "nullable": true, "metadata": {} }, { "name": "total_sales", "type":"double", "nullable": true, "metadata": {} }, { "name": "orders_last_cad", "type":"double", "nullable": true, "metadata": {} }, { "name": "orders_last_year", "type":"double", "nullable": true, "metadata": {} }, { "name": "orders_avg_last_3_1", "type":"double", "nullable": true, "metadata": {} }, { "name": "orders_derived", "type":"double", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/gab/setup/column_list/calendar.json ================================================ { "calendar_date": "date", "day_en": "string", "weeknum_mon": "int", "weekstart_mon": "date", "weekend_mon": "date", "weekstart_sun": 
"date", "weekend_sun": "date", "month_start": "date", "month_end": "date", "quarter_start": "date", "quarter_end": "date", "year_start": "date", "year_end": "date" } ================================================ FILE: tests/resources/feature/gab/setup/column_list/dummy_sales_kpi.json ================================================ { "order_date": "date", "article_id": "string", "amount": "int" } ================================================ FILE: tests/resources/feature/gab/setup/column_list/gab_log_events.json ================================================ { "run_start_time": "timestamp", "run_end_time": "timestamp", "input_start_date": "timestamp", "input_end_date": "timestamp", "query_id": "string", "query_label": "string", "cadence": "string", "stage_name": "string", "stage_query": "string", "status": "string", "error_code": "string" } ================================================ FILE: tests/resources/feature/gab/setup/column_list/gab_use_case_results.json ================================================ { "query_id": "string", "cadence": "string", "from_date": "date", "to_date": "date", "d1": "string", "d2": "string", "d3": "string", "d4": "string", "d5": "string", "d6": "string", "d7": "string", "d8": "string", "d9": "string", "d10": "string", "d11": "string", "d12": "string", "d13": "string", "d14": "string", "d15": "string", "d16": "string", "d17": "string", "d18": "string", "d19": "string", "d20": "string", "d21": "string", "d22": "string", "d23": "string", "d24": "string", "d25": "string", "d26": "string", "d27": "string", "d28": "string", "d29": "string", "d30": "string", "d31": "string", "d32": "string", "d33": "string", "d34": "string", "d35": "string", "d36": "string", "d37": "string", "d38": "string", "d39": "string", "d40": "string", "m1": "double", "m2": "double", "m3": "double", "m4": "double", "m5": "double", "m6": "double", "m7": "double", "m8": "double", "m9": "double", "m10": "double", "m11": "double", "m12": "double", "m13": 
"double", "m14": "double", "m15": "double", "m16": "double", "m17": "double", "m18": "double", "m19": "double", "m20": "double", "m21": "double", "m22": "double", "m23": "double", "m24": "double", "m25": "double", "m26": "double", "m27": "double", "m28": "double", "m29": "double", "m30": "double", "m31": "double", "m32": "double", "m33": "double", "m34": "double", "m35": "double", "m36": "double", "m37": "double", "m38": "double", "m39": "double", "m40": "double", "lh_created_on": "timestamp" } ================================================ FILE: tests/resources/feature/gab/setup/column_list/lkp_query_builder.json ================================================ { "query_id": "int", "query_label": "string", "query_type": "string", "mappings": "string", "intermediate_stages": "string", "recon_window": "string", "timezone_offset": "int", "start_of_the_week": "string", "is_active": "string", "queue": "string", "lh_created_on": "timestamp" } ================================================ FILE: tests/resources/feature/gab/setup/column_list/order_events.json ================================================ { "request_timestamp": "string", "data_pack_id": "string", "record_number": "int", "update_mode": "string", "sales_order_header": "string", "sales_order_schedule": "string", "sales_order_item": "string", "orgsales_orgp": "string", "order_header_key": "string", "order_line_key": "string", "derived_order_header": "string", "derived_order_line_k": "string", "return_reason": "string", "reqmnt_category": "string", "delivery_status10": "string", "req_del_dt_item": "date", "reason_for_rejsize": "string", "invoice_item_price": "string", "id_of_the_customer": "string", "logistics_profit_ctr": "string", "material_availabilit": "date", "mso_store": "string", "name_of_orderer": "string", "overall_delivery_sta": "string", "overall_processing_s20": "string", "overall_processing_s21": "string", "coupon_code": "string", "org_grape_bapcx": "string", "cust_service_rep": "string", 
"customer_purchase_or25": "date", "delivery_country_cod": "string", "delivery_city_code": "string", "delivery_post_code": "string", "delivery_state_code": "string", "delivery_status30": "string", "ops_del_block_sohdr": "string", "ops_del_block_soscl": "string", "ecom_crm_id": "string", "conf_del_date_size": "date", "created_on": "date", "time": "string", "sales_doc_item_cat": "string", "shipping_campaign_id": "string", "shipping_coupon_code": "string", "shipping_city": "string", "shipping_postal_code": "string", "shp_promotion_code": "string", "size_grid": "string", "main_chan_frm_src": "string", "prctr_billing": "string", "prere_indfrm_src": "string", "reg__clr_from_src": "string", "update_flag": "string", "usage": "string", "so_header_usgindp": "string", "vas_customer_defined": "string", "adidas_group_article": "string", "billto_cust": "string", "requirement_type": "string", "shipto_cust__r2": "string", "soldto_cust_r2": "string", "sales_doc_category": "string", "product_division": "string", "promotion_code": "string", "sd_categ_precdoc": "string", "so_hdrpreceding_doc": "string", "so_itmpreceding_doc": "string", "so_scl_prec_doc": "string", "article__region__s": "string", "reference_1": "string", "mkt_place_order_num": "string", "sales_representative": "string", "subtotal_1_source": "decimal", "subtotal_2_source": "decimal", "subtotal_3_source": "decimal", "subtotal_4_source": "decimal", "subtotal_5_source": "decimal", "subtotal_6_source": "decimal", "grid_value": "string", "orgcompcodep": "string", "created_by": "string", "miscdistchcopap": "string", "document_currency": "string", "reason_for_order": "string", "opsplantp": "string", "sales_group": "string", "sales_office": "string", "sales_unit": "string", "storage_location": "string", "so_net_price_2": "decimal", "sales_order_net_valu": "decimal", "so_conf_qty": "decimal", "so_cum_order_qty": "decimal", "so_net_price": "decimal", "so_net_value": "decimal", "so_org_qty": "decimal", "so_conf_qty_actual": 
"decimal", "sales_order_qty": "decimal", "sales_odr_qty_actual": "decimal", "article_campaign_id": "string", "sales_document_type": "string", "order_date_header": "date", "billing_city": "string", "billing_postal_code": "string", "customer_po_time": "string", "customer_purchase_or101": "string", "overall_rej_status": "string", "changed_on": "date", "epoch_status": "string", "sales_order_canqty": "decimal", "epoch_entry_type": "string", "epoch_entry_by": "string", "epoch_order_type": "string", "epoch_line_type": "string", "omnihub_marketplace": "string", "confirmed_delivery_t": "string", "shipping_city_addres112": "string", "shipping_city_addres113": "string", "shipping_city_addres114": "string", "billing_city_address115": "string", "billing_city_address116": "string", "billing_city_address117": "string", "omnihub_seller_org": "string", "omnihub_locale_code": "string", "customer_po_type": "string", "omnihub_carrier_serv": "string", "qualifier": "string", "omnihub_document_typ": "string", "omnihub_return_code": "string", "refund_process_date": "date", "refund_process_time": "string", "omni_cancel_reason": "string", "sales_order_ecom_fre": "decimal", "omnihub_custom_order": "string", "vas_packing_type_so": "string", "vas_spl_ser_type_so": "string", "vas_tktlbl_type_so": "string", "exchange_flag": "string", "exchange_type": "string", "customer_po_timedw": "string", "cnc_store_id": "string", "last_hold__type": "string", "last_hold_released_t": "string", "last_hold_release_dt": "date", "dynamic_pricing_iden": "string", "dynamic_pricing_valu": "string", "dymamic_pricing_amnt": "decimal", "exchange_reason": "string", "omnihub_site_id": "string", "international_shipme": "string", "exchange_variant": "string", "secondary_article_ca": "string", "secondary_article_pr": "string", "secondary_coupon_cod": "string", "double_discount_flag": "string", "extraction_date": "string", "lhe_batch_id": "int", "lhe_row_id": "bigint", "source_update_date": "date", "source_update_time": 
"string" } ================================================ FILE: tests/resources/feature/gab/setup/data/dummy_sales_kpi.csv ================================================ order_date|article_id|amount 2017-02-15|article1|600 2017-02-15|article6|1000 2017-02-15|article2|2400 2017-02-15|article4|2000 2017-02-15|article5|4000 2017-04-30|article7|1400 2017-04-30|article2|1000 2017-04-30|article3|1600 2017-04-30|article1|600 2016-06-01|article2|4000 2016-06-01|article1|2000 2016-06-01|article3|1000 2017-05-10|article5|1600 2017-05-10|article6|3000 2017-05-10|article3|2000 2017-06-01|article4|2000 2017-06-01|article1|1000 2017-06-01|article2|1800 2018-07-11|article3|6 2018-07-11|article1|2 2018-07-01|article2|18 2018-07-01|article1|10 ================================================ FILE: tests/resources/feature/gab/setup/data/lkp_query_builder.csv ================================================ query_id|query_label|query_type|mappings|intermediate_stages|recon_window|timezone_offset|start_of_the_week|is_active|queue|lh_created_on 742783030|order_events|GLOBAL|{ 'vw_orders_all': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': {} }, 'vw_orders_filtered': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 
'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': 'd2 in ("COUNTRY6", "COUNTRY3")' } }|{'1': {'file_path': 'order_events/1_order_events.sql','table_alias': 'order_events_query','storage_level': 'MEMORY_ONLY','project_date_column': 'order_date_header','filter_date_column': 'order_date_header','repartition': {}}}|{'DAY': {}, 'WEEK': {'recon_window': {'DAY': {'snapshot': 'N'}}}, 'MONTH': {'recon_window': {'DAY': {'snapshot': 'N'}}}, 'QUARTER': {'recon_window': {'DAY': {'snapshot': 'N'}}}, 'YEAR': {'recon_window': {'DAY': {'snapshot': 'N'}}}}|0|SUNDAY|Y|Medium|2024-02-08T11:33:49.76Z 74776315|dummy_sales_kpi|GLOBAL|{ 'vw_dummy_sales_kpi': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'category_name' }, 'metric': { 'm1': { 'metric_name': 'qty_articles', 'calculated_metric': {}, 'derived_metric': {} }, 'm2': { 'metric_name': 'total_amount', 'calculated_metric': { 'last_cadence': [ { 'label': 'total_amount_last_year', 'window': '1' } ], 'window_function': [ { 'label': 'avg_total_amount_last_2_years', 'window': [ 2, 1 ], 'agg_func': 'avg' } ] }, 'derived_metric': [ { 'label': 'discounted_total_amount', 'formula': 'total_amount*0.56' } ] } }, 'filter': {} } }|{ '1': { 'file_path': 'dummy_sales_kpi/1_article_category.sql', 'table_alias': 'article_categories', 'storage_level': 'MEMORY_ONLY', 'project_date_column': '', 'filter_date_column': '', 'repartition': {} }, '2': { 'file_path': 'dummy_sales_kpi/2_dummy_sales_kpi.sql', 'table_alias': 'dummy_sales_kpi', 'storage_level': 'MEMORY_ONLY', 'project_date_column': 'order_date', 'filter_date_column': 'order_date', 'repartition': {} } }|{'YEAR': {}}|0|MONDAY|Y|Low|2024-03-07T15:38:52.922Z 
742783031|order_events_snapshot|GLOBAL|{ 'vw_orders_all_snapshot': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': {} }, 'vw_orders_filtered_snapshot': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': 'd2 in ("COUNTRY6", "COUNTRY3")' } }|{ '1': { 'file_path': 'order_events/1_order_events.sql', 'table_alias': 'order_events_query', 'storage_level': 'MEMORY_ONLY', 'project_date_column': 'order_date_header', 'filter_date_column': 'order_date_header', 'repartition': {} } }|{'DAY': {}, 'WEEK': {'recon_window': {'DAY': {'snapshot': 'Y'}}}, 'MONTH': {'recon_window': {'DAY': {'snapshot': 'N'}}}, 'QUARTER': {'recon_window': {'DAY': {'snapshot': 'N'}}}, 'YEAR': {'recon_window': {'DAY': {'snapshot': 'N'}}}}|0|SUNDAY|Y|Medium|2024-03-25T10:17:51.907Z 742783032|order_events_nam|NAM|{ 'vw_nam_orders_all_snapshot': { 'dimensions': { 'from_date': 'order_date', 'to_date': 
'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': {} }, 'vw_nam_orders_filtered_snapshot': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': 'd2 in ("COUNTRY6", "COUNTRY3")' } }|{ '1': { 'file_path': 'order_events/1_order_events.sql', 'table_alias': 'order_events_query', 'storage_level': 'MEMORY_ONLY', 'project_date_column': 'order_date_header', 'filter_date_column': 'order_date_header', 'repartition': {} } }|{'DAY': {}, 'WEEK': {'recon_window': {'DAY': {'snapshot': 'Y'}}}, 'MONTH': {'recon_window': {'DAY': {'snapshot': 'N'}}}, 'QUARTER': {'recon_window': {'DAY': {'snapshot': 'N'}}}, 'YEAR': {'recon_window': {'DAY': {'snapshot': 'N'}}}}|0|MONDAY|Y|Medium|2024-03-25T10:19:12.597Z 742783034|order_events_negative_timezone_offset|GLOBAL|{ 'vw_negative_offset_orders_all': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 
'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': {} }, 'vw_negative_offset_orders_filtered': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': 'd2 in ("COUNTRY6", "COUNTRY3")' } }|{ '1': { 'file_path': 'order_events/1_order_events.sql', 'table_alias': 'order_events_query', 'storage_level': 'MEMORY_ONLY', 'project_date_column': 'order_date_header', 'filter_date_column': 'order_date_header', 'repartition': {'numPartitions':3, 'keys':['order_date']} } }|{'WEEK': {'recon_window': {'DAY': {'snapshot': 'Y'}}}}|-3|MONDAY|Y|Medium|2024-03-25T10:20:27.992Z 742783035|order_events_empty_reconciliation_window|GLOBAL|{ 'vw_negative_offset_orders_all': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 
'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': {} }, 'vw_negative_offset_orders_filtered': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': 'd2 in ("COUNTRY6", "COUNTRY3")' } }|{ '1': { 'file_path': 'order_events/1_order_events.sql', 'table_alias': 'order_events_query', 'storage_level': 'MEMORY_ONLY', 'project_date_column': 'order_date_header', 'filter_date_column': 'order_date_header', 'repartition': {'numPartitions':3, 'keys':['order_date']} } }|{}|-3|MONDAY|Y|Medium|2024-03-25T10:20:27.992Z 742783036|order_events_unexisting_cadence|GLOBAL|{ 'vw_negative_offset_orders_all': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': {} }, 
'vw_negative_offset_orders_filtered': { 'dimensions': { 'from_date': 'order_date', 'to_date': 'to_date', 'd1': 'sales_order_schedule', 'd2': 'delivery_country_cod' }, 'metric': { 'm1': { 'metric_name': 'orders', 'calculated_metric': { 'last_cadence': [ { 'label': 'orders_last_cad', 'window': '1' } ], 'last_year_cadence': [ { 'label': 'orders_last_year', 'window': 1 } ], 'window_function': [ { 'label': 'orders_avg_last_3_1', 'window': [ 3, 1 ], 'agg_func': 'sum' } ] }, 'derived_metric': [ { 'label': 'orders_derived', 'formula': 'orders*0.5' } ] }, 'm2': { 'metric_name': 'total_sales', 'calculated_metric': {}, 'derived_metric': {} } }, 'filter': 'd2 in ("COUNTRY6", "COUNTRY3")' } }|{ '1': { 'file_path': 'order_events/1_order_events.sql', 'table_alias': 'order_events_query', 'storage_level': 'MEMORY_ONLY', 'project_date_column': 'order_date_header', 'filter_date_column': 'order_date_header', 'repartition': {'numPartitions':3, 'keys':['order_date']} } }|{'UNEXINSTING_CADENCE': {'recon_window': {'DAY': {'snapshot': 'Y'}}}}|-3|MONDAY|Y|Medium|2024-03-25T10:20:27.992Z ================================================ FILE: tests/resources/feature/gab/setup/data/order_events.csv ================================================ 
request_timestamp|data_pack_id|record_number|update_mode|sales_order_header|sales_order_schedule|sales_order_item|orgsales_orgp|order_header_key|order_line_key|derived_order_header|derived_order_line_k|return_reason|reqmnt_category|delivery_status10|req_del_dt_item|reason_for_rejsize|invoice_item_price|id_of_the_customer|logistics_profit_ctr|material_availabilit|mso_store|name_of_orderer|overall_delivery_sta|overall_processing_s20|overall_processing_s21|coupon_code|org_grape_bapcx|cust_service_rep|customer_purchase_or25|delivery_country_cod|delivery_city_code|delivery_post_code|delivery_state_code|delivery_status30|ops_del_block_sohdr|ops_del_block_soscl|ecom_crm_id|conf_del_date_size|created_on|time|sales_doc_item_cat|shipping_campaign_id|shipping_coupon_code|shipping_city|shipping_postal_code|shp_promotion_code|size_grid|main_chan_frm_src|prctr_billing|prere_indfrm_src|reg__clr_from_src|update_flag|usage|so_header_usgindp|vas_customer_defined|adidas_group_article|billto_cust|requirement_type|shipto_cust__r2|soldto_cust_r2|sales_doc_category|product_division|promotion_code|sd_categ_precdoc|so_hdrpreceding_doc|so_itmpreceding_doc|so_scl_prec_doc|article__region__s|reference_1|mkt_place_order_num|sales_representative|subtotal_1_source|subtotal_2_source|subtotal_3_source|subtotal_4_source|subtotal_5_source|subtotal_6_source|grid_value|orgcompcodep|created_by|miscdistchcopap|document_currency|reason_for_order|opsplantp|sales_group|sales_office|sales_unit|storage_location|so_net_price_2|sales_order_net_valu|so_conf_qty|so_cum_order_qty|so_net_price|so_net_value|so_org_qty|so_conf_qty_actual|sales_order_qty|sales_odr_qty_actual|article_campaign_id|sales_document_type|order_date_header|billing_city|billing_postal_code|customer_po_time|customer_purchase_or101|overall_rej_status|changed_on|epoch_status|sales_order_canqty|epoch_entry_type|epoch_entry_by|epoch_order_type|epoch_line_type|omnihub_marketplace|confirmed_delivery_t|shipping_city_addres112|shipping_city_addres113|s
hipping_city_addres114|billing_city_address115|billing_city_address116|billing_city_address117|omnihub_seller_org|omnihub_locale_code|customer_po_type|omnihub_carrier_serv|qualifier|omnihub_document_typ|omnihub_return_code|refund_process_date|refund_process_time|omni_cancel_reason|sales_order_ecom_fre|omnihub_custom_order|vas_packing_type_so|vas_spl_ser_type_so|vas_tktlbl_type_so|exchange_flag|exchange_type|customer_po_timedw|cnc_store_id|last_hold__type|last_hold_released_t|last_hold_release_dt|dynamic_pricing_iden|dynamic_pricing_valu|dymamic_pricing_amnt|exchange_reason|omnihub_site_id|international_shipme|exchange_variant|secondary_article_ca|secondary_article_pr|secondary_coupon_cod|double_discount_flag|extraction_date|lhe_batch_id|lhe_row_id|source_update_date|source_update_time null|null|null|null|VALUE1|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING1|null|AC|null|null|STRING1|2022-01-10|2022-01-06|81351101|VALUE1|null|null|CITY1|888420101|null|2300101|404|null|RER|CRC|XCX|null|null|null|STRING2|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING2|STRING1|STRING1|null|null|null|null|null|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|6500124.00|null|101.000|101.000|6500124.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY1|885201|8123401001|STRING1|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING1|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING1|null|null|null|null|null|124814001|null|STRING1|2140001|2022-01-06|null|null|null|STRING1|COMP1COUNTRY1|null|null|STRING1|STRING1|STRING1|No||2|1|null|null null|null|null|null|VALUE3|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING3|null|AC|null|null|STRING3|2022-01-11|2022-01-06|81351103|VALUE1|null|null|CITY3|888420103|null|2300104|404|null|RER|CRC|XCX|null|null|null|STRING4|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING4|STRING4|STRING4|null|null|15002.00|null|15002.00|null|null|230003|null|null|101|STRING1|null|null|null|null|PCP|101|6500126.00|null|101.000|101.000|6500126.00|85003.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY3|885203|8123401003|STRING4|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING3|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING3|null|null|null|null|null|124814003|null|STRING1|2140003|2022-01-06|null|null|null|STRING3|COMP1COUNTRY3|null|null|null|null|null|No||2|3|null|null null|null|null|null|VALUE4|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING4|METHOD1|AC|null|null|STRING4|2022-01-11|2022-01-06|81351104|VALUE1|null|null|CITY4|888420104|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING5|null|101|null|null|null|1001|null|null|null|null|null|STRING5|STRING5|STRING5|null|null|15003.00|null|15003.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85004.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY4|885204|8123401004|STRING5|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING4|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING4|null|null|null|null|null|124814004|null|STRING1|2140004|2022-01-06|null|null|null|STRING4|COMP1COUNTRY1|null|null|null|null|null|No||2|4|null|null 
null|null|null|null|VALUE7|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE3|null|null|2022-01-05|COUNTRY1|null|STRING7|null|AC|null|null|STRING7|2022-01-11|2022-01-06|81351107|VALUE1|null|null|CITY7|888420107|null|2300110|404|null|RER|RCR|XCX|null|null|null|STRING10|null|101|null|null|null|1001|STRING4|null|null|null|null|STRING10|STRING10|STRING10|null|null|null|null|null|null|null|230007|null|null|101|STRING1|null|null|null|null|PCP|101|6500123.00|null|101.000|101.000|6500123.00|null|101.000|101.000|101.000|101.000|STRING4|STRING1|2022-01-06|CITY7|885207|8123401007|STRING10|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING7|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING7|null|null|null|null|null|124814007|null|STRING1|2140007|2022-01-06|null|null|null|STRING7|COMP1COUNTRY1|null|null|null|null|null|No||2|9|null|null null|null|null|null|VALUE8|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE4|null|null|2022-01-05|COUNTRY1|null|STRING8|null|AC|null|null|STRING8|2022-01-13|2022-01-06|81351108|VALUE1|null|null|CITY8|888420108|null|2300111|404|null|RER|RCR|XCX|null|null|STRING1|STRING11|null|101|null|null|null|1001|STRING5|null|null|null|null|STRING11|STRING11|STRING11|null|null|15004.00|null|15004.00|null|null|230008|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85009.00|101.000|101.000|101.000|101.000|STRING5|STRING1|2022-01-06|CITY8|885208|8123401008|STRING11|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE2|STRING8|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING8|null|EFL|null|null|null|124814008|null|STRING1|2140008|2022-01-06|null|null|null|STRING8|COMP1COUNTRY1|null|null|null|null|null|No||2|10|null|null null|null|null|null|VALUE8|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE4|null|null|2022-01-05|COUNTRY1|null|STRING8|null|AC|null|null|STRING8|2022-01-13|2022-01-06|81351108|VALUE1|null|null|CITY8|888420108|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING12|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING12|STRING11|STRING11|null|null|15005.00|null|15005.00|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500131.00|null|101.000|101.000|6500131.00|85010.00|101.000|101.000|101.000|101.000|STRING5|STRING1|2022-01-06|CITY8|885208|8123401008|STRING11|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING8|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING8|null|null|null|null|null|124814008|null|STRING1|2140008|2022-01-06|null|null|null|STRING8|COMP1COUNTRY1|null|null|null|null|null|No||2|11|null|null 
null|null|null|null|VALUE8|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE4|null|null|2022-01-05|COUNTRY1|null|STRING8|null|AC|null|null|STRING8|2022-01-13|2022-01-06|81351108|VALUE1|null|null|CITY8|888420108|null|2300113|404|null|RER|RCR|XCX|null|null|null|STRING13|null|101|null|null|null|1003|STRING5|null|null|null|null|STRING13|STRING11|STRING11|null|null|15006.00|null|15006.00|null|null|230010|null|null|101|STRING1|null|null|null|null|PCP|101|6500132.00|null|102.000|102.000|6500132.00|85011.00|102.000|102.000|102.000|102.000|STRING5|STRING1|2022-01-06|CITY8|885208|8123401008|STRING11|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING8|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING8|null|null|null|null|null|124814008|null|STRING1|2140008|2022-01-06|null|null|null|STRING8|COMP1COUNTRY1|null|null|null|null|null|No||2|12|null|null null|null|null|null|VALUE9|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING9|null|AC|null|null|STRING9|2022-01-11|2022-01-06|81351109|VALUE1|null|null|CITY9|888420109|null|2300114|404|null|RER|RCR|XCX|null|null|null|STRING14|null|101|null|null|null|1002|null|null|null|null|null|STRING14|STRING14|STRING14|null|null|15003.00|null|15003.00|null|null|230011|null|null|101|STRING1|null|null|null|null|PCP|101|6500133.00|null|101.000|101.000|6500133.00|85012.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY9|885209|8123401009|STRING14|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING9|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING9|null|null|null|null|null|124814009|null|STRING1|2140009|2022-01-06|null|null|null|STRING9|COMP1COUNTRY1|null|null|null|null|null|No||2|13|null|null null|null|null|null|VALUE10|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE5|null|null|2022-01-05|COUNTRY1|null|STRING10|null|AC|null|null|STRING10|2022-01-13|2022-01-06|81351110|VALUE1|null|null|CITY10|888420110|null|2300115|404|null|RER|RCR|XCX|null|null|STRING2|STRING15|null|101|null|null|null|1001|STRING5|null|null|null|null|STRING15|STRING15|STRING15|null|null|15007.00|null|15007.00|null|null|230012|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|85004.00|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-06|CITY10|885210|8123401010|STRING15|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE2|STRING10|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING10|null|EFL|null|null|null|124814010|null|STRING1|2140010|2022-01-06|null|null|null|STRING10|COMP1COUNTRY1|null|null|null|null|null|No||2|14|null|null 
null|null|null|null|VALUE11|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY4|null|STRING11|null|AC|null|null|STRING11|2022-01-11|2022-01-06|81351111|VALUE1|null|null|CITY11|888420111|null|2300116|404|null|RER|CRC|XCX|null|null|null|STRING16|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING16|STRING16|STRING16|null|null|15008.00|null|15008.00|null|null|230013|null|null|101|STRING1|null|null|null|null|PCP|101|6500134.00|null|101.000|101.000|6500134.00|85013.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY11|885211|8123401011|STRING16|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING11|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING11|null|null|null|null|null|124814011|null|STRING1|2140011|2022-01-06|null|null|null|STRING11|COMP1COUNTRY4|null|null|null|null|null|No||2|15|null|null null|null|null|null|VALUE13|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING13|null|AC|null|null|STRING13|2022-01-11|2022-01-06|81351113|VALUE1|null|null|CITY13|888420113|null|2300103|404|null|RER|CRC|XCX|null|null|null|STRING18|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING18|STRING18|STRING18|null|null|15010.00|null|15010.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85015.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY13|885213|8123401013|STRING18|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING13|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING13|null|null|null|null|null|124814013|null|STRING1|2140013|2022-01-06|null|null|null|STRING13|COMP1COUNTRY1|null|null|null|null|null|No||2|17|null|null null|null|null|null|VALUE14|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING14|null|AC|null|null|STRING14|2022-01-11|2022-01-06|81351114|VALUE1|null|null|CITY14|888420114|null|2300103|404|null|RER|RCR|XCX|null|null|null|STRING19|null|101|null|null|null|1001|null|null|null|null|null|STRING19|STRING19|STRING19|null|null|15003.00|null|15003.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500123.00|null|101.000|101.000|6500123.00|85016.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY14|885214|8123401014|STRING19|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING14|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING14|null|null|null|null|null|124814014|null|STRING1|2140014|2022-01-06|null|null|null|STRING14|COMP1COUNTRY1|null|null|null|null|null|No||2|18|null|null 
null|null|null|null|VALUE15|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-14|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY4|null|STRING15|null|AA|null|null|STRING15|2022-01-14|2022-01-06|81351115|VALUE1|null|null|CITY15|888420115|null|2300120|404|null|RER|RCR|XCX|null|null|STRING3|STRING20|null|101|null|null|null|1002|null|null|null|null|null|STRING20|STRING20|STRING20|null|null|15003.00|null|null|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|85009.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY15|885215|8123401015|STRING20|null|2022-01-06|STRING4|10020.000|STRING3|STORE1|STRING1|TYPE2|STRING15|10240403|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING1|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING15|null|EFL|null|null|null|124814015|null|null|null|null|null|null|null|STRING15|COMP1COUNTRY4|null|null|null|null|null|No||2|19|null|null null|null|null|null|VALUE16|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AA|2022-01-15|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY5|null|STRING16|null|AA|null|null|STRING16|null|2022-01-06|81351116|VALUE1|null|null|CITY16|888420116|null|2300121|404|null|RER|CRC|XCX|null|null|null|STRING21|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING21|STRING21|STRING21|null|null|null|15003.00|null|null|null|230016|null|null|101|STRING2|null|null|null|null|PCP|101|6500124.00|null|101.000|101.000|6500124.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY16|885216|8123401016|STRING21|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING16|10240404|null|null|null|null|null|null|COMPANY1|COUNTRYAB5|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING16|null|null|null|null|null|124814016|null|null|null|null|null|null|null|STRING16|COMP1COUNTRY5|null|null|STRING2|STRING1|null|Yes||2|20|null|null null|null|null|null|VALUE16|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-15|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY5|null|STRING16|null|AA|null|null|STRING16|null|2022-01-06|81351116|VALUE1|null|null|CITY16|888420116|null|2300105|404|null|RER|CRC|XCX|null|null|null|STRING22|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING22|STRING21|STRING21|null|null|null|15003.00|null|null|null|230004|null|null|101|STRING2|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY16|885216|8123401016|STRING21|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING16|10240404|null|null|null|null|null|null|COMPANY1|COUNTRYAB5|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING16|null|null|null|null|null|124814016|null|null|null|null|null|null|null|STRING16|COMP1COUNTRY5|null|null|STRING2|STRING1|null|Yes||2|21|null|null 
null|null|null|null|VALUE17|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING17|null|AC|null|null|STRING17|2022-01-11|2022-01-06|81351117|VALUE1|null|null|CITY17|888420117|null|2300123|404|null|RER|CRC|XCX|null|null|null|STRING23|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING23|STRING23|STRING23|null|null|15010.00|null|15010.00|null|null|230017|null|null|101|STRING1|null|null|null|null|PCP|101|6500136.00|null|101.000|101.000|6500136.00|85008.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY17|885217|8123401017|STRING23|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING17|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING17|null|null|null|null|null|124814017|null|STRING1|2140016|2022-01-06|null|null|null|STRING17|COMP1COUNTRY1|null|null|null|null|null|No||2|22|null|null null|null|null|null|VALUE18|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING18|null|AC|null|null|STRING18|2022-01-11|2022-01-06|81351118|VALUE1|null|null|CITY18|888420118|null|2300114|404|null|RER|CRC|XCX|null|null|null|STRING24|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING24|STRING24|STRING24|null|null|null|null|null|null|null|230011|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY18|885218|8123401018|STRING24|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING18|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING18|null|null|null|null|null|124814018|null|STRING1|2140017|2022-01-06|null|null|null|STRING18|COMP1COUNTRY1|null|null|null|null|null|No||2|23|null|null null|null|null|null|VALUE18|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING18|null|AC|null|null|STRING18|2022-01-11|2022-01-06|81351118|VALUE1|null|null|CITY18|888420118|null|2300114|404|null|RER|RCR|XCX|null|null|null|STRING14|null|101|null|null|null|1002|null|null|null|null|null|STRING14|STRING24|STRING24|null|null|15003.00|null|15003.00|null|null|230011|null|null|101|STRING1|null|null|null|null|PCP|101|6500133.00|null|101.000|101.000|6500133.00|85012.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY18|885218|8123401018|STRING24|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING18|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING18|null|null|null|null|null|124814018|null|STRING1|2140017|2022-01-06|null|null|null|STRING18|COMP1COUNTRY1|null|null|null|null|null|No||2|24|null|null 
null|null|null|null|VALUE19|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING19|null|AC|null|null|STRING19|2022-01-10|2022-01-06|81351119|VALUE1|null|null|CITY19|888420119|null|2300103|404|null|RER|CRC|XCX|null|null|null|STRING25|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING25|STRING26|STRING26|null|null|15008.00|null|15008.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500134.00|null|101.000|101.000|6500134.00|85013.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY19|885219|8123401019|STRING26|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING19|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING19|null|null|null|null|null|124814019|null|STRING1|2140018|2022-01-06|null|null|null|STRING19|COMP1COUNTRY1|null|null|null|null|null|No||2|25|null|null null|null|null|null|VALUE20|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING20|null|AC|null|null|STRING20|2022-01-11|2022-01-06|81351120|VALUE1|null|null|CITY20|888420120|null|2300107|404|null|RER|RCR|XCX|null|null|null|STRING26|null|101|null|null|null|1001|null|null|null|null|null|STRING26|STRING27|STRING27|null|null|15003.00|null|15003.00|null|null|230005|null|null|101|STRING1|null|null|null|null|PCP|101|6500136.00|null|101.000|101.000|6500136.00|85017.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY20|885220|8123401020|STRING27|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING20|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING20|null|null|null|null|null|124814020|null|STRING1|2140019|2022-01-06|null|null|null|STRING20|COMP1COUNTRY1|null|null|null|null|null|No||2|26|null|null null|null|null|null|VALUE21|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING21|null|AC|null|null|STRING21|2022-01-11|2022-01-06|81351121|VALUE1|null|null|CITY21|888420121|null|2300128|404|null|RER|CRC|XCX|null|null|null|STRING27|null|101|null|null|null|1003|STRING2|null|null|null|null|STRING27|STRING28|STRING28|null|null|null|null|null|null|null|230018|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY21|885221|8123401021|STRING28|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING21|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING21|null|null|null|null|null|124814021|null|STRING1|2140010|2022-01-06|null|null|null|STRING21|COMP1COUNTRY1|null|null|null|null|null|No||2|27|null|null 
null|null|null|null|VALUE22|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING22|null|AC|null|null|STRING22|2022-01-11|2022-01-06|81351122|VALUE1|null|null|CITY22|888420122|null|2300129|404|null|RER|RCR|XCX|null|null|null|STRING28|null|101|null|null|null|1002|null|null|null|null|null|STRING28|STRING29|STRING29|null|null|15003.00|null|15003.00|null|null|230019|null|null|101|STRING1|null|null|null|null|PCP|101|6500137.00|null|101.000|101.000|6500137.00|85018.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY22|885222|8123401022|STRING29|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING22|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING22|null|null|null|null|null|124814022|null|STRING1|2140011|2022-01-06|null|null|null|STRING22|COMP1COUNTRY1|null|null|null|null|null|No||2|28|null|null null|null|null|null|VALUE23|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING23|null|AC|null|null|STRING23|2022-01-13|2022-01-06|81351123|VALUE1|null|null|CITY23|888420123|null|2300117|404|null|RER|RCR|XCX|null|null|null|STRING29|null|101|null|null|null|1002|null|null|null|null|null|STRING29|STRING30|STRING30|null|null|15003.00|null|15003.00|null|null|230014|null|null|101|STRING1|null|null|null|null|PCP|101|6500124.00|null|101.000|101.000|6500124.00|85019.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY23|885223|8123401023|STRING30|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING23|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING23|null|null|null|null|null|124814023|null|STRING1|2140020|2022-01-06|null|null|null|STRING23|COMP1COUNTRY6|null|null|null|null|null|No||2|29|null|null null|null|null|null|VALUE23|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING23|null|AC|null|null|STRING23|2022-01-13|2022-01-06|81351123|VALUE1|null|null|CITY23|888420123|null|2300117|404|null|RER|RCR|XCX|null|null|null|STRING30|null|101|null|null|null|1002|null|null|null|null|null|STRING30|STRING30|STRING30|null|null|15003.00|null|15003.00|null|null|230014|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85004.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY23|885223|8123401023|STRING30|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING23|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING23|null|null|null|null|null|124814023|null|STRING1|2140020|2022-01-06|null|null|null|STRING23|COMP1COUNTRY6|null|null|null|null|null|No||2|30|null|null 
null|null|null|null|VALUE23|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING23|null|AC|null|null|STRING23|2022-01-13|2022-01-06|81351123|VALUE1|null|null|CITY23|888420123|null|2300117|404|null|RER|RCR|XCX|null|null|null|STRING31|null|101|null|null|null|1002|null|null|null|null|null|STRING31|STRING30|STRING30|null|null|15003.00|null|15003.00|null|null|230014|null|null|101|STRING1|null|null|null|null|PCP|101|6500137.00|null|101.000|101.000|6500137.00|85018.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY23|885223|8123401023|STRING30|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING23|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING23|null|null|null|null|null|124814023|null|STRING1|2140020|2022-01-06|null|null|null|STRING23|COMP1COUNTRY6|null|null|null|null|null|No||2|31|null|null null|null|null|null|VALUE23|10102416|10102416|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING23|null|AC|null|null|STRING23|2022-01-13|2022-01-06|81351123|VALUE1|null|null|CITY23|888420123|null|2300133|404|null|RER|RCR|XCX|null|null|null|STRING32|null|101|null|null|null|1001|null|null|null|null|null|STRING32|STRING30|STRING30|null|null|15003.00|null|15003.00|null|null|230020|null|null|101|STRING1|null|null|null|null|PCP|101|6500124.00|null|101.000|101.000|6500124.00|85019.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY23|885223|8123401023|STRING30|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING23|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING23|null|null|null|null|null|124814023|null|STRING1|2140020|2022-01-06|null|null|null|STRING23|COMP1COUNTRY6|null|null|null|null|null|No||2|32|null|null null|null|null|null|VALUE24|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE7|null|null|2022-01-05|COUNTRY1|null|STRING24|null|AC|null|null|STRING24|2022-01-13|2022-01-06|81351124|VALUE1|null|null|CITY24|888420124|null|2300134|404|null|RER|RCR|XCX|null|null|null|STRING33|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING33|STRING34|STRING34|null|null|15011.00|null|15011.00|null|null|230021|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85016.00|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY24|885224|8123401024|STRING34|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING24|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING24|null|null|null|null|null|124814024|null|STRING1|2140021|2022-01-06|null|null|null|STRING24|COMP1COUNTRY1|null|null|null|null|null|No||2|33|null|null 
null|null|null|null|VALUE25|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING25|null|AC|null|null|STRING25|2022-01-11|2022-01-06|81351125|VALUE1|null|null|CITY25|888420125|null|2300135|404|null|RER|CRC|XCX|null|null|null|STRING34|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING34|STRING35|STRING35|null|null|15012.00|null|15012.00|null|null|230022|null|null|101|STRING1|null|null|null|null|PCP|101|6500138.00|null|101.000|101.000|6500138.00|85020.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY25|885225|8123401025|STRING35|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING25|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING25|null|null|null|null|null|124814025|null|STRING1|2140022|2022-01-06|null|null|null|STRING25|COMP1COUNTRY2|null|null|null|null|null|No||2|34|null|null null|null|null|null|VALUE25|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING25|null|AC|null|null|STRING25|2022-01-11|2022-01-06|81351125|VALUE1|null|null|CITY25|888420125|null|2300103|404|null|RER|CRC|XCX|null|null|null|STRING35|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING35|STRING35|STRING35|null|null|15012.00|null|15012.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500138.00|null|101.000|101.000|6500138.00|85020.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY25|885225|8123401025|STRING35|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING25|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING25|null|null|null|null|null|124814025|null|STRING1|2140022|2022-01-06|null|null|null|STRING25|COMP1COUNTRY2|null|null|null|null|null|No||2|35|null|null null|null|null|null|VALUE25|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING25|null|AC|null|null|STRING25|2022-01-11|2022-01-06|81351125|VALUE1|null|null|CITY25|888420125|null|2300107|404|null|RER|CRC|XCX|null|null|null|STRING36|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING36|STRING35|STRING35|null|null|15013.00|null|15013.00|null|null|230005|null|null|101|STRING1|null|null|null|null|PCP|101|6500138.00|null|101.000|101.000|6500138.00|85021.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY25|885225|8123401025|STRING35|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING25|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING25|null|null|null|null|null|124814025|null|STRING1|2140022|2022-01-06|null|null|null|STRING25|COMP1COUNTRY2|null|null|null|null|null|No||2|36|null|null 
null|null|null|null|VALUE25|10102416|10102416|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING25|null|AC|null|null|STRING25|2022-01-11|2022-01-06|81351125|VALUE1|null|null|CITY25|888420125|null|2300138|404|null|RER|CRC|XCX|null|null|null|STRING37|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING37|STRING35|STRING35|null|null|15013.00|null|15013.00|null|null|230023|null|null|101|STRING1|null|null|null|null|PCP|101|6500138.00|null|101.000|101.000|6500138.00|85021.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY25|885225|8123401025|STRING35|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING25|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING25|null|null|null|null|null|124814025|null|STRING1|2140022|2022-01-06|null|null|null|STRING25|COMP1COUNTRY2|null|null|null|null|null|No||2|37|null|null null|null|null|null|VALUE25|10102417|10102417|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING25|null|AC|null|null|STRING25|2022-01-11|2022-01-06|81351125|VALUE1|null|null|CITY25|888420125|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING38|null|101|null|null|null|1002|null|null|null|null|null|STRING38|STRING35|STRING35|null|null|15003.00|null|15003.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85004.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY25|885225|8123401025|STRING35|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING25|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING25|null|null|null|null|null|124814025|null|STRING1|2140022|2022-01-06|null|null|null|STRING25|COMP1COUNTRY2|null|null|null|null|null|No||2|38|null|null null|null|null|null|VALUE26|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY7|null|STRING26|null|AC|null|null|STRING26|2022-01-11|2022-01-06|81351126|VALUE1|null|null|CITY26|888420126|null|2300140|404|null|RER|RCR|XCX|null|null|null|STRING39|null|101|null|null|null|1002|null|null|null|null|null|STRING39|STRING40|STRING40|null|null|15003.00|null|15003.00|null|null|230025|null|null|101|STRING3|null|null|null|null|PCP|101|6500139.00|null|101.000|101.000|6500139.00|85022.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY26|885226|8123401026|STRING40|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING26|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB7|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING26|null|null|null|null|null|124814026|null|STRING1|2140023|2022-01-06|null|null|null|STRING26|COMP1COUNTRY7|null|null|null|null|null|No||2|39|null|null 
null|null|null|null|VALUE27|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE8|null|null|2022-01-05|COUNTRY1|null|STRING27|null|AC|null|null|STRING27|2022-01-10|2022-01-06|81351127|VALUE1|null|null|CITY27|888420127|null|2300114|404|null|RER|RCR|XCX|null|null|null|STRING40|null|101|null|null|null|1002|STRING1|null|null|null|null|STRING40|STRING41|STRING41|null|null|15014.00|null|15014.00|null|null|230011|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85023.00|101.000|101.000|101.000|101.000|STRING1|STRING1|2022-01-06|CITY27|885227|8123401027|STRING41|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING27|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING27|null|null|null|null|null|124814027|null|STRING1|2140024|2022-01-06|null|null|null|STRING27|COMP1COUNTRY1|null|null|null|null|null|No||2|40|null|null null|null|null|null|VALUE27|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE8|null|null|2022-01-05|COUNTRY1|null|STRING27|null|AC|null|null|STRING27|2022-01-10|2022-01-06|81351127|VALUE1|null|null|CITY27|888420127|null|2300142|404|null|RER|RCR|XCX|null|null|null|STRING41|null|101|null|null|null|1002|STRING1|null|null|null|null|STRING41|STRING41|STRING41|null|null|15014.00|null|15014.00|null|null|230026|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85023.00|101.000|101.000|101.000|101.000|STRING1|STRING1|2022-01-06|CITY27|885227|8123401027|STRING41|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING27|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING27|null|null|null|null|null|124814027|null|STRING1|2140024|2022-01-06|null|null|null|STRING27|COMP1COUNTRY1|null|null|null|null|null|No||2|41|null|null null|null|null|null|VALUE27|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING27|null|AC|null|null|STRING27|2022-01-10|2022-01-06|81351127|VALUE1|null|null|CITY27|888420127|null|2300143|404|null|RER|CRC|XCX|null|null|null|STRING42|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING42|STRING41|STRING41|null|null|null|null|null|null|null|230027|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY27|885227|8123401027|STRING41|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING27|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING27|null|null|null|null|null|124814027|null|STRING1|2140024|2022-01-06|null|null|null|STRING27|COMP1COUNTRY1|null|null|STRING1|STRING1|STRING2|Yes||2|42|null|null 
null|null|null|null|VALUE28|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE9|null|null|2022-01-05|COUNTRY1|null|STRING28|null|AC|null|null|STRING28|2022-01-11|2022-01-06|81351128|VALUE1|null|null|CITY20|888420128|null|2300101|404|null|RER|RCR|XCX|null|null|null|STRING43|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING43|STRING44|STRING44|null|null|null|null|null|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|null|101.000|101.000|101.000|101.000|STRING8|STRING1|2022-01-06|CITY20|885228|8123401028|STRING44|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING28|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING28|null|null|null|null|null|124814028|null|STRING1|2140025|2022-01-06|null|null|null|STRING28|COMP1COUNTRY1|null|null|null|null|null|No||2|43|null|null null|null|null|null|VALUE28|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING28|null|AC|null|null|STRING28|2022-01-11|2022-01-06|81351128|VALUE1|null|null|CITY20|888420128|null|2300116|404|null|RER|CRC|XCX|null|null|null|STRING44|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING44|STRING44|STRING44|null|null|null|null|null|null|null|230013|null|null|101|STRING1|null|null|null|null|PCP|101|6500136.00|null|101.000|101.000|6500136.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY20|885228|8123401028|STRING44|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING28|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING28|null|null|null|null|null|124814028|null|STRING1|2140025|2022-01-06|null|null|null|STRING28|COMP1COUNTRY1|null|null|STRING3|STRING2|STRING3|Yes||2|44|null|null null|null|null|null|VALUE29|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING29|METHOD2|AC|null|null|STRING29|2022-01-11|2022-01-06|81351129|VALUE1|null|null|CITY28|888420129|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING45|null|101|null|null|null|1001|null|null|null|null|null|STRING45|STRING46|STRING46|null|null|15003.00|null|15003.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85024.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY28|885229|8123401029|STRING46|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING29|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING29|null|null|null|null|null|124814029|null|STRING1|2140026|2022-01-06|null|null|null|STRING29|COMP1COUNTRY1|null|null|null|null|null|No||2|45|null|null 
null|null|null|null|VALUE30|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY4|null|STRING30|null|AC|null|null|STRING30|2022-01-11|2022-01-06|81351130|VALUE1|null|null|CITY29|888420130|null|2300103|404|null|RER|RCR|XCX|null|null|null|STRING46|null|101|null|null|null|1001|null|null|null|null|null|STRING46|STRING47|STRING47|null|null|15003.00|null|15003.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY29|885230|8123401030|STRING47|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING30|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING30|null|null|null|null|null|124814030|null|STRING1|2140027|2022-01-06|null|null|null|STRING30|COMP1COUNTRY4|null|null|null|null|null|No||2|46|null|null null|null|null|null|VALUE31|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AA|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY4|null|STRING31|null|AA|null|null|STRING31|2022-01-13|2022-01-06|81351131|VALUE1|null|null|CITY11|888420131|null|2300116|404|null|RER|CRC|XCX|null|null|null|STRING47|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING47|STRING48|STRING48|null|null|null|null|null|null|null|230013|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY11|885231|8123401031|STRING48|null|2022-01-06|STRING4|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING31|10240405|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING31|null|null|null|null|null|124814031|null|null|null|null|null|null|null|STRING31|COMP1COUNTRY4|null|null|STRING4|STRING1|null|Yes||2|47|null|null null|null|null|null|VALUE32|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING32|null|AC|null|null|STRING32|2022-01-11|2022-01-06|81351132|VALUE1|null|null|CITY30|888420132|null|2300103|404|null|RER|CRC|XCX|null|null|null|STRING48|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING48|STRING49|STRING49|null|null|15015.00|null|15015.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500141.00|null|101.000|101.000|6500141.00|85026.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY30|885232|8123401032|STRING49|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING32|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING32|null|null|null|null|null|124814032|null|STRING1|2140008|2022-01-06|null|null|null|STRING32|COMP1COUNTRY2|null|null|null|null|null|No||2|48|null|null 
null|null|null|null|VALUE32|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING32|null|AC|null|null|STRING32|2022-01-11|2022-01-06|81351132|VALUE1|null|null|CITY30|888420132|null|2300103|404|null|RER|RCR|XCX|null|null|null|STRING49|null|101|null|null|null|1001|null|null|null|null|null|STRING49|STRING49|STRING49|null|null|15003.00|null|15003.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY30|885232|8123401032|STRING49|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING32|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING32|null|null|null|null|null|124814032|null|STRING1|2140008|2022-01-06|null|null|null|STRING32|COMP1COUNTRY2|null|null|null|null|null|No||2|49|null|null null|null|null|null|VALUE33|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE10|null|null|2022-01-05|COUNTRY1|null|STRING33|null|AC|null|null|STRING33|2022-01-13|2022-01-06|81351133|VALUE1|null|null|CITY31|888420133|null|2300151|404|null|RER|RCR|XCX|null|null|null|STRING50|null|101|null|null|null|1001|STRING5|null|null|null|null|STRING50|STRING51|STRING51|null|null|15016.00|null|15016.00|null|null|230028|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85028.00|101.000|101.000|101.000|101.000|STRING9|STRING1|2022-01-06|CITY31|885233|8123401033|STRING51|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING33|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING33|null|null|null|null|null|124814033|null|STRING1|2140028|2022-01-06|null|null|null|STRING33|COMP1COUNTRY1|null|null|null|null|null|No||2|50|null|null null|null|null|null|VALUE33|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE10|null|null|2022-01-05|COUNTRY1|null|STRING33|null|AC|null|null|STRING33|2022-01-13|2022-01-06|81351133|VALUE1|null|null|CITY31|888420133|null|2300151|404|null|RER|RCR|XCX|null|null|null|STRING51|null|101|null|null|null|1001|STRING5|null|null|null|null|STRING51|STRING51|STRING51|null|null|15016.00|null|15016.00|null|null|230028|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85028.00|101.000|101.000|101.000|101.000|STRING9|STRING1|2022-01-06|CITY31|885233|8123401033|STRING51|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING33|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING33|null|null|null|null|null|124814033|null|STRING1|2140028|2022-01-06|null|null|null|STRING33|COMP1COUNTRY1|null|null|null|null|null|No||2|51|null|null 
null|null|null|null|VALUE34|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE11|null|null|2022-01-05|COUNTRY1|null|STRING34|null|AC|null|null|STRING34|2022-01-11|2022-01-06|81351134|VALUE1|null|null|CITY20|888420134|null|2300104|404|null|RER|RCR|XCX|null|null|null|STRING52|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING52|STRING53|STRING53|null|null|15011.00|null|15011.00|null|null|230003|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85016.00|101.000|101.000|101.000|101.000|STRING10|STRING1|2022-01-06|CITY20|885234|8123401034|STRING53|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING34|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING34|null|null|null|null|null|124814034|null|STRING1|2140029|2022-01-06|null|null|null|STRING34|COMP1COUNTRY1|null|null|null|null|null|No||2|52|null|null null|null|null|null|VALUE35|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE12|null|null|2022-01-05|COUNTRY8|null|STRING35|null|AC|null|null|STRING35|2022-01-11|2022-01-06|81351135|VALUE1|null|null|CITY32|888420135|null|2300111|404|null|RER|RCR|XCX|null|null|null|STRING53|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING53|STRING54|STRING54|null|null|null|null|null|null|null|230008|null|null|101|STRING4|null|null|null|null|PCP|101|6500143.00|null|101.000|101.000|6500143.00|null|101.000|101.000|101.000|101.000|STRING11|STRING1|2022-01-06|CITY32|885235|8123401035|STRING54|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING35|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB8|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING35|null|null|null|null|null|124814035|null|STRING1|2140030|2022-01-06|null|null|null|STRING35|COMP1COUNTRY8|null|null|null|null|null|No||2|53|null|null null|null|null|null|VALUE36|10102412|10102412|null|3,02E+25|3,02E+25|null|null|XXX|null|AA|2022-01-15|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY7|null|STRING36|null|AA|null|null|STRING36|null|2022-01-06|81351136|VALUE1|null|null|CITY33|888420136|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING54|null|101|null|null|null|1001|STRING7|null|6024050701|null|null|STRING54|STRING55|STRING55|null|null|null|null|null|null|null|230004|null|null|101|STRING3|null|null|null|null|PCP|101|6500144.00|null|101.000|101.000|6500144.00|null|101.000|101.000|101.000|101.000|STRING12|STRING1|2022-01-06|CITY33|885236|8123401036|STRING55|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING36|10240406|null|null|null|null|null|null|COMPANY2|COUNTRYAB7|STRING3|STRING3|1923002|1923001|123051|null|null|null|10349200.00|STRING36|null|null|null|EOD|STRING1|124814036|null|null|null|null|null|null|null|STRING36|null|null|null|STRING5|STRING3|null|Yes||2|54|null|null 
null|null|null|null|VALUE38|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING38|null|AC|null|null|STRING38|2022-01-11|2022-01-06|81351138|VALUE1|null|null|CITY35|888420138|null|2300110|404|null|RER|CRC|XCX|null|null|null|STRING56|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING56|STRING57|STRING57|null|null|15010.00|null|15010.00|null|null|230007|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85015.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY35|885238|8123401038|STRING57|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING38|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING38|null|null|null|null|null|124814038|null|STRING1|2140032|2022-01-06|null|null|null|STRING38|COMP1COUNTRY1|null|null|null|null|null|No||2|56|null|null null|null|null|null|VALUE39|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING39|null|AC|null|null|STRING39|2022-01-11|2022-01-06|81351139|VALUE1|null|null|CITY36|888420139|null|2300103|404|null|RER|RCR|XCX|null|null|null|STRING57|null|101|null|null|null|1001|null|null|null|null|null|STRING57|STRING58|STRING58|null|null|15003.00|null|15003.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY36|885239|8123401039|STRING58|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING39|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING39|null|null|null|null|null|124814039|null|STRING1|2140033|2022-01-06|null|null|null|STRING39|COMP1COUNTRY1|null|null|null|null|null|No||2|57|null|null null|null|null|null|VALUE40|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE13|null|null|2022-01-05|COUNTRY1|null|STRING40|METHOD3|AC|null|null|STRING40|2022-01-13|2022-01-06|81351140|VALUE1|null|null|CITY37|888420140|null|2300114|404|null|RER|RCR|XCX|null|null|null|STRING58|null|101|null|null|null|1002|STRING1|null|null|null|null|STRING58|STRING59|STRING59|null|null|15001.00|null|15001.00|null|null|230011|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|85027.00|101.000|101.000|101.000|101.000|STRING1|STRING1|2022-01-06|CITY37|885240|8123401040|STRING59|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING40|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING40|null|null|null|null|null|124814040|null|STRING1|2140034|2022-01-06|null|null|null|STRING40|COMP1COUNTRY1|null|null|null|null|null|No||2|58|null|null 
null|null|null|null|VALUE41|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|VALUE14|null|null|2022-01-05|COUNTRY5|null|STRING41|null|AA|null|null|STRING41|2022-01-10|2022-01-06|81351141|VALUE1|null|null|CITY38|888420141|null|2300123|404|null|RER|RCR|XCX|null|null|null|STRING59|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING59|STRING60|STRING60|null|null|null|null|null|null|null|230017|null|null|101|STRING2|null|null|null|null|PCP|101|6500126.00|null|101.000|101.000|6500126.00|null|101.000|101.000|101.000|101.000|STRING8|STRING1|2022-01-06|CITY38|885241|8123401041|STRING60|null|2022-01-06|STRING4|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING41|10240406|null|null|null|null|null|null|COMPANY1|COUNTRYAB5|STRING1|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING41|null|null|null|null|null|124814041|null|null|null|null|null|null|null|STRING41|COMP1COUNTRY5|null|null|null|null|null|No||2|59|null|null null|null|null|null|VALUE42|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AA|2022-01-15|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|VALUE15|null|null|2022-01-05|COUNTRY9|null|STRING42|METHOD4|AA|null|null|STRING42|null|2022-01-06|81351142|VALUE1|null|null|CITY39|888420142|null|2300110|404|null|RER|RCR|XCX|null|null|null|STRING60|null|101|null|null|null|1001|STRING1|null|null|null|null|STRING60|STRING61|STRING61|null|null|null|null|null|null|null|230007|null|null|101|STRING5|null|null|null|null|PCP|101|6500146.00|null|101.000|101.000|6500146.00|null|101.000|101.000|101.000|101.000|STRING13|STRING2|2022-01-06|CITY39|885242|8123401042|STRING61|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING3|TYPE1|STRING42|10240407|null|null|null|null|null|null|COMPANY1|COUNTRYAB9|STRING2|null|1923002|1923001|null|null|null|null|10349200.00|STRING42|null|null|null|null|null|124814031|3059002|null|null|null|null|null|null|STRING42|COMP1COUNTRY9|null|null|null|null|null|No||2|60|null|null null|null|null|null|VALUE43|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-14|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY7|null|STRING43|null|AA|null|null|STRING43|2022-01-14|2022-01-06|81351143|VALUE1|null|null|CITY40|888420143|null|2300116|404|null|RER|CRC|XCX|null|null|null|STRING61|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING61|STRING62|STRING62|null|null|null|null|null|null|null|230013|null|null|101|STRING3|null|null|null|null|PCP|101|6500147.00|null|101.000|101.000|6500147.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY40|885243|8123401043|STRING62|null|2022-01-06|STRING4|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING43|10240408|null|null|null|null|null|null|COMPANY1|COUNTRYAB7|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING43|null|null|null|null|null|124814042|null|null|null|null|null|null|null|STRING43|COMP1COUNTRY7|null|null|STRING6|STRING1|null|Yes||2|61|null|null 
null|null|null|null|VALUE44|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE16|null|null|2022-01-05|COUNTRY1|null|STRING44|null|AC|null|null|STRING44|2022-01-13|2022-01-06|81351144|VALUE1|null|null|CITY41|888420144|null|2300163|404|null|RER|RCR|XCX|null|null|STRING4|STRING62|null|101|null|null|null|1001|STRING5|null|null|null|null|STRING62|STRING63|STRING63|null|null|15014.00|null|15014.00|null|null|230029|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|null|101.000|101.000|6500148.00|85030.00|101.000|101.000|101.000|101.000|STRING4|STRING1|2022-01-06|CITY41|885244|8123401044|STRING63|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE2|STRING44|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING44|null|EFL|null|null|null|124814043|null|STRING1|2140035|2022-01-06|null|null|null|STRING44|COMP1COUNTRY1|null|null|null|null|null|No||2|62|null|null null|null|null|null|VALUE45|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING45|null|AC|null|null|STRING45|2022-01-10|2022-01-06|81351145|VALUE1|null|null|CITY42|888420145|null|2300164|404|null|RER|RCR|XCX|null|null|null|STRING63|null|101|null|null|null|1002|null|null|null|null|null|STRING63|STRING64|STRING64|null|null|15003.00|null|15003.00|null|null|230030|null|null|101|STRING1|null|null|null|null|PCP|101|6500149.00|null|101.000|101.000|6500149.00|85031.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY42|885245|8123401045|STRING64|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING45|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING45|null|null|null|null|null|124814044|null|STRING1|2140036|2022-01-06|null|null|null|STRING45|COMP1COUNTRY1|null|null|null|null|null|No||2|63|null|null null|null|null|null|VALUE46|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE17|null|null|2022-01-05|COUNTRY1|null|STRING46|METHOD5|AC|null|null|STRING46|2022-01-11|2022-01-06|81351146|VALUE1|null|null|CITY20|888420146|null|2300165|404|null|RER|RCR|XCX|null|null|null|STRING64|null|101|null|null|null|1001|STRING8|null|null|null|null|STRING64|STRING65|STRING65|null|null|15017.00|null|15017.00|null|null|230031|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|null|101.000|101.000|6500148.00|85032.00|101.000|101.000|101.000|101.000|STRING14|STRING1|2022-01-06|CITY20|885246|8123401046|STRING65|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING46|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING46|null|null|null|null|null|124814045|null|STRING1|2140037|2022-01-06|null|null|null|STRING46|COMP1COUNTRY1|null|null|null|null|null|No||2|64|null|null 
null|null|null|null|VALUE47|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING47|METHOD3|AC|null|null|STRING47|2022-01-11|2022-01-06|81351147|VALUE1|null|null|CITY20|888420147|null|2300120|404|null|RER|CRC|XCX|null|null|null|STRING65|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING65|STRING66|STRING66|null|null|null|null|null|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY20|885247|8123401047|STRING66|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING47|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING47|null|null|null|null|null|124814046|null|STRING1|2140003|2022-01-06|null|null|null|STRING47|COMP1COUNTRY1|null|null|null|null|null|No||2|65|null|null null|null|null|null|VALUE47|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING47|METHOD3|AC|null|null|STRING47|2022-01-11|2022-01-06|81351147|VALUE1|null|null|CITY20|888420147|null|2300120|404|null|RER|CRC|XCX|null|null|null|STRING66|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING66|STRING66|STRING66|null|null|null|null|null|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500133.00|null|101.000|101.000|6500133.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY20|885247|8123401047|STRING66|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING47|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING47|null|null|null|null|null|124814046|null|STRING1|2140003|2022-01-06|null|null|null|STRING47|COMP1COUNTRY1|null|null|null|null|null|No||2|66|null|null null|null|null|null|VALUE47|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING47|METHOD3|AC|null|null|STRING47|2022-01-11|2022-01-06|81351147|VALUE1|null|null|CITY20|888420147|null|2300120|404|null|RER|CRC|XCX|null|null|null|STRING67|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING67|STRING66|STRING66|null|null|15014.00|null|15014.00|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85023.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY20|885247|8123401047|STRING66|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING47|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING47|null|null|null|null|null|124814046|null|STRING1|2140003|2022-01-06|null|null|null|STRING47|COMP1COUNTRY1|null|null|null|null|null|No||2|67|null|null 
null|null|null|null|VALUE49|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING49|null|AC|null|null|STRING49|2022-01-10|2022-01-06|81351149|VALUE1|null|null|CITY44|888420149|null|2300128|404|null|RER|RCR|XCX|null|null|null|STRING70|null|101|null|null|null|1002|null|null|null|null|null|STRING70|STRING71|STRING71|null|null|15003.00|null|15003.00|null|null|230018|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85024.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY44|885249|8123401049|STRING71|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING49|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING49|null|null|null|null|null|124814048|null|STRING1|2140039|2022-01-06|null|null|null|STRING49|COMP1COUNTRY1|null|null|null|null|null|No||2|70|null|null null|null|null|null|VALUE49|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING49|null|AC|null|null|STRING49|2022-01-10|2022-01-06|81351149|VALUE1|null|null|CITY44|888420149|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING71|null|101|null|null|null|1002|null|null|null|null|null|STRING71|STRING71|STRING71|null|null|15003.00|null|15003.00|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500150.00|null|101.000|101.000|6500150.00|85034.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY44|885249|8123401049|STRING71|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING49|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING49|null|null|null|null|null|124814048|null|STRING1|2140039|2022-01-06|null|null|null|STRING49|COMP1COUNTRY1|null|null|null|null|null|No||2|71|null|null null|null|null|null|VALUE50|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY4|null|STRING50|null|AA|null|null|STRING50|2022-01-13|2022-01-06|81351142|VALUE1|null|null|CITY45|888420150|null|2300116|404|null|RER|CRC|XCX|null|null|null|STRING72|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING72|STRING73|STRING73|null|null|null|null|null|null|null|230013|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY45|885250|8123401042|STRING73|null|2022-01-06|STRING4|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING50|10240407|null|null|null|null|null|null|COMPANY2|COUNTRYAB4|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING50|null|null|null|null|null|124814049|null|null|null|null|null|null|null|STRING50|COMP2COUNTRY4|null|null|null|null|null|No||2|72|null|null 
null|null|null|null|VALUE51|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING51|null|AC|null|null|STRING51|2022-01-11|2022-01-06|81351150|VALUE1|null|null|CITY46|888420151|null|2300121|404|null|RER|CRC|XCX|null|null|null|STRING73|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING73|STRING74|STRING74|null|null|null|null|null|null|null|230016|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY46|885251|8123401050|STRING74|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING51|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING51|null|null|null|null|null|124814050|null|STRING1|2140030|2022-01-06|null|null|null|STRING51|COMP1COUNTRY3|null|null|null|null|null|No||2|73|null|null null|null|null|null|VALUE51|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING51|null|AC|null|null|STRING51|2022-01-11|2022-01-06|81351150|VALUE1|null|null|CITY46|888420151|null|2300175|404|null|RER|CRC|XCX|null|null|null|STRING74|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING74|STRING74|STRING74|null|null|15019.00|null|15019.00|null|null|230032|null|null|101|STRING1|null|null|null|null|PCP|101|6500151.00|null|101.000|101.000|6500151.00|85009.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY46|885251|8123401050|STRING74|null|2022-01-06|STRING5|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING51|10240409|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING51|null|null|null|null|null|124814050|null|STRING1|2140030|2022-01-06|null|null|null|STRING51|COMP1COUNTRY3|null|null|null|null|null|No||2|74|null|null null|null|null|null|VALUE52|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE18|null|null|2022-01-05|COUNTRY1|null|STRING52|null|AC|null|null|STRING52|2022-01-11|2022-01-06|81351151|VALUE1|null|null|CITY47|888420152|null|2300115|404|null|RER|RCR|XCX|null|null|null|STRING75|null|101|null|null|null|1001|STRING5|null|null|null|null|STRING75|STRING76|STRING76|null|null|15007.00|null|15007.00|null|null|230012|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|85004.00|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-06|CITY47|885252|8123401051|STRING76|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING52|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING52|null|null|null|null|null|124814051|null|STRING1|2140040|2022-01-06|null|null|null|STRING52|COMP1COUNTRY1|null|null|null|null|null|No||2|75|null|null 
null|null|null|null|VALUE53|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING53|METHOD6|AC|null|null|STRING53|2022-01-11|2022-01-06|81351152|VALUE1|null|null|CITY48|888420153|null|2300177|404|null|RER|RCR|XCX|null|null|null|STRING76|null|101|null|null|null|1001|null|null|null|null|null|STRING76|STRING77|STRING77|null|null|15003.00|null|15003.00|null|null|230033|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|85015.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY48|885253|8123401052|STRING77|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING53|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING53|null|null|null|null|null|124814052|null|STRING1|2140041|2022-01-06|null|null|null|STRING53|COMP1COUNTRY1|null|null|null|null|null|No||2|76|null|null null|null|null|null|VALUE54|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE19|null|null|2022-01-05|COUNTRY1|null|STRING54|null|AC|null|null|STRING54|2022-01-11|2022-01-06|81351153|VALUE1|null|null|CITY49|888420154|null|2300128|404|null|RER|RCR|XCX|null|null|null|STRING77|null|101|null|null|null|1003|STRING6|null|null|null|null|STRING77|STRING78|STRING78|null|null|15020.00|null|15020.00|null|null|230018|null|null|101|STRING1|null|null|null|null|PCP|101|6500152.00|null|101.000|101.000|6500152.00|85035.00|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY49|885254|8123401053|STRING78|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING54|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING54|null|null|null|null|null|124814053|null|STRING1|2140042|2022-01-06|null|null|null|STRING54|COMP1COUNTRY1|null|null|null|null|null|No||2|77|null|null null|null|null|null|VALUE54|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE19|null|null|2022-01-05|COUNTRY1|null|STRING54|null|AC|null|null|STRING54|2022-01-11|2022-01-06|81351153|VALUE1|null|null|CITY49|888420154|null|2300133|404|null|RER|RCR|XCX|null|null|null|STRING78|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING78|STRING78|STRING78|null|null|null|null|null|null|null|230020|null|null|101|STRING1|null|null|null|null|PCP|101|6500124.00|null|101.000|101.000|6500124.00|null|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY49|885254|8123401053|STRING78|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING54|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING54|null|null|null|null|null|124814053|null|STRING1|2140042|2022-01-06|null|null|null|STRING54|COMP1COUNTRY1|null|null|null|null|null|No||2|78|null|null 
null|null|null|null|VALUE55|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING55|null|AC|null|null|STRING55|2022-01-13|2022-01-06|81351154|VALUE1|null|null|CITY50|888420155|null|2300180|404|null|RER|RCR|XCX|null|null|null|STRING79|null|101|null|null|null|1001|null|null|null|null|null|STRING79|STRING80|STRING80|null|null|15003.00|null|15003.00|null|null|230034|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|null|101.000|101.000|6500148.00|85036.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY50|885255|8123401054|STRING80|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING55|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING55|null|null|null|null|null|124814054|null|STRING1|2140043|2022-01-06|null|null|null|STRING55|COMP1COUNTRY6|null|null|null|null|null|No||2|79|null|null null|null|null|null|VALUE56|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE20|null|null|2022-01-05|COUNTRY1|null|STRING56|null|AC|null|null|STRING56|2022-01-11|2022-01-06|81351155|VALUE1|null|null|CITY51|888420156|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING80|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING80|STRING81|STRING81|null|null|15016.00|null|15016.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85028.00|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-06|CITY51|885256|8123401055|STRING81|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING56|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING56|null|null|null|null|null|124814055|null|STRING1|2140021|2022-01-06|null|null|null|STRING56|COMP1COUNTRY1|null|null|null|null|null|No||2|80|null|null null|null|null|null|VALUE56|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING56|null|AC|null|null|STRING56|2022-01-11|2022-01-06|81351155|VALUE1|null|null|CITY51|888420156|null|2300175|404|null|RER|RCR|XCX|null|null|null|STRING81|null|101|null|null|null|1001|null|null|null|null|null|STRING81|STRING81|STRING81|null|null|15003.00|null|15003.00|null|null|230032|null|null|101|STRING1|null|null|null|null|PCP|101|6500141.00|null|101.000|101.000|6500141.00|85037.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY51|885256|8123401055|STRING81|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING56|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING56|null|null|null|null|null|124814055|null|STRING1|2140021|2022-01-06|null|null|null|STRING56|COMP1COUNTRY1|null|null|null|null|null|No||2|81|null|null 
null|null|null|null|VALUE56|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE20|null|null|2022-01-05|COUNTRY1|null|STRING56|null|AC|null|null|STRING56|2022-01-11|2022-01-06|81351155|VALUE1|null|null|CITY51|888420156|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING82|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING82|STRING81|STRING81|null|null|15021.00|null|15021.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85038.00|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-06|CITY51|885256|8123401055|STRING81|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING56|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING56|null|null|null|null|null|124814055|null|STRING1|2140021|2022-01-06|null|null|null|STRING56|COMP1COUNTRY1|null|null|null|null|null|No||2|82|null|null null|null|null|null|VALUE56|10102416|10102416|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-15|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE20|null|null|2022-01-05|COUNTRY1|null|STRING56|null|AC|null|null|STRING56|null|2022-01-06|81351155|VALUE1|null|null|CITY51|888420156|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING83|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING83|STRING81|STRING81|null|null|15020.00|null|15020.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85023.00|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-06|CITY51|885256|8123401055|STRING81|null|2022-01-06|STRING6|10020.000|STRING1|STORE1|STRING1|TYPE3|STRING56|10240402|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|null|1923002|1923001|null|null|null|null|10349200.00|STRING57|null|null|null|null|null|124814055|null|STRING1|2140021|2022-01-06|null|null|null|STRING56|COMP1COUNTRY1|null|null|null|null|null|No||2|83|null|null null|null|null|null|VALUE57|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING57|null|AC|null|null|STRING57|2022-01-11|2022-01-06|81351156|VALUE1|null|null|CITY52|888420157|null|2300185|404|null|RER|RCR|XCX|null|null|null|STRING84|null|101|null|null|null|1001|null|null|null|null|null|STRING84|STRING85|STRING85|null|null|15003.00|null|15003.00|null|null|230035|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|85006.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY52|885257|8123401056|STRING85|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING57|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING58|null|null|null|null|null|124814056|null|STRING1|2140044|2022-01-06|null|null|null|STRING57|COMP1COUNTRY6|null|null|null|null|null|No||2|84|null|null 
null|null|null|null|VALUE57|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING57|null|AC|null|null|STRING57|2022-01-11|2022-01-06|81351156|VALUE1|null|null|CITY52|888420157|null|2300185|404|null|RER|RCR|XCX|null|null|null|STRING85|null|101|null|null|null|1001|null|null|null|null|null|STRING85|STRING85|STRING85|null|null|15003.00|null|15003.00|null|null|230035|null|null|101|STRING1|null|null|null|null|PCP|101|6500153.00|null|101.000|101.000|6500153.00|85039.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY52|885257|8123401056|STRING85|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING57|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING58|null|null|null|null|null|124814056|null|STRING1|2140044|2022-01-06|null|null|null|STRING57|COMP1COUNTRY6|null|null|null|null|null|No||2|85|null|null null|null|null|null|VALUE57|10102415|10102415|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING57|null|AC|null|null|STRING57|2022-01-11|2022-01-06|81351156|VALUE1|null|null|CITY52|888420157|null|2300185|404|null|RER|RCR|XCX|null|null|null|STRING86|null|101|null|null|null|1001|null|null|null|null|null|STRING86|STRING85|STRING85|null|null|15003.00|null|15003.00|null|null|230035|null|null|101|STRING1|null|null|null|null|PCP|101|6500137.00|null|101.000|101.000|6500137.00|85018.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY52|885257|8123401056|STRING85|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING57|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING58|null|null|null|null|null|124814056|null|STRING1|2140044|2022-01-06|null|null|null|STRING57|COMP1COUNTRY6|null|null|null|null|null|No||2|86|null|null null|null|null|null|VALUE58|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING58|null|AC|null|null|STRING58|2022-01-13|2022-01-06|81351157|VALUE1|null|null|CITY53|888420158|null|2300101|404|null|RER|RCR|XCX|null|null|STRING5|STRING87|null|101|null|null|null|1002|null|null|null|null|null|STRING87|STRING88|STRING88|null|null|15003.00|null|null|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|6500126.00|null|101.000|101.000|6500126.00|85025.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY53|885258|8123401057|STRING88|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE2|STRING58|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING59|null|EFL|null|null|null|124814057|null|STRING1|2140045|2022-01-06|null|null|null|STRING58|COMP1COUNTRY3|null|null|null|null|null|No||2|87|null|null 
null|null|null|null|VALUE59|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING59|null|AC|null|null|STRING59|2022-01-11|2022-01-06|81351158|VALUE1|null|null|CITY54|888420159|null|2300105|404|null|RER|CRC|XCX|null|null|null|STRING88|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING88|STRING89|STRING89|null|null|null|null|null|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500123.00|null|101.000|101.000|6500123.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY54|885259|8123401058|STRING89|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING59|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING60|null|null|null|null|null|124814058|null|STRING1|2140046|2022-01-06|null|null|null|STRING59|COMP1COUNTRY1|null|null|null|null|null|No||2|88|null|null null|null|null|null|VALUE59|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING59|null|AC|null|null|STRING59|2022-01-11|2022-01-06|81351158|VALUE1|null|null|CITY54|888420159|null|2300190|404|null|RER|RCR|XCX|null|null|null|STRING89|null|101|null|null|null|1003|null|null|null|null|null|STRING89|STRING89|STRING89|null|null|15003.00|null|15003.00|null|null|230036|null|null|101|STRING1|null|null|null|null|PCP|101|6500154.00|null|101.000|101.000|6500154.00|85040.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY54|885259|8123401058|STRING89|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING59|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING60|null|null|null|null|null|124814058|null|STRING1|2140046|2022-01-06|null|null|null|STRING59|COMP1COUNTRY1|null|null|null|null|null|No||2|89|null|null null|null|null|null|VALUE59|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING59|null|AC|null|null|STRING59|2022-01-11|2022-01-06|81351158|VALUE1|null|null|CITY54|888420159|null|2300191|404|null|RER|RCR|XCX|null|null|null|STRING90|null|101|null|null|null|1003|null|null|null|null|null|STRING90|STRING89|STRING89|null|null|15003.00|null|15003.00|null|null|230037|null|null|101|STRING1|null|null|null|null|PCP|101|6500154.00|null|101.000|101.000|6500154.00|85040.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY54|885259|8123401058|STRING89|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING59|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING60|null|null|null|null|null|124814058|null|STRING1|2140046|2022-01-06|null|null|null|STRING59|COMP1COUNTRY1|null|null|null|null|null|No||2|90|null|null 
null|null|null|null|VALUE60|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE2|null|null|2022-01-05|COUNTRY1|null|STRING60|null|AC|null|null|STRING60|2022-01-13|2022-01-06|81351159|VALUE1|null|null|CITY55|888420160|null|2300192|404|null|RER|RCR|XCX|null|null|null|STRING91|null|101|null|null|null|1002|STRING3|null|null|null|null|STRING91|STRING92|STRING92|null|null|null|null|null|null|null|230038|null|null|101|STRING1|null|null|null|null|PCP|101|6500123.00|null|101.000|101.000|6500123.00|null|101.000|101.000|101.000|101.000|STRING3|STRING1|2022-01-06|CITY55|885260|8123401059|STRING92|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING60|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING61|null|null|null|null|null|124814059|null|STRING1|2140047|2022-01-06|null|null|null|STRING60|COMP1COUNTRY1|null|null|null|null|null|No||2|91|null|null null|null|null|null|VALUE61|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING61|null|AC|null|null|STRING61|2022-01-11|2022-01-06|81351160|VALUE1|null|null|CITY20|888420161|null|2300123|404|null|RER|RCR|XCX|null|null|null|STRING92|null|101|null|null|null|1001|null|null|null|null|null|STRING92|STRING93|STRING93|null|null|15003.00|null|15003.00|null|null|230017|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY20|885261|8123401060|STRING93|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING61|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING62|null|null|null|null|null|124814060|null|STRING1|2140048|2022-01-06|null|null|null|STRING61|COMP1COUNTRY1|null|null|null|null|null|No||2|92|null|null null|null|null|null|VALUE61|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE21|null|null|2022-01-05|COUNTRY1|null|STRING61|null|AC|null|null|STRING61|2022-01-11|2022-01-06|81351160|VALUE1|null|null|CITY20|888420161|null|2300129|404|null|RER|RCR|XCX|null|null|null|STRING93|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING93|STRING93|STRING93|null|null|15022.00|null|15022.00|null|null|230019|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85041.00|101.000|101.000|101.000|101.000|STRING8|STRING1|2022-01-06|CITY20|885261|8123401060|STRING93|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING61|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING62|null|null|null|null|null|124814060|null|STRING1|2140048|2022-01-06|null|null|null|STRING61|COMP1COUNTRY1|null|null|null|null|null|No||2|93|null|null 
null|null|null|null|VALUE63|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING63|null|AC|null|null|STRING63|2022-01-11|2022-01-06|81351162|VALUE1|null|null|CITY57|888420163|null|2300101|404|null|RER|CRC|XCX|null|null|null|STRING95|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING95|STRING96|STRING96|null|null|null|null|null|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|6500134.00|null|101.000|101.000|6500134.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY57|885263|8123401062|STRING96|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING63|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING64|null|null|null|null|null|124814062|null|STRING1|2140044|2022-01-06|null|null|null|STRING63|COMP1COUNTRY1|null|null|STRING1|STRING1|null|Yes||2|95|null|null null|null|null|null|VALUE64|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING64|null|AC|null|null|STRING64|2022-01-11|2022-01-06|81351163|VALUE1|null|null|CITY58|888420164|null|2300165|404|null|RER|RCR|XCX|null|null|null|STRING96|null|101|null|null|null|1002|STRING7|null|null|null|null|STRING96|STRING97|STRING97|null|null|15016.00|null|15016.00|null|null|230031|null|null|101|STRING1|null|null|null|null|PCP|101|6500152.00|null|101.000|101.000|6500152.00|85042.00|101.000|101.000|101.000|101.000|STRING12|STRING1|2022-01-06|CITY58|885264|8123401063|STRING97|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING64|10240401|null|null|null|null|null|null|COMPANY2|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING65|null|null|null|null|null|124814063|null|STRING1|2140050|2022-01-06|null|null|null|STRING64|COMP2COUNTRY3|null|null|null|null|null|No||2|96|null|null null|null|null|null|VALUE64|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING64|null|AC|null|null|STRING64|2022-01-11|2022-01-06|81351163|VALUE1|null|null|CITY58|888420164|null|2300164|404|null|RER|RCR|XCX|null|null|null|STRING97|null|101|null|null|null|1002|STRING7|null|null|null|null|STRING97|STRING97|STRING97|null|null|null|null|null|null|null|230030|null|null|101|STRING1|null|null|null|null|PCP|101|6500150.00|null|101.000|101.000|6500150.00|null|101.000|101.000|101.000|101.000|STRING12|STRING1|2022-01-06|CITY58|885264|8123401063|STRING97|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING64|10240401|null|null|null|null|null|null|COMPANY2|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING65|null|null|null|null|null|124814063|null|STRING1|2140050|2022-01-06|null|null|null|STRING64|COMP2COUNTRY3|null|null|null|null|null|No||2|97|null|null 
null|null|null|null|VALUE64|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING64|null|AC|null|null|STRING64|2022-01-11|2022-01-06|81351163|VALUE1|null|null|CITY58|888420164|null|2300164|404|null|RER|RCR|XCX|null|null|null|STRING98|null|101|null|null|null|1002|STRING7|null|null|null|null|STRING98|STRING97|STRING97|null|null|null|null|null|null|null|230030|null|null|101|STRING1|null|null|null|null|PCP|101|6500150.00|null|101.000|101.000|6500150.00|null|101.000|101.000|101.000|101.000|STRING12|STRING1|2022-01-06|CITY58|885264|8123401063|STRING97|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING64|10240401|null|null|null|null|null|null|COMPANY2|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING65|null|null|null|null|null|124814063|null|STRING1|2140050|2022-01-06|null|null|null|STRING64|COMP2COUNTRY3|null|null|null|null|null|No||2|98|null|null null|null|null|null|VALUE64|10102416|10102416|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING64|null|AC|null|null|STRING64|2022-01-11|2022-01-06|81351163|VALUE1|null|null|CITY58|888420164|null|2300175|404|null|RER|RCR|XCX|null|null|null|STRING99|null|101|null|null|null|1001|STRING7|null|null|null|null|STRING99|STRING97|STRING97|null|null|15011.00|null|15011.00|null|null|230032|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85016.00|101.000|101.000|101.000|101.000|STRING12|STRING1|2022-01-06|CITY58|885264|8123401063|STRING97|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING64|10240401|null|null|null|null|null|null|COMPANY2|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING65|null|null|null|null|null|124814063|null|STRING1|2140050|2022-01-06|null|null|null|STRING64|COMP2COUNTRY3|null|null|null|null|null|No||2|99|null|null null|null|null|null|VALUE65|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING65|null|AC|null|null|STRING65|2022-01-11|2022-01-06|81351164|VALUE1|null|null|CITY59|888420165|null|2300120|404|null|RER|RCR|XCX|null|null|null|STRING100|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING100|STRING101|STRING101|null|null|null|null|null|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|null|101.000|101.000|101.000|101.000|STRING15|STRING1|2022-01-06|CITY59|885265|8123401064|STRING101|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING65|10240401|null|null|null|null|null|null|COMPANY2|COUNTRYAB10|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING66|null|null|null|null|null|124814064|null|STRING1|2140051|2022-01-06|null|null|null|STRING65|COMP2COUNTRY6|null|null|null|null|null|No||2|100|null|null 
null|null|null|null|VALUE66|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING66|null|AC|null|null|STRING66|2022-01-13|2022-01-06|81351165|VALUE1|null|null|CITY60|888420166|null|2300116|404|null|RER|CRC|XCX|null|null|null|STRING101|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING101|STRING102|STRING102|null|null|15023.00|null|15023.00|null|null|230013|null|null|101|STRING1|null|null|null|null|PCP|101|6500149.00|null|101.000|101.000|6500149.00|85043.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY60|885266|8123401065|STRING102|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING66|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING67|null|null|null|null|null|124814065|null|STRING1|2140052|2022-01-06|null|null|null|STRING66|COMP1COUNTRY1|null|null|null|null|null|No||2|101|null|null null|null|null|null|VALUE67|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE23|null|null|2022-01-05|COUNTRY1|null|STRING67|null|AC|null|null|STRING67|2022-01-11|2022-01-06|81351166|VALUE1|null|null|CITY61|888420167|null|2300120|404|null|RER|RCR|XCX|null|null|null|STRING102|null|101|null|null|null|1001|STRING5|null|null|null|null|STRING102|STRING103|STRING103|null|null|15017.00|null|15017.00|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500126.00|null|101.000|101.000|6500126.00|85044.00|101.000|101.000|101.000|101.000|STRING5|STRING1|2022-01-06|CITY61|885267|8123401066|STRING103|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING67|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING68|null|null|null|null|null|124814066|null|STRING1|2140053|2022-01-06|null|null|null|STRING67|COMP1COUNTRY1|null|null|null|null|null|No||2|102|null|null null|null|null|null|VALUE68|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING68|null|AC|null|null|STRING68|2022-01-11|2022-01-06|81351167|VALUE1|null|null|CITY62|888420168|null|2300140|404|null|RER|CRC|XCX|null|null|null|STRING103|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING103|STRING104|STRING104|null|null|null|null|null|null|null|230025|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY62|885268|8123401067|STRING104|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING68|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING69|null|null|null|null|null|124814067|null|STRING1|2140054|2022-01-06|null|null|null|STRING68|COMP1COUNTRY1|null|null|STRING7|STRING2|null|Yes||2|103|null|null 
null|null|null|null|VALUE69|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING69|null|AC|null|null|STRING69|2022-01-11|2022-01-06|81351168|VALUE1|null|null|CITY63|888420169|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING104|null|101|null|null|null|1002|null|null|null|null|null|STRING104|STRING105|STRING105|null|null|15003.00|null|15003.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85024.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY63|885269|8123401068|STRING105|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING69|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING70|null|null|null|null|null|124814068|null|STRING1|2140055|2022-01-06|null|null|null|STRING69|COMP1COUNTRY1|null|null|null|null|null|No||2|104|null|null null|null|null|null|VALUE70|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING70|METHOD2|AC|null|null|STRING70|2022-01-11|2022-01-06|81351169|VALUE1|null|null|CITY64|888420170|null|2300180|404|null|RER|RCR|XCX|null|null|null|STRING85|null|101|null|null|null|1001|null|null|null|null|null|STRING105|STRING106|STRING106|null|null|15003.00|null|15003.00|null|null|230034|null|null|101|STRING1|null|null|null|null|PCP|101|6500153.00|null|101.000|101.000|6500153.00|85039.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY64|885270|8123401069|STRING106|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING70|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING71|null|null|null|null|null|124814069|null|STRING1|2140056|2022-01-06|null|null|null|STRING70|COMP1COUNTRY1|null|null|null|null|null|No||2|105|null|null null|null|null|null|VALUE71|10102412|10102412|null|3,02E+25|3,02E+25|null|null|XYX|null|AA|2022-01-15|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY2|null|STRING71|null|AA|null|null|STRING71|null|2022-01-06|81351170|VALUE1|null|null|CITY65|888420171|null|2300207|404|null|RER|RCR|XCX|null|null|null|STRING105|null|101|null|null|null|1002|null|null|6024050703|null|null|STRING106|STRING107|STRING107|null|null|15003.00|null|15003.00|null|null|230039|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85004.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY65|885271|8123401070|STRING107|null|2022-01-06|STRING4|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING71|10240410|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING3|STRING3|1923002|1923001|123124061|null|null|null|10349200.00|STRING72|null|null|null|EOD|STRING1|124814070|null|null|null|null|null|null|null|STRING71|null|null|STRING1|null|null|null|No||2|106|null|null 
null|null|null|null|VALUE72|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|VALUE2|null|null|2022-01-05|COUNTRY4|null|STRING72|null|AA|null|null|STRING72|2022-01-13|2022-01-06|81351170|VALUE1|null|null|CITY66|888420172|null|2300120|404|null|RER|RCR|XCX|null|null|null|STRING106|null|101|null|null|null|1001|STRING3|null|null|null|null|STRING107|STRING108|STRING108|null|null|null|null|null|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500131.00|null|101.000|101.000|6500131.00|null|101.000|101.000|101.000|101.000|STRING3|STRING1|2022-01-06|CITY66|885272|8123401070|STRING108|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING72|10240410|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING73|null|null|null|null|null|124814071|null|null|null|null|null|null|null|STRING72|COMP1COUNTRY4|null|null|null|null|null|No||2|107|null|null null|null|null|null|VALUE73|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE24|null|null|2022-01-05|COUNTRY1|null|STRING73|null|AC|null|null|STRING73|2022-01-13|2022-01-06|81351171|VALUE1|null|null|CITY67|888420173|null|2300121|404|null|RER|RCR|XCX|null|null|null|STRING107|null|101|null|null|null|1001|STRING1|null|null|null|null|STRING108|STRING109|STRING109|null|null|15024.00|null|15024.00|null|null|230016|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|85045.00|101.000|101.000|101.000|101.000|STRING1|STRING1|2022-01-06|CITY67|885273|8123401071|STRING109|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING73|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING74|null|null|null|null|null|124814072|null|STRING1|2140057|2022-01-06|null|null|null|STRING73|COMP1COUNTRY1|null|null|null|null|null|No||2|108|null|null null|null|null|null|VALUE74|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING74|null|AC|null|null|STRING74|2022-01-13|2022-01-06|81351172|VALUE1|null|null|CITY68|888420174|null|2300143|404|null|RER|RCR|XCX|null|null|null|STRING108|null|101|null|null|null|1001|null|null|null|null|null|STRING109|STRING110|STRING110|null|null|15003.00|null|15003.00|null|null|230027|null|null|101|STRING1|null|null|null|null|PCP|101|6500137.00|null|101.000|101.000|6500137.00|85018.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY68|885274|8123401072|STRING110|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING74|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING75|null|null|null|null|null|124814073|null|STRING1|2140058|2022-01-06|null|null|null|STRING74|COMP1COUNTRY1|null|null|null|null|null|No||2|109|null|null 
null|null|null|null|VALUE75|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING75|null|AC|null|null|STRING75|2022-01-11|2022-01-06|81351173|VALUE1|null|null|CITY69|888420175|null|2300211|404|null|RER|RCR|XCX|null|null|null|STRING109|null|101|null|null|null|1003|null|null|null|null|null|STRING110|STRING111|STRING111|null|null|15003.00|null|15003.00|null|null|230040|null|null|101|STRING1|null|null|null|null|PCP|101|6500155.00|null|101.000|101.000|6500155.00|85046.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY69|885275|8123401073|STRING111|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING75|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING76|null|null|null|null|null|124814074|null|STRING1|2140039|2022-01-06|null|null|null|STRING75|COMP1COUNTRY2|null|null|null|null|null|No||2|110|null|null null|null|null|null|VALUE75|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING75|null|AC|null|null|STRING75|2022-01-11|2022-01-06|81351173|VALUE1|null|null|CITY69|888420175|null|2300211|404|null|RER|RCR|XCX|null|null|null|STRING110|null|101|null|null|null|1003|null|null|null|null|null|STRING111|STRING111|STRING111|null|null|15003.00|null|15003.00|null|null|230040|null|null|101|STRING1|null|null|null|null|PCP|101|6500155.00|null|101.000|101.000|6500155.00|85046.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY69|885275|8123401073|STRING111|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING75|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING76|null|null|null|null|null|124814074|null|STRING1|2140039|2022-01-06|null|null|null|STRING75|COMP1COUNTRY2|null|null|null|null|null|No||2|111|null|null null|null|null|null|VALUE75|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING75|null|AC|null|null|STRING75|2022-01-11|2022-01-06|81351173|VALUE1|null|null|CITY69|888420175|null|2300135|404|null|RER|RCR|XCX|null|null|null|STRING111|null|101|null|null|null|1001|null|null|null|null|null|STRING112|STRING111|STRING111|null|null|15003.00|null|15003.00|null|null|230022|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY69|885275|8123401073|STRING111|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING75|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING76|null|null|null|null|null|124814074|null|STRING1|2140039|2022-01-06|null|null|null|STRING75|COMP1COUNTRY2|null|null|null|null|null|No||2|112|null|null 
null|null|null|null|VALUE75|10102416|10102416|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING75|null|AC|null|null|STRING75|2022-01-11|2022-01-06|81351173|VALUE1|null|null|CITY69|888420175|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING112|null|101|null|null|null|1002|null|null|null|null|null|STRING113|STRING111|STRING111|null|null|15003.00|null|15003.00|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500156.00|null|101.000|101.000|6500156.00|85023.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY69|885275|8123401073|STRING111|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING75|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING76|null|null|null|null|null|124814074|null|STRING1|2140039|2022-01-06|null|null|null|STRING75|COMP1COUNTRY2|null|null|null|null|null|No||2|113|null|null null|null|null|null|VALUE75|10102417|10102417|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING75|null|AC|null|null|STRING75|2022-01-11|2022-01-06|81351173|VALUE1|null|null|CITY69|888420175|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING113|null|101|null|null|null|1002|null|null|null|null|null|STRING114|STRING111|STRING111|null|null|15003.00|null|15003.00|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85004.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY69|885275|8123401073|STRING111|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING75|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING76|null|null|null|null|null|124814074|null|STRING1|2140039|2022-01-06|null|null|null|STRING75|COMP1COUNTRY2|null|null|null|null|null|No||2|114|null|null null|null|null|null|VALUE76|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE25|null|null|2022-01-05|COUNTRY2|null|STRING76|null|AC|null|null|STRING76|2022-01-11|2022-01-06|81351174|VALUE1|null|null|CITY70|888420176|null|2300133|404|null|RER|RCR|XCX|null|null|null|STRING114|null|101|null|null|null|1001|STRING1|null|null|null|null|STRING115|STRING116|STRING116|null|null|15017.00|15004.00|15017.00|null|null|230020|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|5003.00|101.000|101.000|6500148.00|85032.00|101.000|101.000|101.000|101.000|STRING16|STRING1|2022-01-06|CITY70|885276|8123401074|STRING116|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING76|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING77|null|null|null|null|null|124814075|null|STRING1|2140059|2022-01-06|null|null|null|STRING76|COMP1COUNTRY2|null|null|null|null|null|No||2|115|null|null 
null|null|null|null|VALUE77|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE26|null|null|2022-01-05|COUNTRY1|null|STRING77|null|AC|null|null|STRING77|2022-01-13|2022-01-06|81351175|VALUE1|null|null|CITY71|888420177|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING115|null|101|null|null|null|1002|STRING1|null|null|null|null|STRING116|STRING117|STRING117|null|null|15024.00|null|15024.00|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|85045.00|101.000|101.000|101.000|101.000|STRING1|STRING1|2022-01-06|CITY71|885277|8123401075|STRING117|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING77|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING78|null|null|null|null|null|124814076|null|STRING1|2140060|2022-01-06|null|null|null|STRING77|COMP1COUNTRY1|null|null|null|null|null|No||2|116|null|null null|null|null|null|VALUE78|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING78|null|AC|null|null|STRING78|2022-01-11|2022-01-06|81351176|VALUE1|null|null|CITY72|888420178|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING116|null|101|null|null|null|1001|null|null|null|null|null|STRING117|STRING118|STRING118|null|null|15003.00|null|15003.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85004.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY72|885278|8123401076|STRING118|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING78|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING79|null|null|null|null|null|124814077|null|STRING1|2140061|2022-01-06|null|null|null|STRING78|COMP1COUNTRY1|null|null|null|null|null|No||2|117|null|null null|null|null|null|VALUE79|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING79|null|AC|null|null|STRING79|2022-01-11|2022-01-06|81351177|VALUE1|null|null|CITY73|888420179|null|2300114|404|null|RER|RCR|XCX|null|null|null|STRING5|null|101|null|null|null|1001|null|null|null|null|null|STRING118|STRING119|STRING119|null|null|15003.00|null|15003.00|null|null|230011|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85004.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY73|885279|8123401077|STRING119|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING79|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING80|null|null|null|null|null|124814078|null|STRING1|2140062|2022-01-06|null|null|null|STRING79|COMP1COUNTRY1|null|null|null|null|null|No||2|118|null|null 
null|null|null|null|VALUE80|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING80|null|AC|null|null|STRING80|2022-01-11|2022-01-06|81351178|VALUE1|null|null|CITY74|888420180|null|2300140|404|null|RER|RCR|XCX|null|null|null|STRING117|null|101|null|null|null|1002|null|null|null|null|null|STRING119|STRING120|STRING120|null|null|15003.00|null|15003.00|null|null|230025|null|null|101|STRING1|null|null|null|null|PCP|101|6500145.00|null|101.000|101.000|6500145.00|85029.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY74|885280|8123401078|STRING120|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING80|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING81|null|null|null|null|null|124814079|null|STRING1|2140063|2022-01-06|null|null|null|STRING80|COMP1COUNTRY2|null|null|null|null|null|No||2|119|null|null null|null|null|null|VALUE80|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING80|null|AC|null|null|STRING80|2022-01-11|2022-01-06|81351178|VALUE1|null|null|CITY74|888420180|null|2300140|404|null|RER|RCR|XCX|null|null|null|STRING118|null|101|null|null|null|1002|null|null|null|null|null|STRING120|STRING120|STRING120|null|null|15003.00|15005.00|15003.00|null|null|230025|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|5004.00|101.000|101.000|6500140.00|85024.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY74|885280|8123401078|STRING120|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING80|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING81|null|null|null|null|null|124814079|null|STRING1|2140063|2022-01-06|null|null|null|STRING80|COMP1COUNTRY2|null|null|null|null|null|No||2|120|null|null null|null|null|null|VALUE80|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING80|null|AC|null|null|STRING80|2022-01-11|2022-01-06|81351178|VALUE1|null|null|CITY74|888420180|null|2300140|404|null|RER|RCR|XCX|null|null|null|STRING119|null|101|null|null|null|1002|null|null|null|null|null|STRING121|STRING120|STRING120|null|null|15003.00|15005.00|15003.00|null|null|230025|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|5005.00|101.000|101.000|6500140.00|85024.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY74|885280|8123401078|STRING120|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING80|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING81|null|null|null|null|null|124814079|null|STRING1|2140063|2022-01-06|null|null|null|STRING80|COMP1COUNTRY2|null|null|null|null|null|No||2|121|null|null 
null|null|null|null|VALUE81|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE27|null|null|2022-01-05|COUNTRY1|null|STRING81|null|AC|null|null|STRING81|2022-01-11|2022-01-06|81351179|VALUE1|null|null|CITY75|888420181|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING120|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING122|STRING123|STRING123|null|null|15025.00|null|15025.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500141.00|null|101.000|101.000|6500141.00|85047.00|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY75|885281|8123401079|STRING123|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING81|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING82|null|null|null|null|null|124814080|null|STRING1|2140013|2022-01-06|null|null|null|STRING81|COMP1COUNTRY1|null|null|null|null|null|No||2|122|null|null null|null|null|null|VALUE81|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE27|null|null|2022-01-05|COUNTRY1|null|STRING81|null|AC|null|null|STRING81|2022-01-11|2022-01-06|81351179|VALUE1|null|null|CITY75|888420181|null|2300224|404|null|RER|RCR|XCX|null|null|null|STRING121|null|101|null|null|null|1003|STRING6|null|null|null|null|STRING123|STRING123|STRING123|null|null|null|null|null|null|null|230041|null|null|101|STRING1|null|null|null|null|PCP|101|6500157.00|null|101.000|101.000|6500157.00|null|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY75|885281|8123401079|STRING123|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING81|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING82|null|null|null|null|null|124814080|null|STRING1|2140013|2022-01-06|null|null|null|STRING81|COMP1COUNTRY1|null|null|null|null|null|No||2|123|null|null null|null|null|null|VALUE81|10102416|10102416|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE27|null|null|2022-01-05|COUNTRY1|null|STRING81|null|AC|null|null|STRING81|2022-01-11|2022-01-06|81351179|VALUE1|null|null|CITY75|888420181|null|2300225|404|null|RER|RCR|XCX|null|null|null|STRING122|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING124|STRING123|STRING123|null|null|null|null|null|null|null|230042|null|null|101|STRING1|null|null|null|null|PCP|101|6500123.00|null|101.000|101.000|6500123.00|null|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY75|885281|8123401079|STRING123|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING81|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING82|null|null|null|null|null|124814080|null|STRING1|2140013|2022-01-06|null|null|null|STRING81|COMP1COUNTRY1|null|null|null|null|null|No||2|124|null|null 
null|null|null|null|VALUE81|10102417|10102417|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE27|null|null|2022-01-05|COUNTRY1|null|STRING81|null|AC|null|null|STRING81|2022-01-11|2022-01-06|81351179|VALUE1|null|null|CITY75|888420181|null|2300225|404|null|RER|RCR|XCX|null|null|null|STRING123|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING125|STRING123|STRING123|null|null|15024.00|null|15024.00|null|null|230042|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|null|101.000|101.000|6500148.00|85048.00|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY75|885281|8123401079|STRING123|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING81|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING82|null|null|null|null|null|124814080|null|STRING1|2140013|2022-01-06|null|null|null|STRING81|COMP1COUNTRY1|null|null|null|null|null|No||2|125|null|null null|null|null|null|VALUE81|10102415|10102415|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING81|null|AC|null|null|STRING81|2022-01-11|2022-01-06|81351179|VALUE1|null|null|CITY75|888420181|null|2300224|404|null|RER|RCR|XCX|null|null|null|STRING124|null|101|null|null|null|1003|null|null|null|null|null|STRING126|STRING123|STRING123|null|null|15003.00|null|15003.00|null|null|230041|null|null|101|STRING1|null|null|null|null|PCP|101|6500157.00|null|101.000|101.000|6500157.00|85049.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY75|885281|8123401079|STRING123|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING81|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING82|null|null|null|null|null|124814080|null|STRING1|2140013|2022-01-06|null|null|null|STRING81|COMP1COUNTRY1|null|null|null|null|null|No||2|126|null|null null|null|null|null|VALUE82|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING82|null|AC|null|null|STRING82|2022-01-11|2022-01-06|81351180|VALUE1|null|null|CITY76|888420182|null|2300185|404|null|RER|CRC|XCX|null|null|null|STRING125|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING127|STRING128|STRING128|null|null|15014.00|null|15014.00|null|null|230035|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85023.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY76|885282|8123401080|STRING128|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING82|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING83|null|null|null|null|null|124814081|null|STRING1|2140064|2022-01-06|null|null|null|STRING82|COMP1COUNTRY1|null|null|null|null|null|No||2|127|null|null 
null|null|null|null|VALUE83|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING83|null|AC|null|null|STRING83|2022-01-11|2022-01-06|81351181|VALUE1|null|null|CITY77|888420183|null|2300121|404|null|RER|RCR|XCX|null|null|null|STRING126|null|101|null|null|null|1002|null|null|null|null|null|STRING128|STRING129|STRING129|null|null|15003.00|null|15003.00|null|null|230016|null|null|101|STRING1|null|null|null|null|PCP|101|6500123.00|null|101.000|101.000|6500123.00|85016.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY77|885283|8123401081|STRING129|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING83|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING84|null|null|null|null|null|124814082|null|STRING1|2140011|2022-01-06|null|null|null|STRING83|COMP1COUNTRY1|null|null|null|null|null|No||2|128|null|null null|null|null|null|VALUE84|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING84|null|AC|null|null|STRING84|2022-01-11|2022-01-06|81351182|VALUE1|null|null|CITY78|888420184|null|2300103|404|null|RER|CRC|XCX|null|null|null|STRING127|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING129|STRING130|STRING130|null|null|null|null|null|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500126.00|null|101.000|101.000|6500126.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY78|885284|8123401082|STRING130|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING84|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING85|null|null|null|null|null|124814083|null|STRING1|2140065|2022-01-06|null|null|null|STRING84|COMP1COUNTRY1|null|null|STRING7|STRING2|null|Yes||2|129|null|null null|null|null|null|VALUE85|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-15|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING85|null|AC|null|null|STRING85|null|2022-01-06|81351183|VALUE1|null|null|CITY79|888420185|null|2300135|404|null|RER|RCR|XCX|null|null|null|STRING128|null|101|null|null|null|1001|null|null|null|null|null|STRING130|STRING131|STRING131|null|null|15003.00|null|15003.00|null|null|230022|null|null|101|STRING1|null|null|null|null|PCP|101|6500131.00|null|101.000|101.000|6500131.00|85050.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY79|885285|8123401083|STRING131|null|2022-01-06|STRING6|10020.000|STRING2|STORE1|STRING1|TYPE3|STRING85|10240402|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|null|1923002|1923001|null|null|null|null|10349200.00|STRING86|null|null|null|null|null|124814084|null|STRING1|2140033|2022-01-06|null|null|null|STRING85|COMP1COUNTRY1|null|null|null|null|null|No||2|130|null|null 
null|null|null|null|VALUE85|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING85|null|AC|null|null|STRING85|2022-01-10|2022-01-06|81351183|VALUE1|null|null|CITY79|888420185|null|2300232|404|null|RER|RCR|XCX|null|null|STRING6|STRING30|null|101|null|null|null|1002|null|null|null|null|null|STRING131|STRING131|STRING131|null|null|15003.00|null|15003.00|null|null|230043|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85004.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY79|885285|8123401083|STRING131|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE2|STRING85|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING87|null|EFL|null|null|null|124814084|null|STRING1|2140033|2022-01-06|null|null|null|STRING85|COMP1COUNTRY1|null|null|null|null|null|No||2|131|null|null null|null|null|null|VALUE86|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING86|null|AC|null|null|STRING86|2022-01-11|2022-01-06|81351184|VALUE1|null|null|CITY80|888420186|null|2300233|404|null|RER|RCR|XCX|null|null|null|STRING129|null|101|null|null|null|1001|null|null|null|null|null|STRING132|STRING133|STRING133|null|null|15003.00|null|15003.00|null|null|230044|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY80|885286|8123401084|STRING133|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING86|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING88|null|null|null|null|null|124814085|null|STRING1|2140066|2022-01-06|null|null|null|STRING86|COMP1COUNTRY1|null|null|null|null|null|No||2|132|null|null null|null|null|null|VALUE86|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING86|null|AC|null|null|STRING86|2022-01-11|2022-01-06|81351184|VALUE1|null|null|CITY80|888420186|null|2300234|404|null|RER|RCR|XCX|null|null|null|STRING130|null|101|null|null|null|1001|null|null|null|null|null|STRING133|STRING133|STRING133|null|null|15003.00|null|15003.00|null|null|230045|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY80|885286|8123401084|STRING133|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING86|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING88|null|null|null|null|null|124814085|null|STRING1|2140066|2022-01-06|null|null|null|STRING86|COMP1COUNTRY1|null|null|null|null|null|No||2|133|null|null 
null|null|null|null|VALUE86|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING86|null|AC|null|null|STRING86|2022-01-11|2022-01-06|81351184|VALUE1|null|null|CITY80|888420186|null|2300142|404|null|RER|CRC|XCX|null|null|null|STRING131|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING134|STRING133|STRING133|null|null|null|null|null|null|null|230026|null|null|101|STRING1|null|null|null|null|PCP|101|6500145.00|null|101.000|101.000|6500145.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY80|885286|8123401084|STRING133|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING86|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING88|null|null|null|null|null|124814085|null|STRING1|2140066|2022-01-06|null|null|null|STRING86|COMP1COUNTRY1|null|null|null|null|null|No||2|134|null|null null|null|null|null|VALUE87|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING87|null|AC|null|null|STRING87|2022-01-13|2022-01-06|81351185|VALUE1|null|null|CITY81|888420187|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING132|null|101|null|null|null|1002|null|null|null|null|null|STRING135|STRING136|STRING136|null|null|15003.00|null|15003.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500150.00|null|101.000|101.000|6500150.00|85034.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY81|885287|8123401085|STRING136|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING87|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING89|null|null|null|null|null|124814086|null|STRING1|2140067|2022-01-06|null|null|null|STRING87|COMP1COUNTRY1|null|null|null|null|null|No||2|135|null|null null|null|null|null|VALUE88|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY4|null|STRING88|null|AC|null|null|STRING88|2022-01-11|2022-01-06|81351186|VALUE1|null|null|CITY82|888420188|null|2300111|404|null|RER|RCR|XCX|null|null|null|STRING133|null|101|null|null|null|1001|null|null|null|null|null|STRING136|STRING137|STRING137|null|null|15003.00|null|15003.00|null|null|230008|null|null|101|STRING1|null|null|null|null|PCP|101|6500158.00|null|101.000|101.000|6500158.00|85051.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY82|885288|8123401086|STRING137|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING88|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING90|null|null|null|null|null|124814087|null|STRING1|2140068|2022-01-06|null|null|null|STRING88|COMP1COUNTRY4|null|null|null|null|null|No||2|136|null|null 
null|null|null|null|VALUE89|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING89|null|AC|null|null|STRING89|2022-01-11|2022-01-06|81351187|VALUE1|null|null|CITY83|888420189|null|2300116|404|null|RER|RCR|XCX|null|null|null|STRING134|null|101|null|null|null|1001|null|null|null|null|null|STRING137|STRING138|STRING138|null|null|15003.00|null|15003.00|null|null|230013|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY83|885289|8123401087|STRING138|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING89|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING91|null|null|null|null|null|124814088|null|STRING1|2140069|2022-01-06|null|null|null|STRING89|COMP1COUNTRY1|null|null|null|null|null|No||2|137|null|null null|null|null|null|VALUE90|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING90|METHOD7|AC|null|null|STRING90|2022-01-13|2022-01-06|81351188|VALUE1|null|null|CITY84|888420190|null|2300128|404|null|RER|CRC|XCX|null|null|null|STRING135|null|101|null|null|null|1003|STRING2|null|null|null|null|STRING138|STRING139|STRING139|null|null|null|null|null|null|null|230018|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY84|885290|8123401088|STRING139|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING90|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING92|null|null|null|null|null|124814089|null|STRING1|2140070|2022-01-06|null|null|null|STRING90|COMP1COUNTRY1|null|null|null|null|null|No||2|138|null|null null|null|null|null|VALUE91|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE28|null|null|2022-01-05|COUNTRY4|null|STRING91|null|AC|null|null|STRING91|2022-01-11|2022-01-06|81351189|VALUE1|null|null|CITY85|888420191|null|2300240|404|null|RER|RCR|XCX|null|null|null|STRING136|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING139|STRING140|STRING140|null|null|15026.00|null|15026.00|null|null|230046|null|null|101|STRING1|null|null|null|null|PCP|101|6500136.00|null|101.000|101.000|6500136.00|85052.00|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY85|885291|8123401089|STRING140|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING91|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING93|null|null|null|null|null|124814090|null|STRING1|2140071|2022-01-06|null|null|null|STRING91|COMP1COUNTRY4|null|null|null|null|null|No||2|139|null|null 
null|null|null|null|VALUE91|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE28|null|null|2022-01-05|COUNTRY4|null|STRING91|null|AC|null|null|STRING91|2022-01-11|2022-01-06|81351189|VALUE1|null|null|CITY85|888420191|null|2300120|404|null|RER|RCR|XCX|null|null|null|STRING137|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING140|STRING140|STRING140|null|null|15026.00|null|15026.00|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500136.00|null|101.000|101.000|6500136.00|85052.00|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY85|885291|8123401089|STRING140|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING91|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING93|null|null|null|null|null|124814090|null|STRING1|2140071|2022-01-06|null|null|null|STRING91|COMP1COUNTRY4|null|null|null|null|null|No||2|140|null|null null|null|null|null|VALUE92|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE29|null|null|2022-01-05|COUNTRY1|null|STRING92|null|AC|null|null|STRING92|2022-01-13|2022-01-06|81351190|VALUE1|null|null|CITY86|888420192|null|2300120|404|null|RER|RCR|XCX|null|null|null|STRING138|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING141|STRING142|STRING142|null|null|null|null|null|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|null|101.000|101.000|101.000|101.000|STRING8|STRING1|2022-01-06|CITY86|885292|8123401090|STRING142|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING92|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING94|null|null|null|null|null|124814091|null|STRING1|2140072|2022-01-06|null|null|null|STRING92|COMP1COUNTRY1|null|null|null|null|null|No||2|141|null|null null|null|null|null|VALUE93|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING93|null|AC|null|null|STRING93|2022-01-11|2022-01-06|81351191|VALUE1|null|null|CITY46|888420193|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING139|null|101|null|null|null|1002|null|null|null|null|null|STRING142|STRING143|STRING143|null|null|15003.00|null|15003.00|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY46|885293|8123401091|STRING143|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING93|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING95|null|null|null|null|null|124814092|null|STRING1|2140028|2022-01-06|null|null|null|STRING93|COMP1COUNTRY3|null|null|null|null|null|No||2|142|null|null 
null|null|null|null|VALUE93|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING93|null|AC|null|null|STRING93|2022-01-11|2022-01-06|81351191|VALUE1|null|null|CITY46|888420193|null|2300113|404|null|RER|RCR|XCX|null|null|null|STRING140|null|101|null|null|null|1002|null|null|null|null|null|STRING143|STRING143|STRING143|null|null|15003.00|null|15003.00|null|null|230010|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY46|885293|8123401091|STRING143|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING93|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING95|null|null|null|null|null|124814092|null|STRING1|2140028|2022-01-06|null|null|null|STRING93|COMP1COUNTRY3|null|null|null|null|null|No||2|143|null|null null|null|null|null|VALUE93|10102415|10102415|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING93|null|AC|null|null|STRING93|2022-01-11|2022-01-06|81351191|VALUE1|null|null|CITY46|888420193|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING141|null|101|null|null|null|1002|null|null|null|null|null|STRING144|STRING143|STRING143|null|null|15003.00|null|15003.00|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85024.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY46|885293|8123401091|STRING143|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING93|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING95|null|null|null|null|null|124814092|null|STRING1|2140028|2022-01-06|null|null|null|STRING93|COMP1COUNTRY3|null|null|null|null|null|No||2|144|null|null null|null|null|null|VALUE93|10102416|10102416|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING93|null|AC|null|null|STRING93|2022-01-11|2022-01-06|81351191|VALUE1|null|null|CITY46|888420193|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING142|null|101|null|null|null|1002|null|null|null|null|null|STRING145|STRING143|STRING143|null|null|15003.00|null|15003.00|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85024.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY46|885293|8123401091|STRING143|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING93|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING95|null|null|null|null|null|124814092|null|STRING1|2140028|2022-01-06|null|null|null|STRING93|COMP1COUNTRY3|null|null|null|null|null|No||2|145|null|null 
null|null|null|null|VALUE93|10102417|10102417|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE2|null|null|2022-01-05|COUNTRY3|null|STRING93|null|AC|null|null|STRING93|2022-01-11|2022-01-06|81351191|VALUE1|null|null|CITY46|888420193|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING143|null|101|null|null|null|1002|STRING3|null|null|null|null|STRING146|STRING143|STRING143|null|null|null|null|null|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500156.00|null|101.000|101.000|6500156.00|null|101.000|101.000|101.000|101.000|STRING3|STRING1|2022-01-06|CITY46|885293|8123401091|STRING143|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING93|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING95|null|null|null|null|null|124814092|null|STRING1|2140028|2022-01-06|null|null|null|STRING93|COMP1COUNTRY3|null|null|null|null|null|No||2|146|null|null null|null|null|null|VALUE93|10102418|10102418|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY3|null|STRING93|null|AC|null|null|STRING93|2022-01-11|2022-01-06|81351191|VALUE1|null|null|CITY46|888420193|null|2300112|404|null|RER|CRC|XCX|null|null|null|STRING144|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING147|STRING143|STRING143|null|null|null|null|null|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY46|885293|8123401091|STRING143|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING93|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING95|null|null|null|null|null|124814092|null|STRING1|2140028|2022-01-06|null|null|null|STRING93|COMP1COUNTRY3|null|null|null|null|null|No||2|147|null|null null|null|null|null|VALUE94|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE2|null|null|2022-01-05|COUNTRY1|null|STRING94|null|AC|null|null|STRING94|2022-01-10|2022-01-06|81351192|VALUE1|null|null|CITY44|888420194|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING145|null|101|null|null|null|1001|STRING3|null|null|null|null|STRING148|STRING149|STRING149|null|null|15027.00|null|15027.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|null|101.000|101.000|6500148.00|85053.00|101.000|101.000|101.000|101.000|STRING3|STRING1|2022-01-06|CITY44|885294|8123401092|STRING149|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING94|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING96|null|null|null|null|null|124814093|null|STRING1|2140073|2022-01-06|null|null|null|STRING94|COMP1COUNTRY1|null|null|null|null|null|No||2|148|null|null 
null|null|null|null|VALUE95|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY2|null|STRING95|null|AC|null|null|STRING95|2022-01-11|2022-01-06|81351193|VALUE1|null|null|CITY87|888420195|null|2300240|404|null|RER|RCR|XCX|null|null|null|STRING146|null|101|null|null|null|1001|null|null|null|null|null|STRING149|STRING150|STRING150|null|null|15003.00|null|15003.00|null|null|230046|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY87|885295|8123401093|STRING150|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING95|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING97|null|null|null|null|null|124814094|null|STRING1|2140074|2022-01-06|null|null|null|STRING95|COMP1COUNTRY2|null|null|null|null|null|No||2|149|null|null null|null|null|null|VALUE97|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING97|null|AC|null|null|STRING97|2022-01-10|2022-01-06|81351195|VALUE1|null|null|CITY88|888420197|null|2300165|404|null|RER|CRC|XCX|null|null|null|STRING150|null|101|null|null|null|1002|STRING9|null|null|null|null|STRING153|STRING154|STRING154|null|null|null|null|null|null|null|230031|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY88|885297|8123401095|STRING154|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING97|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING99|null|null|null|null|null|124814096|null|STRING1|2140076|2022-01-06|null|null|null|STRING97|COMP1COUNTRY6|null|null|null|null|null|No||2|153|null|null null|null|null|null|VALUE97|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING97|null|AC|null|null|STRING97|2022-01-10|2022-01-06|81351195|VALUE1|null|null|CITY88|888420197|null|2300133|404|null|RER|CRC|XCX|null|null|null|STRING151|null|101|null|null|null|1002|STRING9|null|null|null|null|STRING154|STRING154|STRING154|null|null|15024.00|null|15024.00|null|null|230020|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|85045.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY88|885297|8123401095|STRING154|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING97|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING99|null|null|null|null|null|124814096|null|STRING1|2140076|2022-01-06|null|null|null|STRING97|COMP1COUNTRY6|null|null|null|null|null|No||2|154|null|null 
null|null|null|null|VALUE98|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING98|null|AC|null|null|STRING98|2022-01-11|2022-01-06|81351196|VALUE1|null|null|CITY89|888420198|null|2300135|404|null|RER|RCR|XCX|null|null|null|STRING152|null|101|null|null|null|1001|null|null|null|null|null|STRING155|STRING156|STRING156|null|null|15003.00|null|15003.00|null|null|230022|null|null|101|STRING1|null|null|null|null|PCP|101|6500159.00|null|101.000|101.000|6500159.00|85054.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY89|885298|8123401096|STRING156|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING98|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING100|null|null|null|null|null|124814097|null|STRING1|2140077|2022-01-06|null|null|null|STRING98|COMP1COUNTRY1|null|null|null|null|null|No||2|155|null|null null|null|null|null|VALUE99|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING99|null|AC|null|null|STRING99|2022-01-11|2022-01-06|81351197|VALUE1|null|null|CITY89|888420199|null|2300180|404|null|RER|CRC|XCX|null|null|null|STRING153|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING156|STRING157|STRING157|null|null|null|null|null|null|null|230034|null|null|101|STRING1|null|null|null|null|PCP|101|6500160.00|null|101.000|101.000|6500160.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY89|885299|8123401097|STRING157|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING99|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING101|null|null|null|null|null|124814098|null|STRING1|2140078|2022-01-06|null|null|null|STRING99|COMP1COUNTRY1|null|null|STRING1|STRING1|null|Yes||2|156|null|null null|null|null|null|VALUE99|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING99|null|AC|null|null|STRING99|2022-01-11|2022-01-06|81351197|VALUE1|null|null|CITY89|888420199|null|2300114|404|null|RER|CRC|XCX|null|null|null|STRING154|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING157|STRING157|STRING157|null|null|null|null|null|null|null|230011|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY89|885299|8123401097|STRING157|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING99|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING101|null|null|null|null|null|124814098|null|STRING1|2140078|2022-01-06|null|null|null|STRING99|COMP1COUNTRY1|null|null|STRING1|STRING1|null|Yes||2|157|null|null 
null|null|null|null|VALUE99|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING99|null|AC|null|null|STRING99|2022-01-11|2022-01-06|81351197|VALUE1|null|null|CITY89|888420199|null|2300114|404|null|RER|CRC|XCX|null|null|null|STRING155|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING158|STRING157|STRING157|null|null|null|null|null|null|null|230011|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY89|885299|8123401097|STRING157|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING99|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING101|null|null|null|null|null|124814098|null|STRING1|2140078|2022-01-06|null|null|null|STRING99|COMP1COUNTRY1|null|null|STRING1|STRING1|null|Yes||2|158|null|null null|null|null|null|VALUE100|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING100|null|AC|null|null|STRING100|2022-01-10|2022-01-06|81351198|VALUE1|null|null|CITY27|888420200|null|2300128|404|null|RER|CRC|XCX|null|null|null|STRING156|null|101|null|null|null|1003|STRING2|null|null|null|null|STRING159|STRING160|STRING160|null|null|null|null|null|null|null|230018|null|null|101|STRING1|null|null|null|null|PCP|101|6500152.00|null|101.000|101.000|6500152.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY27|885300|8123401098|STRING160|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING100|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING102|null|null|null|null|null|124814099|null|STRING1|2140079|2022-01-06|null|null|null|STRING100|COMP1COUNTRY1|null|null|STRING1|STRING1|STRING4|Yes||2|159|null|null null|null|null|null|VALUE100|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE30|null|null|2022-01-05|COUNTRY1|null|STRING100|null|AC|null|null|STRING100|2022-01-10|2022-01-06|81351198|VALUE1|null|null|CITY27|888420200|null|2300108|404|null|RER|RCR|XCX|null|null|null|STRING157|null|101|null|null|null|1002|STRING1|null|null|null|null|STRING160|STRING160|STRING160|null|null|null|null|null|null|null|230006|null|null|101|STRING1|null|null|null|null|PCP|101|6500137.00|null|101.000|101.000|6500137.00|null|101.000|101.000|101.000|101.000|STRING1|STRING1|2022-01-06|CITY27|885300|8123401098|STRING160|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING100|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING102|null|null|null|null|null|124814099|null|STRING1|2140079|2022-01-06|null|null|null|STRING100|COMP1COUNTRY1|null|null|null|null|null|No||2|160|null|null 
null|null|null|null|VALUE101|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|VALUE31|null|null|2022-01-05|COUNTRY2|null|STRING101|null|AA|null|null|STRING101|2022-01-13|2022-01-06|81351199|VALUE1|null|null|CITY90|888420201|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING158|null|101|null|null|null|1001|STRING1|null|null|null|null|STRING161|STRING162|STRING162|null|null|15028.00|null|15028.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500131.00|null|101.000|101.000|6500131.00|85055.00|101.000|101.000|101.000|101.000|STRING16|STRING1|2022-01-06|CITY90|885301|8123401099|STRING162|null|2022-01-06|STRING4|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING101|10240411|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING103|null|null|null|null|null|124814100|null|null|null|null|null|null|null|STRING101|COMP1COUNTRY2|null|null|null|null|null|No||2|161|null|null null|null|null|null|VALUE102|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING102|null|AC|null|null|STRING102|2022-01-13|2022-01-06|81351200|VALUE1|null|null|CITY91|888420202|null|2300140|404|null|RER|CRC|XCX|null|null|null|STRING159|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING162|STRING163|STRING163|null|null|null|null|null|null|null|230025|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY91|885302|8123401100|STRING163|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING102|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING104|null|null|null|null|null|124814101|null|STRING1|2140080|2022-01-06|null|null|null|STRING102|COMP1COUNTRY1|null|null|null|null|null|No||2|162|null|null null|null|null|null|VALUE102|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING102|null|AC|null|null|STRING102|2022-01-13|2022-01-06|81351200|VALUE1|null|null|CITY91|888420202|null|2300121|404|null|RER|CRC|XCX|null|null|null|STRING160|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING163|STRING163|STRING163|null|null|null|null|null|null|null|230016|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY91|885302|8123401100|STRING163|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING102|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING104|null|null|null|null|null|124814101|null|STRING1|2140080|2022-01-06|null|null|null|STRING102|COMP1COUNTRY1|null|null|null|null|null|No||2|163|null|null 
null|null|null|null|VALUE104|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING104|null|AC|null|null|STRING104|2022-01-11|2022-01-06|81351202|VALUE1|null|null|CITY93|888420204|null|2300240|404|null|RER|CRC|XCX|null|null|null|STRING162|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING165|STRING166|STRING166|null|null|15002.00|null|15002.00|null|null|230046|null|null|101|STRING1|null|null|null|null|PCP|101|6500126.00|null|101.000|101.000|6500126.00|85003.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY93|885304|8123401102|STRING166|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING104|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING106|null|null|null|null|null|124814103|null|STRING1|2140082|2022-01-06|null|null|null|STRING104|COMP1COUNTRY1|null|null|null|null|null|No||2|165|null|null null|null|null|null|VALUE105|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AA|2022-01-15|null|null|null|null|2022-01-05|3901235|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY1|null|STRING105|null|AA|null|null|STRING105|null|2022-01-06|81351199|VALUE1|null|null|CITY94|888420205|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING163|null|101|null|null|null|1001|null|null|null|null|null|STRING166|STRING167|STRING167|null|null|15003.00|null|15003.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING2|2022-01-06|CITY94|885305|8123401099|STRING167|null|2022-01-06|STRING4|10020.000|STRING4|STORE1|STRING3|TYPE1|STRING105|10240412|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING2|null|1923002|1923001|null|null|null|null|10349200.00|null|null|null|null|null|null|124814104|3059001|null|null|null|null|null|null|STRING105|COMP1COUNTRY1|null|null|null|null|null|No||2|166|null|null null|null|null|null|VALUE105|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-15|null|null|null|null|2022-01-05|3901235|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY1|null|STRING105|null|AA|null|null|STRING105|null|2022-01-06|81351199|VALUE1|null|null|CITY94|888420205|null|2300120|404|null|RER|RCR|XCX|null|null|null|STRING163|null|101|null|null|null|1001|null|null|null|null|null|STRING167|STRING167|STRING167|null|null|15003.00|null|15003.00|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING2|2022-01-06|CITY94|885305|8123401099|STRING167|null|2022-01-06|STRING4|10020.000|STRING4|STORE1|STRING3|TYPE1|STRING105|10240412|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING2|null|1923002|1923001|null|null|null|null|10349200.00|null|null|null|null|null|null|124814104|3059001|null|null|null|null|null|null|STRING105|COMP1COUNTRY1|null|null|null|null|null|No||2|167|null|null 
null|null|null|null|VALUE105|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-15|null|null|null|null|2022-01-05|3901235|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY1|null|STRING105|null|AA|null|null|STRING105|null|2022-01-06|81351199|VALUE1|null|null|CITY94|888420205|null|2300269|404|null|RER|RCR|XCX|null|null|null|STRING163|null|101|null|null|null|1001|null|null|null|null|null|STRING168|STRING167|STRING167|null|null|15003.00|null|15003.00|null|null|230047|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING2|2022-01-06|CITY94|885305|8123401099|STRING167|null|2022-01-06|STRING4|10020.000|STRING4|STORE1|STRING3|TYPE1|STRING105|10240412|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING2|null|1923002|1923001|null|null|null|null|10349200.00|null|null|null|null|null|null|124814104|3059001|null|null|null|null|null|null|STRING105|COMP1COUNTRY1|null|null|null|null|null|No||2|168|null|null null|null|null|null|VALUE105|10102416|10102416|null|3,02E+25|3,02E+25|null|null| 
|null|AA|2022-01-15|null|null|null|null|2022-01-05|3901235|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY1|null|STRING105|null|AA|null|null|STRING105|null|2022-01-06|81351199|VALUE1|null|null|CITY94|888420205|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING164|null|101|null|null|null|1001|null|null|null|null|null|STRING169|STRING167|STRING167|null|null|15003.00|null|15003.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING2|2022-01-06|CITY94|885305|8123401099|STRING167|null|2022-01-06|STRING4|10020.000|STRING4|STORE1|STRING3|TYPE1|STRING105|10240412|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING2|null|1923002|1923001|null|null|null|null|10349200.00|null|null|null|null|null|null|124814104|3059001|null|null|null|null|null|null|STRING105|COMP1COUNTRY1|null|null|null|null|null|No||2|169|null|null null|null|null|null|VALUE105|10102417|10102417|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-15|null|null|null|null|2022-01-05|3901235|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY1|null|STRING105|null|AA|null|null|STRING105|null|2022-01-06|81351199|VALUE1|null|null|CITY94|888420205|null|2300120|404|null|RER|RCR|XCX|null|null|null|STRING164|null|101|null|null|null|1001|null|null|null|null|null|STRING170|STRING167|STRING167|null|null|15003.00|null|15003.00|null|null|230015|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING2|2022-01-06|CITY94|885305|8123401099|STRING167|null|2022-01-06|STRING4|10020.000|STRING4|STORE1|STRING3|TYPE1|STRING105|10240412|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING2|null|1923002|1923001|null|null|null|null|10349200.00|null|null|null|null|null|null|124814104|3059001|null|null|null|null|null|null|STRING105|COMP1COUNTRY1|null|null|null|null|null|No||2|170|null|null 
null|null|null|null|VALUE105|10102418|10102418|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-15|null|null|null|null|2022-01-05|3901235|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY1|null|STRING105|null|AA|null|null|STRING105|null|2022-01-06|81351199|VALUE1|null|null|CITY94|888420205|null|2300269|404|null|RER|RCR|XCX|null|null|null|STRING164|null|101|null|null|null|1001|null|null|null|null|null|STRING171|STRING167|STRING167|null|null|15003.00|null|15003.00|null|null|230047|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING2|2022-01-06|CITY94|885305|8123401099|STRING167|null|2022-01-06|STRING4|10020.000|STRING4|STORE1|STRING3|TYPE1|STRING105|10240412|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING2|null|1923002|1923001|null|null|null|null|10349200.00|null|null|null|null|null|null|124814104|3059001|null|null|null|null|null|null|STRING105|COMP1COUNTRY1|null|null|null|null|null|No||2|171|null|null null|null|null|null|VALUE107|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE32|null|null|2022-01-05|COUNTRY1|null|STRING107|null|AC|null|null|STRING107|2022-01-11|2022-01-06|81351204|VALUE1|null|null|CITY95|888420207|null|2300117|404|null|RER|RCR|XCX|null|null|null|STRING167|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING175|STRING176|STRING176|null|null|15020.00|null|15020.00|null|null|230014|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85023.00|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-06|CITY95|885307|8123401104|STRING176|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING107|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING109|null|null|null|null|null|124814106|null|STRING1|2140084|2022-01-06|null|null|null|STRING107|COMP1COUNTRY1|null|null|null|null|null|No||2|175|null|null null|null|null|null|VALUE108|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING108|null|AC|null|null|STRING108|2022-01-11|2022-01-06|81351205|VALUE1|null|null|CITY20|888420208|null|2300101|404|null|RER|CRC|XCX|null|null|null|STRING168|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING176|STRING177|STRING177|null|null|null|null|null|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY20|885308|8123401105|STRING177|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING108|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING110|null|null|null|null|null|124814107|null|STRING1|2140017|2022-01-06|null|null|null|STRING108|COMP1COUNTRY1|null|null|STRING7|STRING2|STRING5|Yes||2|176|null|null 
null|null|null|null|VALUE108|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE33|null|null|2022-01-05|COUNTRY1|null|STRING108|null|AC|null|null|STRING108|2022-01-11|2022-01-06|81351205|VALUE1|null|null|CITY20|888420208|null|2300101|404|null|RER|RCR|XCX|null|null|null|STRING169|null|101|null|null|null|1002|STRING10|null|null|null|null|STRING177|STRING177|STRING177|null|null|15014.00|null|15014.00|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85056.00|101.000|101.000|101.000|101.000|STRING7|STRING1|2022-01-06|CITY20|885308|8123401105|STRING177|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING108|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING110|null|null|null|null|null|124814107|null|STRING1|2140017|2022-01-06|null|null|null|STRING108|COMP1COUNTRY1|null|null|null|null|null|No||2|177|null|null null|null|null|null|VALUE108|10102415|10102415|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING108|null|AC|null|null|STRING108|2022-01-11|2022-01-06|81351205|VALUE1|null|null|CITY20|888420208|null|2300143|404|null|RER|CRC|XCX|null|null|null|STRING170|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING178|STRING177|STRING177|null|null|15014.00|null|15014.00|null|null|230027|null|null|101|STRING1|null|null|null|null|PCP|101|6500140.00|null|101.000|101.000|6500140.00|85056.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY20|885308|8123401105|STRING177|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING108|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING110|null|null|null|null|null|124814107|null|STRING1|2140017|2022-01-06|null|null|null|STRING108|COMP1COUNTRY1|null|null|STRING7|STRING2|STRING5|Yes||2|178|null|null null|null|null|null|VALUE109|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE34|null|null|2022-01-05|COUNTRY6|null|STRING109|null|AC|null|null|STRING109|2022-01-10|2022-01-06|81351206|VALUE1|null|null|CITY96|888420209|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING171|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING179|STRING180|STRING180|null|null|15022.00|null|15022.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|85013.00|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-06|CITY96|885309|8123401106|STRING180|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING109|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING1|STRING2|1923001|1923001|null|null|null|null|null|STRING111|null|null|null|null|null|124814108|null|STRING1|2140024|2022-01-06|null|null|null|STRING109|COMP1COUNTRY6|null|null|null|null|null|No||2|179|null|null null|null|null|null|VALUE111|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE36|null|null|2022-01-05|COUNTRY1|null|STRING111|null|AC|null|null|STRING111|2022-01-11|2022-01-06|81351208|VALUE1|null|null|CITY98|888420211|null|2300180|404|null|RER|RCR|XCX|null|null|null|STRING173|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING181|STRING182|STRING182|null|null|15022.00|null|15022.00|null|null|230034|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85041.00|101.000|101.000|101.000|101.000|STRING18|STRING1|2022-01-06|CITY98|885311|8123401108|STRING182|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING111|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING113|null|null|null|null|null|124814110|null|STRING1|2140086|2022-01-06|null|null|null|STRING111|COMP1COUNTRY1|null|null|null|null|null|No||2|181|null|null null|null|null|null|VALUE111|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE36|null|null|2022-01-05|COUNTRY1|null|STRING111|null|AC|null|null|STRING111|2022-01-11|2022-01-06|81351208|VALUE1|null|null|CITY98|888420211|null|2300112|404|null|RER|RCR|XCX|null|null|null|STRING174|null|101|null|null|null|1001|STRING6|null|null|null|null|STRING182|STRING182|STRING182|null|null|null|null|null|null|null|230009|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|null|101.000|101.000|101.000|101.000|STRING18|STRING1|2022-01-06|CITY98|885311|8123401108|STRING182|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING111|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING113|null|null|null|null|null|124814110|null|STRING1|2140086|2022-01-06|null|null|null|STRING111|COMP1COUNTRY1|null|null|null|null|null|No||2|182|null|null null|null|null|null|VALUE112|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-14|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING112|null|AC|null|null|STRING112|2022-01-14|2022-01-06|81351209|VALUE1|null|null|CITY99|888420212|null|2300101|404|null|RER|RCR|XCX|null|null|STRING7|STRING175|null|101|null|null|null|1001|null|null|null|null|null|STRING183|STRING184|STRING184|null|null|15003.00|null|15003.00|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|85006.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY99|885312|8123401109|STRING184|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE2|STRING112|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING114|null|EFL|null|null|null|124814111|null|STRING1|2140087|2022-01-06|null|null|null|STRING112|COMP1COUNTRY1|null|null|null|null|null|No||2|183|null|null 
null|null|null|null|VALUE113|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-15|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY4|null|STRING113|METHOD8|AA|null|null|STRING113|null|2022-01-06|81351210|VALUE1|null|null|CITY100|888420213|null|2300116|404|null|RER|RCR|XCX|null|null|null|STRING176|null|101|null|null|null|1001|null|null|null|null|null|STRING184|STRING185|STRING185|null|null|15003.00|null|15003.00|null|null|230013|null|null|101|STRING1|null|null|null|null|PCP|101|6500158.00|null|101.000|101.000|6500158.00|85051.00|101.000|101.000|101.000|101.000|null|STRING2|2022-01-06|CITY100|885313|8123401110|STRING185|null|2022-01-06|STRING4|10020.000|STRING1|STORE1|STRING3|TYPE1|STRING113|10240413|null|null|null|null|null|null|COMPANY1|COUNTRYAB4|STRING2|STRING1|1923002|1923001|null|null|null|null|10349200.00|STRING115|null|null|null|null|null|124814112|3059003|null|null|null|null|null|null|STRING113|COMP1COUNTRY4|null|null|null|null|null|No||2|184|null|null null|null|null|null|VALUE114|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING114|METHOD9|AC|null|null|STRING114|2022-01-11|2022-01-06|81351211|VALUE1|null|null|CITY101|888420214|null|2300211|404|null|RER|RCR|XCX|null|null|null|STRING177|null|101|null|null|null|1003|null|null|null|null|null|STRING185|STRING186|STRING186|null|null|15003.00|null|15003.00|null|null|230040|null|null|101|STRING1|null|null|null|null|PCP|101|6500156.00|null|101.000|101.000|6500156.00|85023.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY101|885314|8123401111|STRING186|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING114|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING116|null|null|null|null|null|124814113|null|STRING1|2140088|2022-01-06|null|null|null|STRING114|COMP1COUNTRY1|null|null|null|null|null|No||2|185|null|null null|null|null|null|VALUE115|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING115|null|AC|null|null|STRING115|2022-01-11|2022-01-06|81351212|VALUE1|null|null|CITY102|888420215|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING132|null|101|null|null|null|1002|null|null|null|null|null|STRING135|STRING187|STRING187|null|null|15003.00|null|15003.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500150.00|null|101.000|101.000|6500150.00|85034.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY102|885315|8123401112|STRING187|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING115|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING117|null|null|null|null|null|124814114|null|STRING1|2140089|2022-01-06|null|null|null|STRING115|COMP1COUNTRY1|null|null|null|null|null|No||2|186|null|null null|null|null|null|VALUE116|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE37|null|null|2022-01-05|COUNTRY11|null|STRING116|null|AC|null|null|STRING116|2022-01-11|2022-01-06|81351213|VALUE1|null|null|CITY103|888420216|null|2300143|404|null|RER|RCR|XCX|null|null|null|STRING178|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING186|STRING188|STRING188|null|null|null|null|null|null|null|230027|null|null|101|STRING6|null|null|null|null|PCP|101|6500161.00|null|101.000|101.000|6500161.00|null|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-06|CITY103|885316|8123401113|STRING188|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING116|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB12|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING118|null|null|null|null|null|124814115|null|STRING1|2140090|2022-01-06|null|null|null|STRING116|COMP1COUNTRY10|null|null|null|null|null|No||2|187|null|null 
null|null|null|null|VALUE117|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING117|null|AC|null|null|STRING117|2022-01-10|2022-01-06|81351214|VALUE1|null|null|CITY104|888420217|null|2300240|404|null|RER|RCR|XCX|null|null|null|STRING179|null|101|null|null|null|1001|null|null|null|null|null|STRING187|STRING189|STRING189|null|null|15003.00|null|15003.00|null|null|230046|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85025.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY104|885317|8123401114|STRING189|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING117|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING119|null|null|null|null|null|124814116|null|STRING1|2140059|2022-01-06|null|null|null|STRING117|COMP1COUNTRY1|null|null|null|null|null|No||2|188|null|null null|null|null|null|VALUE118|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE38|null|null|2022-01-05|COUNTRY1|null|STRING118|null|AC|null|null|STRING118|2022-01-13|2022-01-06|81351215|VALUE1|null|null|CITY105|888420218|null|2300101|404|null|RER|RCR|XCX|null|null|null|STRING180|null|101|null|null|null|1002|STRING1|null|null|null|null|STRING188|STRING190|STRING190|null|null|15001.00|null|15001.00|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|85027.00|101.000|101.000|101.000|101.000|STRING1|STRING1|2022-01-06|CITY105|885318|8123401115|STRING190|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING118|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING120|null|null|null|null|null|124814117|null|STRING1|2140091|2022-01-06|null|null|null|STRING118|COMP1COUNTRY1|null|null|null|null|null|No||2|189|null|null null|null|null|null|VALUE119|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY6|null|STRING119|null|AC|null|null|STRING119|2022-01-11|2022-01-06|81351216|VALUE1|null|null|CITY106|888420219|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING181|null|101|null|null|null|1001|null|null|null|null|null|STRING189|STRING191|STRING191|null|null|15003.00|null|15003.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500137.00|null|101.000|101.000|6500137.00|85018.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY106|885319|8123401116|STRING191|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING119|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING121|null|null|null|null|null|124814118|null|STRING1|2140092|2022-01-06|null|null|null|STRING119|COMP1COUNTRY6|null|null|null|null|null|No||2|190|null|null null|null|null|null|VALUE120|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY6|null|STRING120|null|AA|null|null|STRING120|2022-01-10|2022-01-06|81351217|VALUE1|null|null|CITY107|888420220|null|2300105|404|null|RER|CRC|XCX|null|null|null|STRING182|null|101|null|null|null|1001|STRING9|null|null|null|null|STRING190|STRING192|STRING192|null|null|15029.00|null|15029.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500126.00|null|101.000|101.000|6500126.00|85057.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY107|885320|8123401117|STRING192|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING120|10240414|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING122|null|null|null|null|null|124814119|null|null|null|null|null|null|null|STRING120|COMP1COUNTRY6|null|null|STRING8|STRING2|STRING6|Yes||2|191|null|null 
null|null|null|null|VALUE120|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY6|null|STRING120|null|AA|null|null|STRING120|2022-01-10|2022-01-06|81351217|VALUE1|null|null|CITY107|888420220|null|2300293|404|null|RER|CRC|XCX|null|null|null|STRING183|null|101|null|null|null|1002|STRING9|null|null|null|null|STRING191|STRING192|STRING192|null|null|15004.00|null|15004.00|null|null|230048|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|85024.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-06|CITY107|885320|8123401117|STRING192|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING120|10240414|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING122|null|null|null|null|null|124814119|null|null|null|null|null|null|null|STRING120|COMP1COUNTRY6|null|null|STRING8|STRING2|STRING6|Yes||2|192|null|null null|null|null|null|VALUE120|10102415|10102415|null|3,02E+25|3,02E+25|null|null| 
|null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|VALUE39|null|null|2022-01-05|COUNTRY6|null|STRING120|null|AA|null|null|STRING120|2022-01-10|2022-01-06|81351217|VALUE1|null|null|CITY107|888420220|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING184|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING192|STRING192|STRING192|null|null|15022.00|null|15022.00|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500127.00|null|101.000|101.000|6500127.00|85041.00|101.000|101.000|101.000|101.000|STRING19|STRING1|2022-01-06|CITY107|885320|8123401117|STRING192|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING120|10240414|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING122|null|null|null|null|null|124814119|null|null|null|null|null|null|null|STRING120|COMP1COUNTRY6|null|null|null|null|null|No||2|193|null|null null|null|null|null|VALUE120|10102416|10102416|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|VALUE39|null|null|2022-01-05|COUNTRY6|null|STRING120|null|AA|null|null|STRING120|2022-01-10|2022-01-06|81351217|VALUE1|null|null|CITY107|888420220|null|2300128|404|null|RER|RCR|XCX|null|null|null|STRING185|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING193|STRING192|STRING192|null|null|null|null|null|null|null|230018|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|null|101.000|101.000|101.000|101.000|STRING19|STRING1|2022-01-06|CITY107|885320|8123401117|STRING192|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING120|10240414|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING122|null|null|null|null|null|124814119|null|null|null|null|null|null|null|STRING120|COMP1COUNTRY6|null|null|null|null|null|No||2|194|null|null 
null|null|null|null|VALUE120|10102417|10102417|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|VALUE39|null|null|2022-01-05|COUNTRY6|null|STRING120|null|AA|null|null|STRING120|2022-01-10|2022-01-06|81351217|VALUE1|null|null|CITY107|888420220|null|2300296|404|null|RER|RCR|XCX|null|null|null|STRING186|null|101|null|null|null|1003|STRING6|null|null|null|null|STRING194|STRING192|STRING192|null|null|null|null|null|null|null|230049|null|null|101|STRING1|null|null|null|null|PCP|101|6500132.00|null|101.000|101.000|6500132.00|null|101.000|101.000|101.000|101.000|STRING19|STRING1|2022-01-06|CITY107|885320|8123401117|STRING192|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING120|10240414|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING122|null|null|null|null|null|124814119|null|null|null|null|null|null|null|STRING120|COMP1COUNTRY6|null|null|null|null|null|No||2|195|null|null null|null|null|null|VALUE120|10102418|10102418|null|3,02E+25|3,02E+25|null|null| 
|null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|VALUE39|null|null|2022-01-05|COUNTRY6|null|STRING120|null|AA|null|null|STRING120|2022-01-10|2022-01-06|81351217|VALUE1|null|null|CITY107|888420220|null|2300139|404|null|RER|RCR|XCX|null|null|null|STRING187|null|101|null|null|null|1002|STRING6|null|null|null|null|STRING195|STRING192|STRING192|null|null|null|null|null|null|null|230024|null|null|101|STRING1|null|null|null|null|PCP|101|6500123.00|null|101.000|101.000|6500123.00|null|101.000|101.000|101.000|101.000|STRING19|STRING1|2022-01-06|CITY107|885320|8123401117|STRING192|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING120|10240414|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING122|null|null|null|null|null|124814119|null|null|null|null|null|null|null|STRING120|COMP1COUNTRY6|null|null|null|null|null|No||2|196|null|null null|null|null|null|VALUE120|10102419|10102419|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY6|null|STRING120|null|AA|null|null|STRING120|2022-01-10|2022-01-06|81351217|VALUE1|null|null|CITY107|888420220|null|2300111|404|null|RER|RCR|XCX|null|null|null|STRING188|null|101|null|null|null|1001|null|null|null|null|null|STRING196|STRING192|STRING192|null|null|15003.00|null|15003.00|null|null|230008|null|null|101|STRING1|null|null|null|null|PCP|101|6500131.00|null|101.000|101.000|6500131.00|85050.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY107|885320|8123401117|STRING192|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING120|10240414|null|null|null|null|null|null|COMPANY1|COUNTRYAB6|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING122|null|null|null|null|null|124814119|null|null|null|null|null|null|null|STRING120|COMP1COUNTRY6|null|null|null|null|null|No||2|197|null|null 
null|null|null|null|VALUE121|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AA|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AA|AA|AA|null|null|null|2022-01-05|COUNTRY5|null|STRING121|null|AA|null|null|STRING121|2022-01-10|2022-01-06|81351218|VALUE1|null|null|CITY108|888420221|null|2300123|404|null|RER|RCR|XCX|null|null|null|STRING189|null|101|null|null|null|1001|null|null|null|null|null|STRING197|STRING199|STRING199|null|null|15003.00|null|15003.00|null|null|230017|null|null|101|STRING2|null|null|null|null|PCP|101|6500153.00|null|101.000|101.000|6500153.00|85039.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-06|CITY108|885321|8123401118|STRING199|null|2022-01-06|STRING4|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING121|10240415|null|null|null|null|null|null|COMPANY1|COUNTRYAB5|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING123|null|null|null|null|null|124814120|null|null|null|null|null|null|null|STRING121|COMP1COUNTRY5|null|null|null|null|null|No||2|198|null|null null|null|null|null|VALUE1|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|VALUE1|null|null|2022-01-05|COUNTRY1|null|STRING1|null|AC|null|null|STRING1|2022-01-10|2022-01-06|81351101|VALUE1|null|null|CITY1|888420101|null|2300101|404|null|RER|RCR|XCX|null|null|null|STRING1|null|101|null|null|null|1002|STRING1|null|null|null|null|STRING1|STRING1|STRING1|null|null|null|null|null|null|null|230001|null|null|101|STRING1|null|null|null|null|PCP|101|6500123.00|null|101.000|101.000|6500123.00|null|101.000|101.000|101.000|101.000|STRING1|STRING1|2022-01-06|CITY1|885201|8123401001|STRING1|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING1|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING1|null|null|null|null|null|124814001|null|STRING1|2140001|2022-01-06|null|null|null|STRING1|COMP1COUNTRY1|null|null|null|null|null|No||2|0|null|null null|null|null|null|VALUE2|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|VALUE2|null|null|2022-01-04|COUNTRY2|null|STRING2|null|AC|null|null|STRING2|2022-01-11|2022-01-07|81351102|VALUE1|null|null|CITY2|888420102|null|2300103|404|null|RER|RCR|XCX|null|null|null|STRING3|null|101|null|null|null|1001|STRING3|null|null|null|null|STRING3|STRING3|STRING3|null|null|15001.00|null|15001.00|null|null|230002|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85002.00|101.000|101.000|101.000|101.000|STRING3|STRING1|2022-01-07|CITY2|885202|8123401002|STRING3|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING2|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB2|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING2|null|null|null|null|null|124814002|null|STRING1|2140002|2022-01-07|null|null|null|STRING2|COMP1COUNTRY2|null|null|null|null|null|No||2|2|null|null 
null|null|null|null|VALUE5|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING5|null|AC|null|null|STRING5|2022-01-11|2022-01-07|81351105|VALUE1|null|null|CITY5|888420105|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING6|null|101|null|null|null|1001|null|null|null|null|null|STRING6|STRING6|STRING6|null|null|15003.00|null|15003.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500128.00|null|101.000|101.000|6500128.00|85005.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-07|CITY5|885205|8123401005|STRING6|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING5|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING5|null|null|null|null|null|124814005|null|STRING1|2140005|2022-01-07|null|null|null|STRING5|COMP1COUNTRY1|null|null|null|null|null|No||2|5|null|null null|null|null|null|VALUE12|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-13|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|VALUE6|null|null|2022-01-04|COUNTRY1|null|STRING12|null|AC|null|null|STRING12|2022-01-13|2022-01-07|81351112|VALUE1|null|null|CITY12|888420112|null|2300117|404|null|RER|RCR|XCX|null|null|null|STRING17|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING17|STRING17|STRING17|null|null|15009.00|null|15009.00|null|null|230014|null|null|101|STRING1|null|null|null|null|PCP|101|6500124.00|null|101.000|101.000|6500124.00|85014.00|101.000|101.000|101.000|101.000|STRING4|STRING1|2022-01-07|CITY12|885212|8123401012|STRING17|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING12|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING12|null|null|null|null|null|124814012|null|STRING1|2140012|2022-01-07|null|null|null|STRING12|COMP1COUNTRY1|null|null|null|null|null|No||2|16|null|null null|null|null|null|VALUE37|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING37|null|AC|null|null|STRING37|2022-01-11|2022-01-07|81351137|VALUE1|null|null|CITY34|888420137|null|2300128|404|null|RER|RCR|XCX|null|null|null|STRING55|null|101|null|null|null|1002|null|null|null|null|null|STRING55|STRING56|STRING56|null|null|15003.00|null|15003.00|null|null|230018|null|null|101|STRING1|null|null|null|null|PCP|101|6500145.00|null|101.000|101.000|6500145.00|85029.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-07|CITY34|885237|8123401037|STRING56|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING37|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING37|null|null|null|null|null|124814037|null|STRING1|2140031|2022-01-07|null|null|null|STRING37|COMP1COUNTRY1|null|null|null|null|null|No||2|55|null|null 
null|null|null|null|VALUE48|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING48|null|AC|null|null|STRING48|2022-01-11|2022-01-07|81351148|VALUE1|null|null|CITY43|888420148|null|2300135|404|null|RER|CRC|XCX|null|null|null|STRING68|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING68|STRING69|STRING69|null|null|null|null|null|null|null|230022|null|null|101|STRING1|null|null|null|null|PCP|101|null|null|101.000|101.000|null|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-07|CITY43|885248|8123401048|STRING69|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING48|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING48|null|null|null|null|null|124814047|null|STRING1|2140038|2022-01-07|null|null|null|STRING48|COMP1COUNTRY1|null|null|null|null|null|No||2|68|null|null null|null|null|null|VALUE48|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING48|null|AC|null|null|STRING48|2022-01-11|2022-01-07|81351148|VALUE1|null|null|CITY43|888420148|null|2300135|404|null|RER|CRC|XCX|null|null|null|STRING69|null|101|null|null|null|1001|STRING2|null|null|null|null|STRING69|STRING69|STRING69|null|null|15018.00|null|15018.00|null|null|230022|null|null|101|STRING1|null|null|null|null|PCP|101|6500134.00|null|101.000|101.000|6500134.00|85033.00|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-07|CITY43|885248|8123401048|STRING69|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING48|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING48|null|null|null|null|null|124814047|null|STRING1|2140038|2022-01-07|null|null|null|STRING48|COMP1COUNTRY1|null|null|null|null|null|No||2|69|null|null null|null|null|null|VALUE62|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|VALUE22|null|null|2022-01-04|COUNTRY3|null|STRING62|null|AC|null|null|STRING62|2022-01-11|2022-01-07|81351161|VALUE1|null|null|CITY56|888420162|null|2300140|404|null|RER|RCR|XCX|null|null|null|STRING94|null|101|null|null|null|1002|STRING5|null|null|null|null|STRING94|STRING95|STRING95|null|null|15004.00|null|15004.00|null|null|230025|null|null|101|STRING1|null|null|null|null|PCP|101|6500125.00|null|101.000|101.000|6500125.00|85009.00|101.000|101.000|101.000|101.000|STRING6|STRING1|2022-01-07|CITY56|885262|8123401061|STRING95|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING62|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB3|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING63|null|null|null|null|null|124814061|null|STRING1|2140049|2022-01-07|null|null|null|STRING62|COMP1COUNTRY3|null|null|null|null|null|No||2|94|null|null 
null|null|null|null|VALUE96|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING96|null|AC|null|null|STRING96|2022-01-11|2022-01-07|81351194|VALUE1|null|null|CITY73|888420196|null|2300121|404|null|RER|RCR|XCX|null|null|null|STRING147|null|101|null|null|null|1002|null|null|null|null|null|STRING150|STRING151|STRING151|null|null|15003.00|null|15003.00|null|null|230016|null|null|101|STRING1|null|null|null|null|PCP|101|6500145.00|null|101.000|101.000|6500145.00|85029.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-07|CITY73|885296|8123401094|STRING151|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING96|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING98|null|null|null|null|null|124814095|null|STRING1|2140075|2022-01-07|null|null|null|STRING96|COMP1COUNTRY1|null|null|null|null|null|No||2|150|null|null null|null|null|null|VALUE96|10102413|10102413|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING96|null|AC|null|null|STRING96|2022-01-11|2022-01-07|81351194|VALUE1|null|null|CITY73|888420196|null|2300121|404|null|RER|RCR|XCX|null|null|null|STRING148|null|101|null|null|null|1002|null|null|null|null|null|STRING151|STRING151|STRING151|null|null|15003.00|null|15003.00|null|null|230016|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-07|CITY73|885296|8123401094|STRING151|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING96|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING98|null|null|null|null|null|124814095|null|STRING1|2140075|2022-01-07|null|null|null|STRING96|COMP1COUNTRY1|null|null|null|null|null|No||2|151|null|null null|null|null|null|VALUE96|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING96|null|AC|null|null|STRING96|2022-01-11|2022-01-07|81351194|VALUE1|null|null|CITY73|888420196|null|2300121|404|null|RER|RCR|XCX|null|null|null|STRING149|null|101|null|null|null|1002|null|null|null|null|null|STRING152|STRING151|STRING151|null|null|15003.00|null|15003.00|null|null|230016|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-07|CITY73|885296|8123401094|STRING151|null|2022-01-06|STRING1|10020.000|STRING2|STORE1|STRING1|TYPE1|STRING96|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING3|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING98|null|null|null|null|null|124814095|null|STRING1|2140075|2022-01-07|null|null|null|STRING96|COMP1COUNTRY1|null|null|null|null|null|No||2|152|null|null 
null|null|null|null|VALUE103|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY10|null|STRING103|null|AC|null|null|STRING103|2022-01-11|2022-01-07|81351201|VALUE1|null|null|CITY92|888420203|null|2300128|404|null|RER|CRC|XCX|null|null|null|STRING161|null|101|null|null|null|1002|STRING2|null|null|null|null|STRING164|STRING165|STRING165|null|null|null|null|null|null|null|230018|null|null|101|STRING1|null|null|null|null|PCP|101|6500135.00|null|101.000|101.000|6500135.00|null|101.000|101.000|101.000|101.000|STRING2|STRING1|2022-01-07|CITY92|885303|8123401101|STRING165|null|2022-01-06|STRING1|10020.000|STRING1|STORE1|STRING1|TYPE1|STRING103|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB11|STRING3|STRING2|1923001|1923001|null|null|null|null|null|STRING105|null|null|null|null|null|124814102|null|STRING1|2140081|2022-01-07|null|null|null|STRING103|COMP1COUNTRY9|null|null|null|null|null|No||2|164|null|null null|null|null|null|VALUE106|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING106|null|AC|null|null|STRING106|2022-01-11|2022-01-07|81351203|VALUE1|null|null|CITY20|888420206|null|2300180|404|null|RER|RCR|XCX|null|null|null|STRING64|null|101|null|null|null|1001|null|null|null|null|null|STRING172|STRING173|STRING173|null|null|15003.00|null|15003.00|null|null|230034|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|null|101.000|101.000|6500148.00|85036.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-07|CITY20|885306|8123401103|STRING173|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING106|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING107|null|null|null|null|null|124814105|null|STRING1|2140083|2022-01-07|null|null|null|STRING106|COMP1COUNTRY1|null|null|null|null|null|No||2|172|null|null null|null|null|null|VALUE106|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING106|null|AC|null|null|STRING106|2022-01-11|2022-01-07|81351203|VALUE1|null|null|CITY20|888420206|null|2300180|404|null|RER|RCR|XCX|null|null|null|STRING165|null|101|null|null|null|1001|null|null|null|null|null|STRING173|STRING173|STRING173|null|null|15003.00|null|15003.00|null|null|230034|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|null|101.000|101.000|6500148.00|85036.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-07|CITY20|885306|8123401103|STRING173|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING106|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING107|null|null|null|null|null|124814105|null|STRING1|2140083|2022-01-07|null|null|null|STRING106|COMP1COUNTRY1|null|null|null|null|null|No||2|173|null|null 
null|null|null|null|VALUE106|10102415|10102415|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-14|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|null|null|null|2022-01-04|COUNTRY1|null|STRING106|null|AC|null|null|STRING106|2022-01-14|2022-01-07|81351203|VALUE1|null|null|CITY20|888420206|null|2300233|404|null|RER|RCR|XCX|null|null|null|STRING166|null|101|null|null|null|1001|null|null|null|null|null|STRING174|STRING173|STRING173|null|null|15003.00|null|15003.00|null|null|230044|null|null|101|STRING1|null|null|null|null|PCP|101|6500142.00|null|101.000|101.000|6500142.00|85027.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-07|CITY20|885306|8123401103|STRING173|null|2022-01-06|STRING6|10020.000|STRING3|STORE1|STRING1|TYPE3|STRING106|10240402|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|null|1923002|1923001|null|null|null|null|10349200.00|STRING108|null|null|null|null|null|124814105|null|STRING1|2140083|2022-01-07|null|null|null|STRING106|COMP1COUNTRY1|null|null|null|null|null|No||2|174|null|null null|null|null|null|VALUE110|10102412|10102412|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-11|null|null|null|null|2022-01-04|null|STRING1|AC|AC|AC|VALUE35|null|null|2022-01-04|COUNTRY1|null|STRING110|null|AC|null|null|STRING110|2022-01-11|2022-01-07|81351207|VALUE1|null|null|CITY97|888420210|null|2300105|404|null|RER|RCR|XCX|null|null|null|STRING172|null|101|null|null|null|1001|STRING5|null|null|null|null|STRING180|STRING181|STRING181|null|null|15014.00|null|15014.00|null|null|230004|null|null|101|STRING1|null|null|null|null|PCP|101|6500148.00|null|101.000|101.000|6500148.00|85030.00|101.000|101.000|101.000|101.000|STRING17|STRING1|2022-01-07|CITY97|885310|8123401107|STRING181|null|2022-01-06|STRING1|10020.000|STRING3|STORE1|STRING1|TYPE1|STRING110|10240401|null|null|null|null|null|null|COMPANY1|COUNTRYAB1|STRING1|STRING2|1923001|1923001|null|null|null|null|10349200.00|STRING112|null|null|null|null|null|124814109|null|STRING1|2140085|2022-01-07|null|null|null|STRING110|COMP1COUNTRY1|null|null|null|null|null|No||2|180|null|null null|null|null|null|VALUE6|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-12|null|null|null|null|2022-01-03|null|STRING1|AC|AC|AC|null|null|null|2022-01-03|COUNTRY2|null|STRING6|null|AC|null|null|STRING6|2022-01-12|2022-01-08|81351106|VALUE1|null|null|CITY6|888420106|null|2300107|404|null|RER|RCR|XCX|null|null|null|STRING7|null|101|null|null|null|1001|null|null|null|null|null|STRING7|STRING7|STRING7|null|null|15003.00|null|15003.00|null|null|230005|null|null|101|STRING1|null|null|null|null|PCP|101|6500129.00|null|101.000|101.000|6500129.00|85006.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-08|CITY6|885206|8123401006|STRING7|null|2022-01-06|STRING2|10020.000|STRING1|STORE1|STRING2|TYPE1|STRING6|10240402|null|null|null|null|null|null|COMPANY2|COUNTRYAB2|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING6|null|null|null|null|null|124814006|null|STRING1|2140006|2022-01-08|null|null|null|STRING6|COMP2COUNTRY2|null|null|null|null|null|No||2|6|null|null 
null|null|null|null|VALUE6|10102413|10102413|null|3,02E+25|3,02E+25|null|null| |null|AB|2022-01-12|null|null|null|null|2022-01-03|null|STRING1|AB|AB|AB|null|null|null|2022-01-03|COUNTRY2|null|STRING6|null|AB|null|null|STRING6|2022-01-12|2022-01-08|81351106|VALUE1|null|null|CITY6|888420106|null|2300108|404|null|RER|RCR|XCX|null|null|null|STRING8|null|101|null|null|null|1003|null|null|null|null|null|STRING8|STRING7|STRING7|null|null|15003.00|15001.00|15003.00|null|null|230006|null|null|101|STRING1|null|null|null|null|PCP|101|6500130.00|5001.00|101.000|101.000|6500130.00|85007.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-08|CITY6|885206|8123401006|STRING7|null|2022-01-06|STRING3|10020.000|STRING1|STORE1|STRING2|TYPE1|STRING6|10240402|null|null|null|null|null|null|COMPANY2|COUNTRYAB2|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING6|null|null|null|null|null|124814006|null|STRING1|2140006|2022-01-08|null|null|null|STRING6|COMP2COUNTRY2|null|null|null|null|null|No||2|7|null|null null|null|null|null|VALUE6|10102415|10102415|null|3,02E+25|3,02E+25|null|null| 
|null|AC|2022-01-12|null|null|null|null|2022-01-03|null|STRING1|AC|AC|AC|null|null|null|2022-01-03|COUNTRY2|null|STRING6|null|AC|null|null|STRING6|2022-01-12|2022-01-08|81351106|VALUE1|null|null|CITY6|888420106|null|2300107|404|null|RER|RCR|XCX|null|null|null|STRING9|null|101|null|null|null|1001|null|null|null|null|null|STRING9|STRING7|STRING7|null|null|15003.00|15002.00|15003.00|null|null|230005|null|null|101|STRING1|null|null|null|null|PCP|101|6500126.00|5002.00|101.000|101.000|6500126.00|85008.00|101.000|101.000|101.000|101.000|null|STRING1|2022-01-08|CITY6|885206|8123401006|STRING7|null|2022-01-06|STRING2|10020.000|STRING1|STORE1|STRING2|TYPE1|STRING6|10240402|null|null|null|null|null|null|COMPANY2|COUNTRYAB2|STRING3|STRING3|1923002|1923001|null|null|null|null|10349200.00|STRING6|null|null|null|null|null|124814006|null|STRING1|2140006|2022-01-08|null|null|null|STRING6|COMP2COUNTRY2|null|null|null|null|null|No||2|8|null|null null|null|null|null|VALUE122|10102412|10102412|null|3,02E+25|3,02E+25|null|null| |null|AC|2022-01-10|null|null|null|null|2022-01-05|null|STRING1|AC|AC|AC|null|null|null|2022-01-05|COUNTRY1|null|STRING122|null|AC|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|101|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|PCP|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null|null||2|199|null|null ================================================ FILE: tests/resources/feature/gab/setup/schema/dummy_sales_kpi.json ================================================ { "type": "struct", "fields": [ { "name": "order_date", "type": 
"date", "nullable": true, "metadata": {} }, { "name": "article_id", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/gab/setup/schema/lkp_query_builder.json ================================================ { "type": "struct", "fields": [ { "name": "query_id", "type": "integer", "nullable": true, "metadata": {} }, { "name": "query_label", "type": "string", "nullable": true, "metadata": {} }, { "name": "query_type", "type": "string", "nullable": true, "metadata": {} }, { "name": "mappings", "type": "string", "nullable": true, "metadata": {} }, { "name": "intermediate_stages", "type": "string", "nullable": true, "metadata": {} }, { "name": "recon_window", "type": "string", "nullable": true, "metadata": {} }, { "name": "timezone_offset", "type": "integer", "nullable": true, "metadata": {} }, { "name": "start_of_the_week", "type": "string", "nullable": true, "metadata": {} }, { "name": "is_active", "type": "string", "nullable": true, "metadata": {} }, { "name": "queue", "type": "string", "nullable": true, "metadata": {} }, { "name": "lh_created_on", "type": "timestamp", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/gab/setup/schema/order_events.json ================================================ { "type": "struct", "fields": [ {"name": "request_timestamp", "type": "string","nullable": true,"metadata": {}}, {"name": "data_pack_id", "type": "string","nullable": true,"metadata": {}}, {"name": "record_number", "type": "integer","nullable": true,"metadata": {}}, {"name": "update_mode", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_order_header", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_order_schedule", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_order_item", "type": 
"string","nullable": true,"metadata": {}}, {"name": "orgsales_orgp", "type": "string","nullable": true,"metadata": {}}, {"name": "order_header_key", "type": "string","nullable": true,"metadata": {}}, {"name": "order_line_key", "type": "string","nullable": true,"metadata": {}}, {"name": "derived_order_header", "type": "string","nullable": true,"metadata": {}}, {"name": "derived_order_line_k", "type": "string","nullable": true,"metadata": {}}, {"name": "return_reason", "type": "string","nullable": true,"metadata": {}}, {"name": "reqmnt_category", "type": "string","nullable": true,"metadata": {}}, {"name": "delivery_status10", "type": "string","nullable": true,"metadata": {}}, {"name": "req_del_dt_item", "type": "date","nullable": true,"metadata": {}}, {"name": "reason_for_rejsize", "type": "string","nullable": true,"metadata": {}}, {"name": "invoice_item_price", "type": "string","nullable": true,"metadata": {}}, {"name": "id_of_the_customer", "type": "string","nullable": true,"metadata": {}}, {"name": "logistics_profit_ctr", "type": "string","nullable": true,"metadata": {}}, {"name": "material_availabilit", "type": "date","nullable": true,"metadata": {}}, {"name": "mso_store", "type": "string","nullable": true,"metadata": {}}, {"name": "name_of_orderer", "type": "string","nullable": true,"metadata": {}}, {"name": "overall_delivery_sta", "type": "string","nullable": true,"metadata": {}}, {"name": "overall_processing_s20", "type": "string","nullable": true,"metadata": {}}, {"name": "overall_processing_s21", "type": "string","nullable": true,"metadata": {}}, {"name": "coupon_code", "type": "string","nullable": true,"metadata": {}}, {"name": "org_grape_bapcx", "type": "string","nullable": true,"metadata": {}}, {"name": "cust_service_rep", "type": "string","nullable": true,"metadata": {}}, {"name": "customer_purchase_or25", "type": "date","nullable": true,"metadata": {}}, {"name": "delivery_country_cod", "type": "string","nullable": true,"metadata": {}}, {"name": 
"delivery_city_code", "type": "string","nullable": true,"metadata": {}}, {"name": "delivery_post_code", "type": "string","nullable": true,"metadata": {}}, {"name": "delivery_state_code", "type": "string","nullable": true,"metadata": {}}, {"name": "delivery_status30", "type": "string","nullable": true,"metadata": {}}, {"name": "ops_del_block_sohdr", "type": "string","nullable": true,"metadata": {}}, {"name": "ops_del_block_soscl", "type": "string","nullable": true,"metadata": {}}, {"name": "ecom_crm_id", "type": "string","nullable": true,"metadata": {}}, {"name": "conf_del_date_size", "type": "date","nullable": true,"metadata": {}}, {"name": "created_on", "type": "date","nullable": true,"metadata": {}}, {"name": "time", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_doc_item_cat", "type": "string","nullable": true,"metadata": {}}, {"name": "shipping_campaign_id", "type": "string","nullable": true,"metadata": {}}, {"name": "shipping_coupon_code", "type": "string","nullable": true,"metadata": {}}, {"name": "shipping_city", "type": "string","nullable": true,"metadata": {}}, {"name": "shipping_postal_code", "type": "string","nullable": true,"metadata": {}}, {"name": "shp_promotion_code", "type": "string","nullable": true,"metadata": {}}, {"name": "size_grid", "type": "string","nullable": true,"metadata": {}}, {"name": "main_chan_frm_src", "type": "string","nullable": true,"metadata": {}}, {"name": "prctr_billing", "type": "string","nullable": true,"metadata": {}}, {"name": "prere_indfrm_src", "type": "string","nullable": true,"metadata": {}}, {"name": "reg__clr_from_src", "type": "string","nullable": true,"metadata": {}}, {"name": "update_flag", "type": "string","nullable": true,"metadata": {}}, {"name": "usage", "type": "string","nullable": true,"metadata": {}}, {"name": "so_header_usgindp", "type": "string","nullable": true,"metadata": {}}, {"name": "vas_customer_defined", "type": "string","nullable": true,"metadata": {}}, {"name": 
"adidas_group_article", "type": "string","nullable": true,"metadata": {}}, {"name": "billto_cust", "type": "string","nullable": true,"metadata": {}}, {"name": "requirement_type", "type": "string","nullable": true,"metadata": {}}, {"name": "shipto_cust__r2", "type": "string","nullable": true,"metadata": {}}, {"name": "soldto_cust_r2", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_doc_category", "type": "string","nullable": true,"metadata": {}}, {"name": "product_division", "type": "string","nullable": true,"metadata": {}}, {"name": "promotion_code", "type": "string","nullable": true,"metadata": {}}, {"name": "sd_categ_precdoc", "type": "string","nullable": true,"metadata": {}}, {"name": "so_hdrpreceding_doc", "type": "string","nullable": true,"metadata": {}}, {"name": "so_itmpreceding_doc", "type": "string","nullable": true,"metadata": {}}, {"name": "so_scl_prec_doc", "type": "string","nullable": true,"metadata": {}}, {"name": "article__region__s", "type": "string","nullable": true,"metadata": {}}, {"name": "reference_1", "type": "string","nullable": true,"metadata": {}}, {"name": "mkt_place_order_num", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_representative", "type": "string","nullable": true,"metadata": {}}, {"name": "subtotal_1_source", "type": "decimal","nullable": true,"metadata": {}}, {"name": "subtotal_2_source", "type": "decimal","nullable": true,"metadata": {}}, {"name": "subtotal_3_source", "type": "decimal","nullable": true,"metadata": {}}, {"name": "subtotal_4_source", "type": "decimal","nullable": true,"metadata": {}}, {"name": "subtotal_5_source", "type": "decimal","nullable": true,"metadata": {}}, {"name": "subtotal_6_source", "type": "decimal","nullable": true,"metadata": {}}, {"name": "grid_value", "type": "string","nullable": true,"metadata": {}}, {"name": "orgcompcodep", "type": "string","nullable": true,"metadata": {}}, {"name": "created_by", "type": "string","nullable": true,"metadata": {}}, 
{"name": "miscdistchcopap", "type": "string","nullable": true,"metadata": {}}, {"name": "document_currency", "type": "string","nullable": true,"metadata": {}}, {"name": "reason_for_order", "type": "string","nullable": true,"metadata": {}}, {"name": "opsplantp", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_group", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_office", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_unit", "type": "string","nullable": true,"metadata": {}}, {"name": "storage_location", "type": "string","nullable": true,"metadata": {}}, {"name": "so_net_price_2", "type": "decimal","nullable": true,"metadata": {}}, {"name": "sales_order_net_valu", "type": "decimal","nullable": true,"metadata": {}}, {"name": "so_conf_qty", "type": "decimal","nullable": true,"metadata": {}}, {"name": "so_cum_order_qty", "type": "decimal","nullable": true,"metadata": {}}, {"name": "so_net_price", "type": "decimal","nullable": true,"metadata": {}}, {"name": "so_net_value", "type": "decimal","nullable": true,"metadata": {}}, {"name": "so_org_qty", "type": "decimal","nullable": true,"metadata": {}}, {"name": "so_conf_qty_actual", "type": "decimal","nullable": true,"metadata": {}}, {"name": "sales_order_qty", "type": "decimal","nullable": true,"metadata": {}}, {"name": "sales_odr_qty_actual", "type": "decimal","nullable": true,"metadata": {}}, {"name": "article_campaign_id", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_document_type", "type": "string","nullable": true,"metadata": {}}, {"name": "order_date_header", "type": "date","nullable": true,"metadata": {}}, {"name": "billing_city", "type": "string","nullable": true,"metadata": {}}, {"name": "billing_postal_code", "type": "string","nullable": true,"metadata": {}}, {"name": "customer_po_time", "type": "string","nullable": true,"metadata": {}}, {"name": "customer_purchase_or101", "type": "string","nullable": true,"metadata": {}}, {"name": 
"overall_rej_status", "type": "string","nullable": true,"metadata": {}}, {"name": "changed_on", "type": "date","nullable": true,"metadata": {}}, {"name": "epoch_status", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_order_canqty", "type": "decimal","nullable": true,"metadata": {}}, {"name": "epoch_entry_type", "type": "string","nullable": true,"metadata": {}}, {"name": "epoch_entry_by", "type": "string","nullable": true,"metadata": {}}, {"name": "epoch_order_type", "type": "string","nullable": true,"metadata": {}}, {"name": "epoch_line_type", "type": "string","nullable": true,"metadata": {}}, {"name": "omnihub_marketplace", "type": "string","nullable": true,"metadata": {}}, {"name": "confirmed_delivery_t", "type": "string","nullable": true,"metadata": {}}, {"name": "shipping_city_addres112", "type": "string","nullable": true,"metadata": {}}, {"name": "shipping_city_addres113", "type": "string","nullable": true,"metadata": {}}, {"name": "shipping_city_addres114", "type": "string","nullable": true,"metadata": {}}, {"name": "billing_city_address115", "type": "string","nullable": true,"metadata": {}}, {"name": "billing_city_address116", "type": "string","nullable": true,"metadata": {}}, {"name": "billing_city_address117", "type": "string","nullable": true,"metadata": {}}, {"name": "omnihub_seller_org", "type": "string","nullable": true,"metadata": {}}, {"name": "omnihub_locale_code", "type": "string","nullable": true,"metadata": {}}, {"name": "customer_po_type", "type": "string","nullable": true,"metadata": {}}, {"name": "omnihub_carrier_serv", "type": "string","nullable": true,"metadata": {}}, {"name": "qualifier", "type": "string","nullable": true,"metadata": {}}, {"name": "omnihub_document_typ", "type": "string","nullable": true,"metadata": {}}, {"name": "omnihub_return_code", "type": "string","nullable": true,"metadata": {}}, {"name": "refund_process_date", "type": "date","nullable": true,"metadata": {}}, {"name": "refund_process_time", "type": 
"string","nullable": true,"metadata": {}}, {"name": "omni_cancel_reason", "type": "string","nullable": true,"metadata": {}}, {"name": "sales_order_ecom_fre", "type": "decimal","nullable": true,"metadata": {}}, {"name": "omnihub_custom_order", "type": "string","nullable": true,"metadata": {}}, {"name": "vas_packing_type_so", "type": "string","nullable": true,"metadata": {}}, {"name": "vas_spl_ser_type_so", "type": "string","nullable": true,"metadata": {}}, {"name": "vas_tktlbl_type_so", "type": "string","nullable": true,"metadata": {}}, {"name": "exchange_flag", "type": "string","nullable": true,"metadata": {}}, {"name": "exchange_type", "type": "string","nullable": true,"metadata": {}}, {"name": "customer_po_timedw", "type": "string","nullable": true,"metadata": {}}, {"name": "cnc_store_id", "type": "string","nullable": true,"metadata": {}}, {"name": "last_hold__type", "type": "string","nullable": true,"metadata": {}}, {"name": "last_hold_released_t", "type": "string","nullable": true,"metadata": {}}, {"name": "last_hold_release_dt", "type": "date","nullable": true,"metadata": {}}, {"name": "dynamic_pricing_iden", "type": "string","nullable": true,"metadata": {}}, {"name": "dynamic_pricing_valu", "type": "string","nullable": true,"metadata": {}}, {"name": "dymamic_pricing_amnt", "type": "decimal","nullable": true,"metadata": {}}, {"name": "exchange_reason", "type": "string","nullable": true,"metadata": {}}, {"name": "omnihub_site_id", "type": "string","nullable": true,"metadata": {}}, {"name": "international_shipme", "type": "string","nullable": true,"metadata": {}}, {"name": "exchange_variant", "type": "string","nullable": true,"metadata": {}}, {"name": "secondary_article_ca", "type": "string","nullable": true,"metadata": {}}, {"name": "secondary_article_pr", "type": "string","nullable": true,"metadata": {}}, {"name": "secondary_coupon_cod", "type": "string","nullable": true,"metadata": {}}, {"name": "double_discount_flag", "type": "string","nullable": 
true,"metadata": {}}, {"name": "extraction_date", "type": "string","nullable": true,"metadata": {}}, {"name": "lhe_batch_id", "type": "integer","nullable": true,"metadata": {}}, {"name": "lhe_row_id", "type": "long","nullable": true,"metadata": {}}, {"name": "source_update_date", "type": "date","nullable": true,"metadata": {}}, {"name": "source_update_time", "type": "string","nullable": true,"metadata": {}} ] } ================================================ FILE: tests/resources/feature/gab/usecases/dummy_sales_kpi/1_article_category.sql ================================================ SELECT "category_a" AS category_name ,"article1" AS article_id UNION SELECT "category_a" AS category_name ,"article2" AS article_id UNION SELECT "category_a" AS category_name ,"article3" AS article_id UNION SELECT "category_a" AS category_name ,"article4" AS article_id UNION SELECT "category_b" AS category_name ,"article5" AS article_id UNION SELECT "category_b" AS category_name ,"article6" AS article_id UNION SELECT "category_b" AS category_name ,"article7" AS article_id ================================================ FILE: tests/resources/feature/gab/usecases/dummy_sales_kpi/2_dummy_sales_kpi.sql ================================================ SELECT {% if replace_offset_value == 0 %} {{ project_date_column }} {% else %} ({{ project_date_column }} + interval '{{offset_value}}' hour) {% endif %} AS order_date, {{ to_date }} AS to_date, b.category_name, COUNT(a.article_id) qty_articles, SUM(amount) total_amount FROM `{{ database }}`.`dummy_sales_kpi` a {{ joins }} LEFT JOIN article_categories b ON a.article_id = b.article_id WHERE TO_DATE({{ filter_date_column }}, 'yyyyMMdd') >= ( '{{start_date}}' + INTERVAL '{{offset_value}}' HOUR ) AND TO_DATE({{ filter_date_column }}, 'yyyyMMdd') < ( '{{ end_date}}' + INTERVAL '{{offset_value}}' HOUR ) GROUP BY 1,2,3 ================================================ FILE: 
tests/resources/feature/gab/usecases/dummy_sales_kpi/scenario/dummy_sales_kpi.json ================================================ { "query_label_filter": ["dummy_sales_kpi"], "queue_filter": ["Low"], "cadence_filter": ["DAY","WEEK","MONTH","QUARTER","YEAR"], "target_database": "test_db", "start_date": "2016-01-01", "end_date": "2018-12-31", "rerun_flag": "N", "target_table": "gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder", "calendar_table": "dim_calendar" } ================================================ FILE: tests/resources/feature/gab/usecases/order_events/1_order_events.sql ================================================ SELECT {{ to_date }} AS to_date, {% if replace_offset_value == 0 %} {{ project_date_column }} {% else %} ({{ project_date_column }} + INTERVAL '{{offset_value}}' HOUR) {% endif %} AS order_date, sales_order_schedule, delivery_country_cod, COUNT(*) orders, SUM(sales_order_qty) total_sales FROM {{ database }}.order_events {{ joins }} WHERE {{ filter_date_column }} >= ( '{{start_date}}' + INTERVAL '{{offset_value}}' HOUR ) AND {{ filter_date_column }} < ( '{{ end_date}}' + INTERVAL '{{offset_value}}' HOUR ) AND order_date_header IS NOT NULL GROUP BY ALL ================================================ FILE: tests/resources/feature/gab/usecases/order_events/scenario/order_events.json ================================================ { "query_label_filter": ["order_events"], "queue_filter": ["Medium"], "cadence_filter": ["All"], "target_database": "test_db", "start_date": "2022-01-01", "end_date": "2022-12-31", "rerun_flag": "N", "target_table": "gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder" } ================================================ FILE: 
tests/resources/feature/gab/usecases/order_events/scenario/order_events_nam.json ================================================ { "query_label_filter": ["order_events_nam"], "queue_filter": ["Medium"], "cadence_filter": ["MONTH","QUARTER"], "target_database": "test_db", "start_date": "2022-01-01", "end_date": "2022-12-31", "rerun_flag": "N", "target_table": "gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder" } ================================================ FILE: tests/resources/feature/gab/usecases/order_events/scenario/order_events_negative_timezone_offset.json ================================================ { "query_label_filter": ["order_events_negative_timezone_offset"], "queue_filter": ["Medium"], "cadence_filter": ["WEEK"], "target_database": "test_db", "start_date": "2022-01-01", "end_date": "2022-12-31", "rerun_flag": "Y", "target_table": "gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder" } ================================================ FILE: tests/resources/feature/gab/usecases/order_events/scenario/order_events_snapshot.json ================================================ { "query_label_filter": ["order_events_snapshot"], "queue_filter": ["Medium"], "cadence_filter": ["DAY","WEEK"], "target_database": "test_db", "start_date": "2022-01-01", "end_date": "2022-12-31", "rerun_flag": "N", "target_table": "gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder" } ================================================ FILE: tests/resources/feature/gab/usecases/order_events/scenario/skip_use_case_by_empty_reconciliation.json ================================================ { "query_label_filter": ["order_events_empty_reconciliation_window"], 
"queue_filter": ["Medium"], "cadence_filter": ["WEEK"], "target_database": "test_db", "start_date": "2022-01-01", "end_date": "2022-12-31", "rerun_flag": "Y", "target_table": "gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder" } ================================================ FILE: tests/resources/feature/gab/usecases/order_events/scenario/skip_use_case_by_empty_requested_cadence.json ================================================ { "query_label_filter": ["order_events_negative_timezone_offset"], "queue_filter": ["Medium"], "cadence_filter": [""], "target_database": "test_db", "start_date": "2022-01-01", "end_date": "2022-12-31", "rerun_flag": "Y", "target_table": "gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder" } ================================================ FILE: tests/resources/feature/gab/usecases/order_events/scenario/skip_use_case_by_not_configured_cadence.json ================================================ { "query_label_filter": ["order_events_negative_timezone_offset"], "queue_filter": ["Medium"], "cadence_filter": ["YEAR"], "target_database": "test_db", "start_date": "2022-01-01", "end_date": "2022-12-31", "rerun_flag": "Y", "target_table": "gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder" } ================================================ FILE: tests/resources/feature/gab/usecases/order_events/scenario/skip_use_case_by_unexisting_cadence.json ================================================ { "query_label_filter": ["order_events_unexisting_cadence"], "queue_filter": ["Medium"], "cadence_filter": ["WEEK"], "target_database": "test_db", "start_date": "2022-01-01", "end_date": "2022-12-31", "rerun_flag": "Y", "target_table": 
"gab_use_case_results", "source_database": "test_db", "gab_base_path": "/app/tests/lakehouse/in/feature/gab/usecases_sql/", "lookup_table": "lkp_query_builder" } ================================================ FILE: tests/resources/feature/heartbeat/control/default/data/ctr_heart_tbl_heartb_feed.csv ================================================ sensor_source|sensor_id|sensor_read_type|asset_description|upstream_key|preprocess_query|latest_event_fetched_timestamp|trigger_job_id|trigger_job_name|status|status_change_timestamp|job_start_timestamp|job_end_timestamp|job_state|dependency_flag delta_table|dummy_delta_table|streaming|dummy_heartbeat_asset||||1927384615203749|data-product_job_name_orders|||||UNPAUSED|TRUE sap_bw|dummy_sap_asset|batch|dummy_heartbeat_sap_bw|LOAD_DATE|||2604918372561094|data-product_job_name_sales|||||UNPAUSED|TRUE kafka|sales: domain.workspace.load.dummy_topic|streaming|dummy_heartbeat_kafka||||2604918372561094|data-product_job_name_sales|||||UNPAUSED|TRUE ================================================ FILE: tests/resources/feature/heartbeat/control/default/data/ctrl_heart_tbl_exec_sensor.csv ================================================ sensor_source|sensor_id|sensor_read_type|asset_description|upstream_key|preprocess_query|latest_event_fetched_timestamp|trigger_job_id|trigger_job_name|status|status_change_timestamp|job_start_timestamp|job_end_timestamp|job_state|dependency_flag delta_table|dummy_delta_table|streaming|dummy_heartbeat_asset|||2025-08-14 23:00:00|1927384615203749|data-product_job_name_orders|NEW_EVENT_AVAILABLE|2025-08-14 23:00:00|||UNPAUSED|TRUE sap_bw|dummy_sap_asset|batch|dummy_heartbeat_sap_bw|LOAD_DATE|||2604918372561094|data-product_job_name_sales|||||UNPAUSED|TRUE kafka|sales: domain.workspace.load.dummy_topic|streaming|dummy_heartbeat_kafka||||2604918372561094|data-product_job_name_sales|||||UNPAUSED|TRUE ================================================ FILE: 
tests/resources/feature/heartbeat/control/default/data/ctrl_heart_tbl_trigger_job.csv ================================================ sensor_source|sensor_id|sensor_read_type|asset_description|upstream_key|preprocess_query|latest_event_fetched_timestamp|trigger_job_id|trigger_job_name|status|status_change_timestamp|job_start_timestamp|job_end_timestamp|job_state|dependency_flag delta_table|dummy_delta_table|streaming|dummy_heartbeat_asset|||2025-08-14 23:00:00|1927384615203749|data-product_job_name_orders|COMPLETED|2025-08-14 23:00:00||2025-08-14 23:00:00|UNPAUSED|TRUE delta_table|dummy_order|batch|dummy_heartbeat_asset||||1015557820139870|data-product_job_name_orders|IN_PROGRESS|2025-08-14 23:00:00|2025-08-14 23:00:00||UNPAUSED|true kafka|sales: domain.workspace.load.dummy_topic|streaming|dummy_heartbeat_kafka||||2604918372561094|data-product_job_name_sales|||||UNPAUSED|TRUE sap_bw|dummy_sap_asset|batch|dummy_heartbeat_sap_bw|LOAD_DATE|||2604918372561094|data-product_job_name_sales|||||UNPAUSED|TRUE ================================================ FILE: tests/resources/feature/heartbeat/control/default/data/ctrl_heart_tbl_updated.csv ================================================ sensor_source|sensor_id|sensor_read_type|asset_description|upstream_key|preprocess_query|latest_event_fetched_timestamp|trigger_job_id|trigger_job_name|status|status_change_timestamp|job_start_timestamp|job_end_timestamp|job_state|dependency_flag delta_table|dummy_delta_table|streaming|dummy_heartbeat_asset|||2025-08-14 23:00:00|1927384615203749|data-product_job_name_orders|COMPLETED|2025-08-14 23:00:00||2025-08-14 23:00:00|UNPAUSED|TRUE sap_bw|dummy_sap_asset|batch|dummy_heartbeat_sap_bw|LOAD_DATE|||2604918372561094|data-product_job_name_sales|||||UNPAUSED|TRUE kafka|sales: domain.workspace.load.dummy_topic|streaming|dummy_heartbeat_kafka||||2604918372561094|data-product_job_name_sales|||||UNPAUSED|TRUE ================================================ FILE: 
tests/resources/feature/heartbeat/control/default/data/ctrl_sensor_tbl_upd_status.json ================================================ {"sensor_id": "multiple_sensors_delta_table_hello_world_sensor","assets": ["multiple_sensors_delta_table_hello_world"],"status": "ACQUIRED_NEW_DATA","status_change_timestamp": "2024-10-29 14:30:38.268544","checkpoint_location": "s3://lh-sadp-template-eu-west-1-as12/checkpoints/lakehouse_engine/sensors/multiple_sensors_delta_table_hello_world_sensor","upstream_key": null,"upstream_value": null} {"sensor_id": "multiple_sensors_sap_bw_hello_world_sensor","assets": ["multiple_sensors_sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:48:18.406151","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "once_with_retry_sap_bw_hello_world_sensor","assets": ["once_with_retry_sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:29:37.167015","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "lmu_table_batch_sensor","assets": ["lmu_article_description"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2025-02-13 14:24:10.528557","checkpoint_location": null,"upstream_key": "date","upstream_value": "20200201010101"} {"sensor_id": "sap_bw_hello_world_sensor","assets": ["sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:28:18.24358","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "dummy_delta_table_1927384615203749","assets": ["dummy_heartbeat_asset"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2025-08-14 23:00:00.00000","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "38172649503821"} ================================================ FILE: 
tests/resources/feature/heartbeat/control/default/schema/ctrl_heart_tbl_schema.json ================================================ { "type": "struct", "fields": [ { "name":"sensor_source", "type":"string", "nullable": true, "metadata": {} }, { "name":"sensor_id", "type":"string", "nullable": true, "metadata": {} }, { "name":"sensor_read_type", "type":"string", "nullable": true, "metadata": {} }, { "name":"asset_description", "type":"string", "nullable": true, "metadata": {} }, { "name":"upstream_key", "type":"string", "nullable": true, "metadata": {} }, { "name":"preprocess_query", "type":"string", "nullable": true, "metadata": {} }, { "name":"latest_event_fetched_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"trigger_job_id", "type":"string", "nullable": true, "metadata": {} }, { "name":"trigger_job_name", "type":"string", "nullable": true, "metadata": {} }, { "name":"status", "type":"string", "nullable": true, "metadata": {} }, { "name":"status_change_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"job_start_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"job_end_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"job_state", "type":"string", "nullable": true, "metadata": {} }, { "name":"dependency_flag", "type":"string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/heartbeat/control/default/schema/ctrl_heart_tbl_trig_schema.json ================================================ { "type": "struct", "fields": [ { "name":"sensor_source", "type":"string", "nullable": true, "metadata": {} }, { "name":"sensor_id", "type":"string", "nullable": true, "metadata": {} }, { "name":"sensor_read_type", "type":"string", "nullable": true, "metadata": {} }, { "name":"asset_description", "type":"string", "nullable": true, "metadata": {} }, { "name":"upstream_key", "type":"string", "nullable": 
true, "metadata": {} }, { "name":"preprocess_query", "type":"string", "nullable": true, "metadata": {} }, { "name":"trigger_job_id", "type":"string", "nullable": true, "metadata": {} }, { "name":"trigger_job_name", "type":"string", "nullable": true, "metadata": {} }, { "name":"status", "type":"string", "nullable": true, "metadata": {} }, { "name":"job_state", "type":"string", "nullable": true, "metadata": {} }, { "name":"dependency_flag", "type":"string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/heartbeat/control/heartbeat_paused_sensor_new_record/data/ctr_heart_tbl_heartb_feed.csv ================================================ sensor_source|sensor_id|sensor_read_type|asset_description|upstream_key|preprocess_query|latest_event_fetched_timestamp|trigger_job_id|trigger_job_name|status|status_change_timestamp|job_start_timestamp|job_end_timestamp|job_state|dependency_flag delta_table|dummy_delta_table|streaming|dummy_heartbeat_asset||||1927384615203749|data-product_job_name_orders|||||PAUSED|TRUE sap_bw|dummy_sap_asset|batch|dummy_heartbeat_sap_bw|LOAD_DATE|||2604918372561094|data-product_job_name_sales||||||TRUE kafka|sales: domain.workspace.load.dummy_topic|streaming|dummy_heartbeat_kafka||||2604918372561094|data-product_job_name_sales|||||COMPLETE|TRUE ================================================ FILE: tests/resources/feature/heartbeat/control/heartbeat_paused_sensor_new_record/data/ctrl_heart_tbl_exec_sensor.csv ================================================ sensor_source|sensor_id|sensor_read_type|asset_description|upstream_key|preprocess_query|latest_event_fetched_timestamp|trigger_job_id|trigger_job_name|status|status_change_timestamp|job_start_timestamp|job_end_timestamp|job_state|dependency_flag delta_table|dummy_delta_table|streaming|dummy_heartbeat_asset||||1927384615203749|data-product_job_name_orders|||||PAUSED|TRUE 
sap_bw|dummy_sap_asset|batch|dummy_heartbeat_sap_bw|LOAD_DATE|||2604918372561094|data-product_job_name_sales||||||TRUE kafka|sales: domain.workspace.load.dummy_topic|streaming|dummy_heartbeat_kafka||||2604918372561094|data-product_job_name_sales|||||COMPLETE|TRUE ================================================ FILE: tests/resources/feature/heartbeat/control/heartbeat_paused_sensor_new_record/data/ctrl_heart_tbl_trigger_job.csv ================================================ sensor_source|sensor_id|sensor_read_type|asset_description|upstream_key|preprocess_query|latest_event_fetched_timestamp|trigger_job_id|trigger_job_name|status|status_change_timestamp|job_start_timestamp|job_end_timestamp|job_state|dependency_flag delta_table|dummy_delta_table|streaming|dummy_heartbeat_asset||||1927384615203749|data-product_job_name_orders|||||PAUSED|TRUE delta_table|dummy_order|batch|dummy_heartbeat_asset||||1015557820139870|data-product_job_name_orders|IN PROGRESS||||UNPAUSED|true kafka|sales: domain.workspace.load.dummy_topic|streaming|dummy_heartbeat_kafka||||2604918372561094|data-product_job_name_sales|COMPLETED|2025-08-14 23:00:00||2025-08-14 23:00:00|COMPLETE|TRUE sap_bw|dummy_sap_asset|batch|dummy_heartbeat_sap_bw|LOAD_DATE|||2604918372561094|data-product_job_name_sales|COMPLETED|2025-08-14 23:00:00||2025-08-14 23:00:00||TRUE ================================================ FILE: tests/resources/feature/heartbeat/control/heartbeat_paused_sensor_new_record/data/ctrl_heart_tbl_updated.csv ================================================ sensor_source|sensor_id|sensor_read_type|asset_description|upstream_key|preprocess_query|latest_event_fetched_timestamp|trigger_job_id|trigger_job_name|status|status_change_timestamp|job_start_timestamp|job_end_timestamp|job_state|dependency_flag delta_table|dummy_delta_table|streaming|dummy_heartbeat_asset||||1927384615203749|data-product_job_name_orders|||||PAUSED|TRUE 
sap_bw|dummy_sap_asset|batch|dummy_heartbeat_sap_bw|LOAD_DATE|||2604918372561094|data-product_job_name_sales|COMPLETED|2025-08-14 23:00:00||2025-08-14 23:00:00||TRUE kafka|sales: domain.workspace.load.dummy_topic|streaming|dummy_heartbeat_kafka||||2604918372561094|data-product_job_name_sales|COMPLETED|2025-08-14 23:00:00 ||2025-08-14 23:00:00|COMPLETE|TRUE ================================================ FILE: tests/resources/feature/heartbeat/control/heartbeat_paused_sensor_new_record/data/ctrl_sensor_tbl_upd_status.json ================================================ {"sensor_id": "multiple_sensors_delta_table_hello_world_sensor","assets": ["multiple_sensors_delta_table_hello_world"],"status": "ACQUIRED_NEW_DATA","status_change_timestamp": "2024-10-29 14:30:38.268544","checkpoint_location": "s3://lh-sadp-template-eu-west-1-as12/checkpoints/lakehouse_engine/sensors/multiple_sensors_delta_table_hello_world_sensor","upstream_key": null,"upstream_value": null} {"sensor_id": "multiple_sensors_sap_bw_hello_world_sensor","assets": ["multiple_sensors_sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:48:18.406151","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "once_with_retry_sap_bw_hello_world_sensor","assets": ["once_with_retry_sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:29:37.167015","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "lmu_table_batch_sensor","assets": ["lmu_article_description"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2025-02-13 14:24:10.528557","checkpoint_location": null,"upstream_key": "date","upstream_value": "20200201010101"} {"sensor_id": "sap_bw_hello_world_sensor","assets": ["sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:28:18.24358","checkpoint_location": 
null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "dummy_sap_asset_2604918372561094","assets": ["dummy_heartbeat_sap_bw"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2025-08-14 23:00:00","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "38172649503821"} {"sensor_id": "sales__domain_workspace_load_dummy_topic_2604918372561094","assets": null,"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2025-08-14 23:00:00","checkpoint_location": null,"upstream_key": "None","upstream_value": "None"} ================================================ FILE: tests/resources/feature/heartbeat/control/heartbeat_paused_sensor_new_record/schema/ctrl_heart_tbl_schema.json ================================================ { "type": "struct", "fields": [ { "name":"sensor_source", "type":"string", "nullable": true, "metadata": {} }, { "name":"sensor_id", "type":"string", "nullable": true, "metadata": {} }, { "name":"sensor_read_type", "type":"string", "nullable": true, "metadata": {} }, { "name":"asset_description", "type":"string", "nullable": true, "metadata": {} }, { "name":"upstream_key", "type":"string", "nullable": true, "metadata": {} }, { "name":"preprocess_query", "type":"string", "nullable": true, "metadata": {} }, { "name":"latest_event_fetched_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"trigger_job_id", "type":"string", "nullable": true, "metadata": {} }, { "name":"trigger_job_name", "type":"string", "nullable": true, "metadata": {} }, { "name":"status", "type":"string", "nullable": true, "metadata": {} }, { "name":"status_change_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"job_start_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"job_end_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"job_state", "type":"string", "nullable": true, "metadata": {} }, { 
"name":"dependency_flag", "type":"string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/heartbeat/setup/default/column_list/heartbeat_sensor_control_table.json ================================================ { "sensor_source": "string", "sensor_id": "string", "sensor_read_type": "string", "asset_description": "string", "upstream_key": "string", "preprocess_query": "string", "latest_event_fetched_timestamp": "timestamp", "trigger_job_id": "string", "trigger_job_name": "string", "status": "string", "status_change_timestamp": "timestamp", "job_start_timestamp": "timestamp", "job_end_timestamp": "timestamp", "job_state": "string", "dependency_flag": "string" } ================================================ FILE: tests/resources/feature/heartbeat/setup/default/column_list/sensor_table.json ================================================ { "sensor_id": "string", "assets": "array", "status": "string", "status_change_timestamp": "timestamp", "checkpoint_location": "string", "upstream_key": "string", "upstream_value": "string" } ================================================ FILE: tests/resources/feature/heartbeat/setup/default/data/setup_heartbeat_data.csv ================================================ sensor_source,sensor_id,sensor_read_type,asset_description,upstream_key,preprocess_query,trigger_job_id,trigger_job_name,job_state,dependency_flag delta_table,dummy_delta_table,streaming,dummy_heartbeat_asset,,,1927384615203749,data-product_job_name_orders,UNPAUSED,TRUE sap_bw,dummy_sap_asset,batch,dummy_heartbeat_sap_bw,LOAD_DATE,,2604918372561094,data-product_job_name_sales,UNPAUSED,TRUE kafka,sales: domain.workspace.load.dummy_topic,streaming,dummy_heartbeat_kafka,,,2604918372561094,data-product_job_name_sales,UNPAUSED,TRUE ================================================ FILE: tests/resources/feature/heartbeat/setup/default/data/setup_sensor_data.json 
================================================ {"sensor_id": "multiple_sensors_delta_table_hello_world_sensor","assets": ["multiple_sensors_delta_table_hello_world"],"status": "ACQUIRED_NEW_DATA","status_change_timestamp": "2024-10-29 14:30:38.268544","checkpoint_location": "s3://lh-sadp-template-eu-west-1-as12/checkpoints/lakehouse_engine/sensors/multiple_sensors_delta_table_hello_world_sensor","upstream_key": null,"upstream_value": null} {"sensor_id": "multiple_sensors_sap_bw_hello_world_sensor","assets": ["multiple_sensors_sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:48:18.406151","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "once_with_retry_sap_bw_hello_world_sensor","assets": ["once_with_retry_sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:29:37.167015","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "lmu_table_batch_sensor","assets": ["lmu_article_description"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2025-02-13 14:24:10.528557","checkpoint_location": null,"upstream_key": "date","upstream_value": "20200201010101"} {"sensor_id": "sap_bw_hello_world_sensor","assets": ["sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:28:18.24358","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "dummy_delta_table_1927384615203749","assets": ["dummy_heartbeat_asset"],"status": "ACQUIRED_NEW_DATA","status_change_timestamp": "2023-08-14 08:28:18.24358","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "38172649503821"} ================================================ FILE: tests/resources/feature/heartbeat/setup/default/schema/schema_sensor_df.json ================================================ { "type": "struct", "fields": 
[ { "name":"sensor_id", "type":"string", "nullable": true, "metadata": {} }, { "name": "assets", "type": { "containsNull": true, "elementType": "string", "type": "array" }, "nullable": true, "metadata": {} }, { "name":"status", "type":"string", "nullable": true, "metadata": {} }, { "name":"status_change_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"checkpoint_location", "type":"string", "nullable": true, "metadata": {} }, { "name":"upstream_key", "type":"string", "nullable": true, "metadata": {} }, { "name":"upstream_value", "type":"string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/heartbeat/setup/heartbeat_paused_sensor_new_record/column_list/heartbeat_sensor_control_table.json ================================================ { "sensor_source": "string", "sensor_id": "string", "sensor_read_type": "string", "asset_description": "string", "upstream_key": "string", "preprocess_query": "string", "latest_event_fetched_timestamp": "timestamp", "trigger_job_id": "string", "trigger_job_name": "string", "status": "string", "status_change_timestamp": "timestamp", "job_start_timestamp": "timestamp", "job_end_timestamp": "timestamp", "job_state": "string", "dependency_flag": "string" } ================================================ FILE: tests/resources/feature/heartbeat/setup/heartbeat_paused_sensor_new_record/column_list/sensor_table.json ================================================ { "sensor_id": "string", "assets": "array", "status": "string", "status_change_timestamp": "timestamp", "checkpoint_location": "string", "upstream_key": "string", "upstream_value": "string" } ================================================ FILE: tests/resources/feature/heartbeat/setup/heartbeat_paused_sensor_new_record/data/setup_heartbeat_data.csv ================================================ 
sensor_source,sensor_id,sensor_read_type,asset_description,upstream_key,preprocess_query,trigger_job_id,trigger_job_name,job_state,dependency_flag delta_table,dummy_delta_table,streaming,dummy_heartbeat_asset,,,1927384615203749,data-product_job_name_orders,PAUSED,TRUE sap_bw,dummy_sap_asset,batch,dummy_heartbeat_sap_bw,LOAD_DATE,,2604918372561094,data-product_job_name_sales,,TRUE kafka,sales: domain.workspace.load.dummy_topic,streaming,dummy_heartbeat_kafka,,,2604918372561094,data-product_job_name_sales,COMPLETE,TRUE ================================================ FILE: tests/resources/feature/heartbeat/setup/heartbeat_paused_sensor_new_record/data/setup_sensor_data.json ================================================ {"sensor_id": "multiple_sensors_delta_table_hello_world_sensor","assets": ["multiple_sensors_delta_table_hello_world"],"status": "ACQUIRED_NEW_DATA","status_change_timestamp": "2024-10-29 14:30:38.268544","checkpoint_location": "s3://lh-sadp-template-eu-west-1-as12/checkpoints/lakehouse_engine/sensors/multiple_sensors_delta_table_hello_world_sensor","upstream_key": null,"upstream_value": null} {"sensor_id": "multiple_sensors_sap_bw_hello_world_sensor","assets": ["multiple_sensors_sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:48:18.406151","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "once_with_retry_sap_bw_hello_world_sensor","assets": ["once_with_retry_sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:29:37.167015","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "lmu_table_batch_sensor","assets": ["lmu_article_description"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2025-02-13 14:24:10.528557","checkpoint_location": null,"upstream_key": "date","upstream_value": "20200201010101"} {"sensor_id": 
"sap_bw_hello_world_sensor","assets": ["sap_bw_hello_world"],"status": "PROCESSED_NEW_DATA","status_change_timestamp": "2023-08-14 08:28:18.24358","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "20220903195523"} {"sensor_id": "dummy_sap_asset_2604918372561094","assets": ["dummy_heartbeat_sap_bw"],"status": "ACQUIRED_NEW_DATA","status_change_timestamp": "2023-08-14 08:28:18.24358","checkpoint_location": null,"upstream_key": "LOAD_DATE","upstream_value": "38172649503821"} ================================================ FILE: tests/resources/feature/heartbeat/setup/heartbeat_paused_sensor_new_record/schema/schema_sensor_df.json ================================================ { "type": "struct", "fields": [ { "name":"sensor_id", "type":"string", "nullable": true, "metadata": {} }, { "name": "assets", "type": { "containsNull": true, "elementType": "string", "type": "array" }, "nullable": true, "metadata": {} }, { "name":"status", "type":"string", "nullable": true, "metadata": {} }, { "name":"status_change_timestamp", "type":"timestamp", "nullable": true, "metadata": {} }, { "name":"checkpoint_location", "type":"string", "nullable": true, "metadata": {} }, { "name":"upstream_key", "type":"string", "nullable": true, "metadata": {} }, { "name":"upstream_value", "type":"string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_format/correct_arguments/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "jdbc", "options": { "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/correct_arguments/tests.db", "dbtable": "jdbc_format", "driver": "org.sqlite.JDBC", "numPartitions": 1 } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "db_table": "test_db.jdbc_format_table", "data_format": 
"delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_format/correct_arguments/data" } ] } ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_format/correct_arguments/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_format/correct_arguments/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_format/predicates/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "jdbc", "options": { "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/predicates/tests.db", "dbtable": "options", "driver": "org.sqlite.JDBC", "predicates": "[customer=customer1]" } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "db_table": "test_db.options_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_format/predicates/data" } ] } ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_format/wrong_arguments/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "jdbc", "options": { "url": 
"jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/wrong_arguments/tests.db", "table": "error_because_should_be_dbtable", "driver": "org.sqlite.JDBC", "numPartitions": 1 } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "db_table": "test_db.jdbc_format_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_format/wrong_arguments/data" } ] } ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_function/correct_arguments/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_function/correct_arguments/tests.db", "table": "jdbc_function", "properties": { "driver": "org.sqlite.JDBC" } }, "options": { "numPartitions": 1 } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "db_table": "test_db.jdbc_function_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_function/correct_arguments/data" } ] } ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_function/correct_arguments/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_function/correct_arguments/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 
1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/jdbc_reader/jdbc_function/wrong_arguments/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "jdbc", "jdbc_args": { "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_function/wrong_arguments/tests.db", "dbtable": "error_because_should_be_table_or_query", "properties": { "driver": "org.sqlite.JDBC" } }, "options": { "numPartitions": 1 } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "db_table": "test_db.jdbc_function_table", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/jdbc_reader/jdbc_function/wrong_arguments/data" } ] } ================================================ FILE: tests/resources/feature/materialize_cdf/acon_create_table.json ================================================ { "function": "create_table", "path": "file:///app/tests/resources/feature/materialize_cdf/data/table/streaming_with_cdf.sql" } ================================================ FILE: tests/resources/feature/materialize_cdf/control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "_change_type", "type": "string", "nullable": true, "metadata": {} }, { "name": "_commit_version", "type": "long", "nullable": true, "metadata": {} }, { "name": 
"_commit_timestamp", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/materialize_cdf/data/control/part-01_cdf.csv ================================================ salesorder|item|date|customer|article|amount|_change_type|_commit_version 1|1|20160601|customer1|article1|1000|insert|1 1|2|20160601|customer1|article2|2000|insert|1 1|3|20160601|customer1|article3|500|insert|1 2|1|20170215|customer2|article4|1000|insert|1 2|2|20170215|customer2|article6|5000|insert|1 2|3|20170215|customer2|article1|3000|insert|1 3|1|20170215|customer1|article5|20000|insert|1 3|2|20170215|customer1|article2|12000|insert|1 3|3|20170215|customer1|article4|9000|insert|1 4|1|20170430|customer3|article3|8000|insert|1 4|2|20170430|customer3|article7|7000|insert|1 4|3|20170430|customer3|article1|3000|insert|1 4|4|20170430|customer3|article2|5000|insert|1 ================================================ FILE: tests/resources/feature/materialize_cdf/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 ================================================ FILE: tests/resources/feature/materialize_cdf/data/source/part-02.csv ================================================ salesorder|item|date|customer|article|amount 5|1|20180601|customer1|article1|1000 5|2|20180601|customer1|article2|2000 5|3|20180601|customer1|article3|500 6|1|20190215|customer2|article4|1000 
6|2|20190215|customer2|article6|5000 6|3|20190215|customer2|article1|3000 ================================================ FILE: tests/resources/feature/materialize_cdf/data/table/streaming_with_cdf.sql ================================================ CREATE TABLE test_db.streaming_with_cdf (salesorder INT, item INT, date INT, customer STRING, article STRING, amount INT) USING DELTA PARTITIONED BY (date) LOCATION 'file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf' TBLPROPERTIES( 'delta.enableChangeDataFeed'='true' ) ================================================ FILE: tests/resources/feature/materialize_cdf/streaming_with_clean_and_vacuum.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|", "mode": "DROPMALFORMED" }, "location": "file:///app/tests/lakehouse/in/feature/materialize_cdf/streaming_with_cdf/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "db_table": "test_db.streaming_with_cdf", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf/data" } ], "terminate_specs": [ { "function": "expose_cdf", "args": { "db_table": 
"test_db.streaming_with_cdf", "materialized_cdf_location": "file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf/cdf_data", "materialized_cdf_options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf/cdf_checkpoint" }, "vacuum_cdf": true, "vacuum_hours": 240, "clean_cdf": true, "days_to_keep": 1 } } ], "exec_env": { "spark.sql.sources.partitionColumnTypeInference.enabled": true } } ================================================ FILE: tests/resources/feature/materialize_cdf/streaming_without_clean_cdf.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|", "mode": "DROPMALFORMED" }, "location": "file:///app/tests/lakehouse/in/feature/materialize_cdf/streaming_with_cdf/data", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "db_table": "test_db.streaming_with_cdf", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf/data" } ], "terminate_specs": [ { "function": "expose_cdf", "args": { "db_table": "test_db.streaming_with_cdf", "materialized_cdf_location": 
"file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf/cdf_data", "materialized_cdf_options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/materialize_cdf/streaming_with_cdf/cdf_checkpoint" }, "clean_cdf": false } } ], "exec_env": { "spark.sql.sources.partitionColumnTypeInference.enabled": true } } ================================================ FILE: tests/resources/feature/notification/test_attachement.txt ================================================ Test attachemment ================================================ FILE: tests/resources/feature/reconciliation/data/current.json ================================================ [ { "country": "pt", "consumer": 1, "date": "20211112", "net_sales": 200 }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 400 }, { "country": "pt", "consumer": 3, "date": "20211114", "net_sales": 600 } ] ================================================ FILE: tests/resources/feature/reconciliation/data/current_different_rows.json ================================================ [ { "country": "pt", "consumer": 1, "date": "20211112", "net_sales": 200 }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 400 }, { "country": "pt", "consumer": 3, "date": "20211114", "net_sales": 600 }, { "country": "es", "consumer": 4, "date": "20211115", "net_sales": 250 } ] ================================================ FILE: tests/resources/feature/reconciliation/data/current_fail.json ================================================ [ { "country": "pt", "consumer": 1, "date": "20211112", "net_sales": 100 }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 400 }, { "country": "pt", "consumer": 3, "date": "20211114", "net_sales": 600 } ] ================================================ FILE: tests/resources/feature/reconciliation/data/current_nulls_and_zeros.json ================================================ [ { "country": "pt", "consumer": 1, "date": 
"20211112", "net_sales": null }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 0 }, { "country": "pt", "consumer": 3, "date": "20211114", "net_sales": null } ] ================================================ FILE: tests/resources/feature/reconciliation/data/current_nulls_and_zeros_fail.json ================================================ [ { "country": "pt", "consumer": 1, "date": "20211112", "net_sales": 0 }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 0 }, { "country": "pt", "consumer": 3, "date": "20211114", "net_sales": null } ] ================================================ FILE: tests/resources/feature/reconciliation/data/truth.json ================================================ [ { "country": "pt", "consumer": 1, "date": "20211112", "net_sales": 200 }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 400 }, { "country": "pt", "consumer": 3, "date": "20211114", "net_sales": 600 } ] ================================================ FILE: tests/resources/feature/reconciliation/data/truth_different_rows.json ================================================ [ { "country": "pt", "consumer": 1, "date": "20211112", "net_sales": 200 }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 400 }, { "country": "pt", "consumer": 3, "date": "20211114", "net_sales": 600 }, { "country": "uk", "consumer": 4, "date": "20211115", "net_sales": 250 } ] ================================================ FILE: tests/resources/feature/reconciliation/data/truth_empty.json ================================================ [] ================================================ FILE: tests/resources/feature/reconciliation/data/truth_nulls_and_zeros.json ================================================ [ { "country": "pt", "consumer": 1, "date": "20211112", "net_sales": null }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 0 }, { "country": "pt", "consumer": 3, "date": "20211114", 
"net_sales": null } ] ================================================ FILE: tests/resources/feature/reconciliation/data/truth_nulls_and_zeros_fail.json ================================================ [ { "country": "pt", "consumer": 1, "date": "20211112", "net_sales": null }, { "country": "ge", "consumer": 2, "date": "20211113", "net_sales": 0 }, { "country": "pt", "consumer": 3, "date": "20211114", "net_sales": null } ] ================================================ FILE: tests/resources/feature/schema_evolution/append_load/batch_append_disabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/source_append_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "db_table": "test_db.schema_evolution_append_load" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "date" } } ] }, { "spec_id": "appended_sales", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "ARTICLE": "article" } } }, { "function": "incremental_filter", "args": { "input_col": "date", "increment_df": "max_sales_bronze_timestamp" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "appended_sales", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/append_load/data" } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": false } } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/batch_append_disabled_cast.json ================================================ { 
"input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/source_append_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "db_table": "test_db.schema_evolution_append_load" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "date" } } ] }, { "spec_id": "appended_sales", "input_id": "sales_source", "transformers": [ { "function": "cast", "args": { "cols": { "code": "StringType" } } }, { "function": "incremental_filter", "args": { "input_col": "date", "increment_df": "max_sales_bronze_timestamp" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "appended_sales", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/append_load/data" } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": false } } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/batch_append_enabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/source_append_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "db_table": "test_db.schema_evolution_append_load" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "date" } } 
] }, { "spec_id": "appended_sales", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "ARTICLE": "article" }, "escape_col_names": false } }, { "function": "incremental_filter", "args": { "input_col": "date", "increment_df": "max_sales_bronze_timestamp" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "appended_sales", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/append_load/data" } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": true } } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/batch_append_enabled_cast.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/source_append_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "db_table": "test_db.schema_evolution_append_load" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "date" } } ] }, { "spec_id": "appended_sales", "input_id": "sales_source", "transformers": [ { "function": "cast", "args": { "cols": { "code": "StringType" } } }, { "function": "incremental_filter", "args": { "input_col": "date", "increment_df": "max_sales_bronze_timestamp" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "appended_sales", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/append_load/data" } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": true } } 
================================================ FILE: tests/resources/feature/schema_evolution/append_load/batch_init_disabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/source_part-01_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/data" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/append_load/data" } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": false } } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/batch_init_enabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/source_part-01_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/append_load/data" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/append_load/data" } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": true } } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/control/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code|new_column 
00000000000000t|0|0|0|0|1|1|N||customer1|article1|100|1| 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100|1| 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2| 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3| 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4| 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50|6| 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50|6| 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2|new 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3|new ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/control/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100|1 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100|1 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50|6 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50|6 20180110120052t|request1|1|1|1|7|1|N|20180110||article2|120|2 20180110120052t|request1|1|1|8|4|1|X|20170430||article3|80|3 ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/control/part-05.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code|request_id 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100|1| 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100|1| 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2| 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3| 
00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4| 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50|6| 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50|6| 20180110120052t||1|1|1|7|1|N|20180110|customer5|article2|120|2|request1 20180110120052t||1|1|8|4|1|X|20170430|customer3|article3|80|3|request1 ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/control/part-06.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount|code 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article6|50|2 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70|7 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50|2 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150|6 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80|5 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5||120|2 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30|1 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200|5 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30|1 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100|3 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100|4 ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100|1 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100|1 
00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50|6 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50|6 ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code|new_column 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2|new 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|1|new 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1|new 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|6|new 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2|new 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|2|new 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|4|new 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3|new ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/source/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|ARTICLE|amount|code 20180110120052t|request1|1|1|1|7|1|N|20180110|article2|120|2 20180110120052t|request1|1|1|2|1|1|X|20160601|article1|100|1 20180110120052t|request1|1|1|3|1|1||20160601|article1|150|1 20180110120052t|request1|1|1|4|2|2|X|20170215|article6|50|6 20180110120052t|request1|1|1|5|2|2||20170215|article2|50|2 20180110120052t|request1|1|1|6|3|2|D|20170215|article2|120|2 20180110120052t|request1|1|1|7|3|3|R|20170215|article4|-90|4 20180110120052t|request1|1|1|8|4|1|X|20170430|article3|80|3 
================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|1 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|6 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|2 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|4 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3 ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/source/part-05.csv ================================================ actrequest_timestamp|request_id|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|1 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|6 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|2 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|4 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3 ================================================ FILE: tests/resources/feature/schema_evolution/append_load/data/source/part-06.csv ================================================ 
actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|1 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|6 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|2 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|4 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3 ================================================ FILE: tests/resources/feature/schema_evolution/append_load/schema/control/control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: 
tests/resources/feature/schema_evolution/append_load/schema/control/control_schema_add_column.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} }, { "name": "new_column", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/schema/control/control_schema_rename.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", 
"nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} }, { "name": "request_id", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/schema/source/source_part-01_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/schema/source/source_part-02_schema.json ================================================ { 
"type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} }, { "name": "new_column", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/schema/source/source_part-03_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", 
"nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/schema/source/source_part-04_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/schema/source/source_part-05_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_id", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": 
"integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/append_load/schema/source/source_part-06_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } 
================================================ FILE: tests/resources/feature/schema_evolution/delta_load/batch_delta_disabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/source_delta_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/delta_load/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "cast", "args": { "cols": { "code": "StringType" } } }, { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp" } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/delta_load/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')" } } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": false } } ================================================ FILE: 
tests/resources/feature/schema_evolution/delta_load/batch_delta_disabled_rename.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/source_delta_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/delta_load/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "ARTICLE": "article" }, "escape_col_names": false } }, { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp" } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/delta_load/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')" } } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": false } } ================================================ FILE: 
tests/resources/feature/schema_evolution/delta_load/batch_delta_enabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/source_delta_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/data" }, { "spec_id": "sales_bronze", "read_type": "batch", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/delta_load/data" } ], "transform_specs": [ { "spec_id": "max_sales_bronze_timestamp", "input_id": "sales_bronze", "transformers": [ { "function": "get_max_value", "args": { "input_col": "actrequest_timestamp" } } ] }, { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "cast", "args": { "cols": { "code": "StringType" } } }, { "function": "rename", "args": { "cols": { "ARTICLE": "article" }, "escape_col_names": false } }, { "function": "incremental_filter", "args": { "input_col": "actrequest_timestamp", "increment_df": "max_sales_bronze_timestamp" } }, { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/delta_load/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date", "delete_predicate": "new.recordmode in ('R','D','X')" } } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": true } } 
================================================ FILE: tests/resources/feature/schema_evolution/delta_load/batch_init_disabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/source_part-01_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": "condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "ranking_key_asc": [ "recordmode" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/delta_load/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": false } } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/batch_init_enabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/source_part-01_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/delta_load/data" } ], "transform_specs": [ { "spec_id": "condensed_sales", "input_id": "sales_source", "transformers": [ { "function": 
"condense_record_mode_cdc", "args": { "business_key": [ "salesorder", "item" ], "ranking_key_desc": [ "actrequest_timestamp", "datapakid", "partno", "record" ], "ranking_key_asc": [ "recordmode" ], "record_mode_col": "recordmode", "valid_record_modes": [ "", "N", "R", "D", "X" ] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "condensed_sales", "write_type": "merge", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/delta_load/data", "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date" } } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": true } } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/control/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount|code|new_column 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2| 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4| 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2|new 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70|7| 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50|2| 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150|6| 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80|5| 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2|new 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1|new 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3| 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30|1| 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200|5| 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30|1| 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100|3| 
00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100|4| ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/control/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70|7 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50|2 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150|6 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80|5 20180110120052t|request1|1|1|1|7|1|N|20180110||article2|120|2 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30|1 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200|5 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30|1 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100|3 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100|4 ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/control/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article6|50|2 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70|7 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50|2 
00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150|6 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80|5 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5||120|2 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30|1 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200|5 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30|1 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100|3 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100|4 ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/control/part-05.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code|request_id 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2| 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4| 20180110120052t|0|1|1|5|2|2||20170215|customer2|article2|50|2|request1 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70|7| 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50|2| 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150|6| 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80|5| 20180110120052t||1|1|1|7|1|N|20180110|customer5|article2|120|2|request1 20180110120052t|0|1|1|3|1|1||20160601|customer1|article1|150|1|request1 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3| 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30|1| 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200|5| 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30|1| 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100|3| 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100|4| ================================================ FILE: 
tests/resources/feature/schema_evolution/delta_load/data/control/part-06.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount|code 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70|7 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50|2 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150|6 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80|5 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30|1 00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200|5 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30|1 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100|3 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100|4 ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100|1 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100|1 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50|6 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50|6 00000000000000t|0|0|0|0|2|3|N|20170215|customer2|article1|30|1 
00000000000000t|0|0|0|0|3|1|N|20170215|customer1|article5|200|5 00000000000000t|0|0|0|0|3|2|N|20170215|customer1|article2|120|2 00000000000000t|0|0|0|0|3|3|N|20170215|customer1|article4|90|4 00000000000000t|0|0|0|0|4|1|N|20170430|customer3|article3|80|3 00000000000000t|0|0|0|0|4|2|N|20170430|customer3|article7|70|7 00000000000000t|0|0|0|0|4|3|N|20170430|customer3|article1|30|1 00000000000000t|0|0|0|0|4|4|N|20170430|customer3|article2|50|2 00000000000000t|0|0|0|0|5|1|N|20170510|customer4|article6|150|6 00000000000000t|0|0|0|0|5|2|N|20170510|customer4|article3|100|3 00000000000000t|0|0|0|0|5|3|N|20170510|customer4|article5|80|5 00000000000000t|0|0|0|0|6|1|N|20170601|customer2|article4|100|4 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/source/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code|new_column 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2|new 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|1|new 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1|new 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|6|new 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2|new 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|2|new 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|4|new 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3|new ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/source/part-03.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|ARTICLE|amount|code 
20180110120052t|request1|1|1|1|7|1|N|20180110|article2|120|2 20180110120052t|request1|1|1|2|1|1|X|20160601|article1|100|1 20180110120052t|request1|1|1|3|1|1||20160601|article1|150|1 20180110120052t|request1|1|1|4|2|2|X|20170215|article6|50|6 20180110120052t|request1|1|1|5|2|2||20170215|article2|50|2 20180110120052t|request1|1|1|6|3|2|D|20170215|article2|120|2 20180110120052t|request1|1|1|7|3|3|R|20170215|article4|-90|4 20180110120052t|request1|1|1|8|4|1|X|20170430|article3|80|3 ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/source/part-04.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|1 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|6 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|2 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|4 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3 ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/source/part-05.csv ================================================ actrequest_timestamp|request_id|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|1 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|6 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2 
20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|2 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|4 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3 ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/data/source/part-06.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5|article2|120|2 20180110120052t|request1|1|1|2|1|1|X|20160601|customer1|article1|100|1 20180110120052t|request1|1|1|3|1|1||20160601|customer1|article1|150|1 20180110120052t|request1|1|1|4|2|2|X|20170215|customer2|article6|50|6 20180110120052t|request1|1|1|5|2|2||20170215|customer2|article2|50|2 20180110120052t|request1|1|1|6|3|2|D|20170215|customer1|article2|120|2 20180110120052t|request1|1|1|7|3|3|R|20170215|customer1|article4|-90|4 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3|article3|80|3 ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/control/control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": 
"customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/control/control_schema_add_column.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} }, { "name": "new_column", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/control/control_schema_rename.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { 
"name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} }, { "name": "request_id", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/source/source_part-01_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": 
"amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/source/source_part-02_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} }, { "name": "new_column", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/source/source_part-03_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": 
"record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/source/source_part-04_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/source/source_part-05_schema.json 
================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_id", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/delta_load/schema/source/source_part-06_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} 
}, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/full_load/batch_init.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/full_load/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/full_load/data" } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/full_load/data" } ] } ================================================ FILE: tests/resources/feature/schema_evolution/full_load/batch_merge_disabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/full_load/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/full_load/data" } ], "transform_specs": [ { "spec_id": "transformed_sales", "input_id": "sales_source", "transformers": [ { "function": "cast", "args": { "cols": { "code": "StringType" } } }, { "function": "rename", "args": { "cols": { "ARTICLE": "article" } } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "transformed_sales", "write_type": "overwrite", "data_format": "delta", "location": 
"file:///app/tests/lakehouse/out/feature/schema_evolution/full_load/data" } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": false } } ================================================ FILE: tests/resources/feature/schema_evolution/full_load/batch_merge_enabled.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/full_load/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/full_load/data" } ], "transform_specs": [ { "spec_id": "transformed_sales", "input_id": "sales_source", "transformers": [ { "function": "cast", "args": { "cols": { "code": "StringType" } } }, { "function": "rename", "args": { "cols": { "ARTICLE": "article" } } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/full_load/data" } ], "exec_env": { "spark.databricks.delta.schema.autoMerge.enabled": true } } ================================================ FILE: tests/resources/feature/schema_evolution/full_load/batch_overwrite.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/schema_evolution/full_load/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/schema_evolution/full_load/data" } ], "transform_specs": [ { "spec_id": "transformed_sales", "input_id": "sales_source", "transformers": [ { "function": "cast", "args": { "cols": { "code": "StringType" } } }, { "function": "rename", "args": { "cols": { "ARTICLE": "article" }, "escape_col_names": false } } ] } ], 
"output_specs": [ { "spec_id": "sales_bronze", "input_id": "transformed_sales", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/schema_evolution/full_load/data", "options": { "overwriteSchema": true } } ] } ================================================ FILE: tests/resources/feature/schema_evolution/full_load/data/control/part-02.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|article|amount|code|new_column 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100|1| 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100|1| 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2| 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3| 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4| 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50|6| 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50|6| 20180110120052t|request1|1|1|1|7|1|N|20180110|customer5||120|2|new 20180110120052t|request1|1|1|8|4|1|X|20170430|customer3||80|3|new ================================================ FILE: tests/resources/feature/schema_evolution/full_load/data/source/part-01.csv ================================================ actrequest_timestamp|request|datapakid|partno|record|salesorder|item|recordmode|date|customer|ARTICLE|amount|code 00000000000000t|0|0|0|0|1|1|N||customer1|article1|100|1 00000000000000t|0|0|0|0|1|1||20160601|customer1|article1|100|1 00000000000000t|0|0|0|0|1|2|N|20160601|customer1|article2|200|2 00000000000000t|0|0|0|0|1|3|N|20160601|customer1|article3|50|3 00000000000000t|0|0|0|0|2|1|N|20170215|customer2|article4|10|4 00000000000000t|0|0|0|0|2|2||20170215|customer2|article6|50|6 00000000000000t|0|0|0|0|2|2|N||customer2|article6|50|6 ================================================ FILE: tests/resources/feature/schema_evolution/full_load/data/source/part-02.csv 
================================================ actrequest_timestamp|request_id|datapakid|partno|record|salesorder|item|recordmode|date|ARTICLE|amount|code|new_column 20180110120052t|request1|1|1|1|7|1|N|20180110|article2|120|2|new 20180110120052t|request1|1|1|2|1|1|X|20160601|article1|100|1|new 20180110120052t|request1|1|1|3|1|1||20160601|article1|150|1|new 20180110120052t|request1|1|1|4|2|2|X|20170215|article6|50|6|new 20180110120052t|request1|1|1|5|2|2||20170215|article2|50|2|new 20180110120052t|request1|1|1|6|3|2|D|20170215|article2|120|2|new 20180110120052t|request1|1|1|7|3|3|R|20170215|article4|-90|4|new 20180110120052t|request1|1|1|8|4|1|X|20170430|article3|80|3|new ================================================ FILE: tests/resources/feature/schema_evolution/full_load/schema/control/control_schema_merge_enabled.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} }, { "name": "request_id", "type": "string", "nullable": true, "metadata": {} }, { "name": 
"new_column", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/full_load/schema/control/control_schema_overwrite.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_id", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "string", "nullable": true, "metadata": {} }, { "name": "new_column", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/full_load/schema/source/source_part-01_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} 
}, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/schema_evolution/full_load/schema/source/source_part-02_schema.json ================================================ { "type": "struct", "fields": [ { "name": "actrequest_timestamp", "type": "string", "nullable": true, "metadata": {} }, { "name": "request_id", "type": "string", "nullable": true, "metadata": {} }, { "name": "datapakid", "type": "integer", "nullable": true, "metadata": {} }, { "name": "partno", "type": "integer", "nullable": true, "metadata": {} }, { "name": "record", "type": "integer", "nullable": true, "metadata": {} }, { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "recordmode", "type": "string", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "ARTICLE", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "code", "type": "integer", "nullable": true, "metadata": {} }, { "name": "new_column", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/sftp_reader/data/file.csv ================================================ column1|column2 1|1 ================================================ FILE: 
tests/resources/feature/sftp_reader/data/file1.csv ================================================ column1|column2 2|2 ================================================ FILE: tests/resources/feature/sftp_reader/data/file2.csv ================================================ column1|column2 3|3 ================================================ FILE: tests/resources/feature/sftp_reader/data/file3.json ================================================ {"colUserName":"TestName", "colCity":"TestCity", "colState":"TestState"} ================================================ FILE: tests/resources/feature/sftp_reader/data/file4.xml ================================================ userOne 50 CityTest userTwo 40 CityTest2 userThree 30 CityTest3 ================================================ FILE: tests/resources/feature/sftp_reader/data/file5.txt ================================================ value1 value2 value3 ================================================ FILE: tests/resources/feature/sharepoint/exceptions/acons/drive_exception.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/sharepoint/data/", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": 
"TENANT_TEST", "secret": "CLIENT_SECRET", "site_name": "mock_site", "drive_name": "", "folder_relative_path": "sp_test", "file_name": "sharepoint_test.csv", "local_path": "mock_path", "conflict_behaviour": "replace" } } ] } ================================================ FILE: tests/resources/feature/sharepoint/exceptions/acons/endpoint_exception.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/sharepoint/data/", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_TEST", "secret": "CLIENT_SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "folder_relative_path": "sp_test", "file_name": "sharepoint_test.csv", "local_path": "mock_path", "conflict_behaviour": "replace" } } ] } ================================================ FILE: tests/resources/feature/sharepoint/exceptions/acons/local_path_exception.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/sharepoint/data/", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": 
"integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_TEST", "secret": "CLIENT_SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "folder_relative_path": "sp_test", "file_name": "sharepoint_test.csv", "local_path": "", "conflict_behaviour": "replace" } } ] } ================================================ FILE: tests/resources/feature/sharepoint/exceptions/acons/site_exception.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/sharepoint/data/", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_TEST", "secret": "CLIENT_SECRET", "site_name": "", "drive_name": "mock_drive", 
"folder_relative_path": "sp_test", "file_name": "sharepoint_test.csv", "local_path": "mock_path", "conflict_behaviour": "replace" } } ] } ================================================ FILE: tests/resources/feature/sharepoint/exceptions/acons/streaming_exception.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "streaming", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "file:///app/tests/lakehouse/in/feature/sharepoint/data/", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_TEST", "secret": "CLIENT_SECRET", "site_name": "files_ingestion", "drive_name": "Exports_DART_dev", "folder_relative_path": "sp_test", "file_name": "sharepoint_test.csv", "local_path": "LOCAL_PATH", "conflict_behaviour": "replace" } } ] } ================================================ FILE: tests/resources/feature/sharepoint/exceptions/schemas/schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": 
"string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_file_name_and_file_pattern_conflict_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": "sample_1.csv", "file_pattern": "sample_*", "local_options": { "header": true, "delimiter": ",", "inferSchema": true } } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_conflict_file_name_pattern", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_conflict_file_name_pattern/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_file_name_unsupported_extension_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": "bad.txt", "local_options": { "header": true, "delimiter": ",", "inferSchema": true } } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_bad_extension", "write_type": "overwrite", 
"location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_bad_extension/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_csv_archive_enabled_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_folder_archive_enabled", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_archive_enabled/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_csv_archive_success_subfolder_override_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_pattern": "*", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true, "archive_success_subfolder": "processed" } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": 
"test_db.sharepoint_reader_folder_archive_success_subfolder_override", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_archive_success_subfolder_override/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_csv_no_csv_files_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_folder_no_csv_files", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_no_csv_files/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_csv_one_file_schema_mismatch_custom_error_subfolder_should_archive_error.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_pattern": "*", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true, "archive_error_subfolder": "failed" } } ], "output_specs": [ { 
"spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_folder_schema_mismatch_custom_error_subfolder", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_schema_mismatch_custom_error_subfolder/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_csv_one_file_schema_mismatch_should_archive_error.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_pattern": "*", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true, "archive_success_subfolder": "done", "archive_error_subfolder": "error" } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_folder_schema_mismatch", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_schema_mismatch/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_csv_pattern_matches_no_files_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", 
"file_pattern": "does_not_match_*.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_folder_pattern_matches_no_files", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_pattern_matches_no_files/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_csv_pattern_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_pattern": "sample_*", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": false } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_folder_pattern", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_pattern/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_csv_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": 
"sp_test", "file_pattern": "*", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": false } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_folder", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_path_does_not_exist_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "missing_folder", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": false } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_missing_folder", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_missing_folder/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_folder_relative_path_looks_like_file_unsupported_extension_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": 
"sp_test/bad.txt", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": false } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_folder_path_bad_ext", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_folder_path_bad_ext/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_archive_default_enabled_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": "sample_1.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true } } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_single_archive_default_enabled", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_single_archive_default_enabled/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_archive_enabled_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": 
"sample_1.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_single_archive_enabled", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_single_archive_enabled/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_archive_success_subfolder_override_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": "sample_1.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true, "archive_success_subfolder": "processed" } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_single_archive_success_subfolder_override", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_single_archive_success_subfolder_override/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_download_error_should_archive_error.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": 
"mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": "sample_1.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_single_download_error", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_single_download_error/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_empty_file_should_archive_error.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": "empty.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true, "archive_success_subfolder": "done", "archive_error_subfolder": "error" } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_single_empty_file", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_single_empty_file/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_full_path_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", 
"tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test/sample_1.csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": false } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_single_full_path", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_full_path/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_full_path_with_file_name_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test/sample_1.csv", "file_name": "other.csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": false } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_conflict", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_conflict/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_full_path_with_file_pattern_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": 
"SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test/sample_1.csv", "file_pattern": "*.csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true } } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_full_path_with_file_pattern_should_fail", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_full_path_with_file_pattern_should_fail/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_full_path_with_file_type_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test/sample_1.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true } } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_full_path_with_file_type_should_fail", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_full_path_with_file_type_should_fail/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_spark_load_fails_should_archive_error.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", 
"tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": "sample_1.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": true } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_single_spark_load_fails", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_single_spark_load_fails/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_single_csv_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", "secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_name": "sample_1.csv", "file_type": "csv", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": false } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_single", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/acons/read_unsupported_file_type_should_fail.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_ID", 
"secret": "SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/reader/tmp/", "folder_relative_path": "sp_test", "file_type": "json", "local_options": { "header": true, "delimiter": ",", "inferSchema": true }, "archive_enabled": false } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "delta", "db_table": "test_db.sharepoint_reader_bad_file_type", "write_type": "overwrite", "location": "/app/tests/lakehouse/out/feature/sharepoint/reader/delta_bad_file_type/" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/data/bad_schema.csv ================================================ col_a,col_c 1,999 ================================================ FILE: tests/resources/feature/sharepoint/reader/data/other.csv ================================================ col_a,col_b 999,999 ================================================ FILE: tests/resources/feature/sharepoint/reader/data/sample_1.csv ================================================ col_a,col_b 1,2 ================================================ FILE: tests/resources/feature/sharepoint/reader/data/sample_2.csv ================================================ col_a,col_b 3,4 ================================================ FILE: tests/resources/feature/sharepoint/reader/mocks/get_drive_id.json ================================================ { "value": [ { "name": "mock_drive", "id": "test_drive_id" } ] } ================================================ FILE: tests/resources/feature/sharepoint/reader/mocks/get_file_metadata.json ================================================ { "id": "test_item_id", "name": "sample.csv", "createdDateTime": "2026-01-01T00:00:00Z", "lastModifiedDateTime": "2026-01-01T00:00:00Z", "@microsoft.graph.downloadUrl": "https://download.mock/sample.csv" } ================================================ FILE: 
tests/resources/feature/sharepoint/reader/mocks/get_site_id.json ================================================ { "id": "test_site_id", "displayName": "mock_site" } ================================================ FILE: tests/resources/feature/sharepoint/reader/mocks/rename_file.json ================================================ {} ================================================ FILE: tests/resources/feature/sharepoint/writer/acons/write_to_local_success.json ================================================ { "input_specs": [ { "spec_id": "sharepoint_input", "read_type": "batch", "data_format": "csv", "options": { "header": true, "delimiter": "|" }, "location": "/app/tests/lakehouse/in/feature/sharepoint/data/", "schema": { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } } ], "output_specs": [ { "spec_id": "sharepoint_output", "input_id": "sharepoint_input", "data_format": "sharepoint", "sharepoint_opts": { "client_id": "CLIENT_ID", "tenant_id": "TENANT_TEST", "secret": "CLIENT_SECRET", "site_name": "mock_site", "drive_name": "mock_drive", "folder_relative_path": "sp_test", "file_name": "sharepoint_test", "local_path": "/app/tests/lakehouse/out/feature/sharepoint/writer/data/", "conflict_behaviour": "replace" } } ] } ================================================ FILE: tests/resources/feature/sharepoint/writer/data/file_control.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 
1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/sharepoint/writer/data/file_source.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/sharepoint/writer/mocks/create_upload_session.json ================================================ { "uploadUrl": "test_site_id" } ================================================ FILE: tests/resources/feature/sharepoint/writer/mocks/get_drive_id.json ================================================ { "value": [ { "name": "mock_drive", "id": "test_drive_id" } ] } ================================================ FILE: tests/resources/feature/sharepoint/writer/mocks/get_site_id.json ================================================ { "id": "test_site_id", "displayName": "mock_site" } ================================================ FILE: tests/resources/feature/sharepoint/writer/schemas/schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/table_manager/compute_table_statistics/table_stats_complex_default_scenario1.json ================================================ { "function": "compute_table_statistics", "table_or_view": "test_db.DummyTableBronzeComplexDefaultScenario1" } 
================================================ FILE: tests/resources/feature/table_manager/compute_table_statistics/table_stats_complex_default_scenario2.json ================================================ { "function": "compute_table_statistics", "table_or_view": "test_db.DummyTableBronzeComplexDefaultScenario2" } ================================================ FILE: tests/resources/feature/table_manager/compute_table_statistics/table_stats_complex_different_delimiter_scenario1.json ================================================ { "function": "compute_table_statistics", "table_or_view": "test_db.DummyTableBronzeComplexDifferentDelimiterScenario1" } ================================================ FILE: tests/resources/feature/table_manager/compute_table_statistics/table_stats_complex_different_delimiter_scenario2.json ================================================ { "function": "compute_table_statistics", "table_or_view": "test_db.DummyTableBronzeComplexDifferentDelimiterScenario2" } ================================================ FILE: tests/resources/feature/table_manager/compute_table_statistics/table_stats_simple_split_scenario.json ================================================ { "function": "compute_table_statistics", "table_or_view": "test_db.DummyTableBronzeSimpleSplitScenario" } ================================================ FILE: tests/resources/feature/table_manager/create/acon_create_table.json ================================================ { "function": "create_table", "path": "file:///app/tests/lakehouse/in/feature/table_manager/create/table/test_table.sql" } ================================================ FILE: tests/resources/feature/table_manager/create/acon_create_table_complex_default_scenario.json ================================================ { "function": "create_table", "path": "file:///app/tests/lakehouse/in/feature/table_manager/create/table/test_table_complex_default_scenario.sql", "delimiter": ";", "advanced_parser": 
true } ================================================ FILE: tests/resources/feature/table_manager/create/acon_create_table_complex_different_delimiter_scenario.json ================================================ { "function": "create_table", "path": "file:///app/tests/lakehouse/in/feature/table_manager/create/table/test_table_complex_different_delimiter_scenario.sql", "delimiter": "===", "advanced_parser": true } ================================================ FILE: tests/resources/feature/table_manager/create/acon_create_table_simple_split_scenario.json ================================================ { "function": "create_table", "path": "file:///app/tests/lakehouse/in/feature/table_manager/create/table/test_table_simple_split_scenario.sql" } ================================================ FILE: tests/resources/feature/table_manager/create/acon_create_view.json ================================================ { "function": "create_view", "path": "file:///app/tests/lakehouse/in/feature/table_manager/create/view/test_view.sql" } ================================================ FILE: tests/resources/feature/table_manager/create/acon_create_view_complex_default_scenario.json ================================================ { "function": "create_view", "path": "file:///app/tests/lakehouse/in/feature/table_manager/create/view/test_view_complex_default_scenario.sql", "advanced_parser": true } ================================================ FILE: tests/resources/feature/table_manager/create/acon_create_view_complex_different_delimiter_scenario.json ================================================ { "function": "create_view", "path": "file:///app/tests/lakehouse/in/feature/table_manager/create/view/test_view_complex_different_delimiter_scenario.sql", "delimiter": "===", "advanced_parser": true } ================================================ FILE: tests/resources/feature/table_manager/create/acon_create_view_simple_split_scenario.json 
================================================ { "function": "create_view", "path": "file:///app/tests/lakehouse/in/feature/table_manager/create/view/test_view_simple_split_scenario.sql" } ================================================ FILE: tests/resources/feature/table_manager/create/table/test_table_complex_default_scenario.sql ================================================ -- New table manager test table, to check if new parser works as expected and deals well with different delimiters (;). -- The parser must be able to deal with the delimiters that are inside of "", '', --, /* */. CREATE TABLE test_db.DummyTableBronzeComplexDefaultScenario1 ( id INT COMMENT 'id with special (< characters ;', col1 STRING COMMENT 'col1 with >) special character " and ;', col2 INT COMMENT 'col2 with () special character \" and ;', col3 BOOLEAN COMMENT 'col3 with special <> character \" and ;', col4 STRING COMMENT "col4 with special /* character ;", year INT COMMENT "year with */ special character ;", month INT COMMENT "month with special -- character ;", day INT COMMENT "day with special \" character ;" ) USING DELTA PARTITIONED BY (year, month, day) LOCATION 'file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data_complex_default_scenario1' TBLPROPERTIES('lakehouse.primary_key'=' id, `col1`'); -- New table manager test table, to check if new parser works as expected and deals well with different delimiters (;). -- The parser must be able to deal with the delimiters that are inside of "", '', --, /* */. /* New table manager test table, to check if new parser works as expected and deals well with different delimiters (;). 
The parser must be able to deal with the delimiters that are inside of "", '', -- */ CREATE TABLE test_db.DummyTableBronzeComplexDefaultScenario2 ( id INT COMMENT 'id with special (< characters ;', col1 STRING COMMENT 'col1 with >) special character " and ;', col2 INT COMMENT 'col2 with () special character \" and ;', col3 BOOLEAN COMMENT 'col3 with special <> character \" and ;', col4 STRING COMMENT "col4 with special /* character ;", year INT COMMENT "year with */ special character ;", month INT COMMENT "month with special -- character ;", day INT COMMENT "day with special \" character ;" ) USING DELTA PARTITIONED BY (year, month, day) LOCATION 'file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data_complex_default_scenario2' TBLPROPERTIES('lakehouse.primary_key'=' id, `col1`') ================================================ FILE: tests/resources/feature/table_manager/create/table/test_table_complex_different_delimiter_scenario.sql ================================================ -- New table manager test table, to check if new parser works as expected and deals well with different delimiters (===). -- The parser must be able to deal with the delimiters that are inside of "", '', --, /* */. 
CREATE TABLE test_db.DummyTableBronzeComplexDifferentDelimiterScenario1 ( id INT COMMENT 'id with special (< characters ;', col1 STRING COMMENT 'col1 with >) special character " and ;', col2 INT COMMENT 'col2 with () special character \" and ;', col3 BOOLEAN COMMENT 'col3 with special <> character \" and ;', col4 STRING COMMENT "col4 with special /* character ;", year INT COMMENT "year with */ special character ;", month INT COMMENT "month with special -- character ;", day INT COMMENT "day with special \" character ;" ) USING DELTA PARTITIONED BY (year, month, day) LOCATION 'file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data_complex_different_delimiter_scenario1' TBLPROPERTIES('lakehouse.primary_key'=' id, `col1`')=== -- New table manager test table, to check if new parser works as expected and deals well with different delimiters (===). -- The parser must be able to deal with the delimiters that are inside of "", '', --, /* */. /* New table manager test table, to check if new parser works as expected and deals well with different delimiters (===). 
The parser must be able to deal with the delimiters that are inside of "", '', -- */ CREATE TABLE test_db.DummyTableBronzeComplexDifferentDelimiterScenario2 ( id INT COMMENT 'id with special (< characters ;', col1 STRING COMMENT 'col1 with >) special character " and ;', col2 INT COMMENT 'col2 with () special character \" and ;', col3 BOOLEAN COMMENT 'col3 with special <> character \" and ;', col4 STRING COMMENT "col4 with special /* character ;", year INT COMMENT "year with */ special character ;", month INT COMMENT "month with special -- character ;", day INT COMMENT "day with special \" character ;" ) USING DELTA PARTITIONED BY (year, month, day) LOCATION 'file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data_complex_different_delimiter_scenario2' TBLPROPERTIES('lakehouse.primary_key'=' id, `col1`') ================================================ FILE: tests/resources/feature/table_manager/create/table/test_table_simple_split_scenario.sql ================================================ CREATE TABLE test_db.DummyTableBronzeSimpleSplitScenario (id INT, col1 STRING, col2 INT, col3 BOOLEAN, col4 STRING, year INT, month INT, day INT) USING DELTA PARTITIONED BY (year, month, day) LOCATION 'file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data_simple_split_scenario' TBLPROPERTIES('lakehouse.primary_key'=' id, `col1`') ================================================ FILE: tests/resources/feature/table_manager/create/view/test_view_complex_default_scenario.sql ================================================ -- New table manager test view, to check if new parser works as expected and deals well with different delimiters (;). -- The parser must be able to deal with the delimiters that are inside of "", '', --, /* */. 
CREATE VIEW test_db.DummyViewBronzeComplexDefaultScenario1 (id,col1,col2,col3,col4) AS SELECT id,col1,CONCAT_WS(";",col2) AS col2,col3,col4 FROM test_db.DummyTableBronzeComplexDefaultScenario1; -- New table manager test view, to check if new parser works as expected and deals well with different delimiters (;). -- The parser must be able to deal with the delimiters that are inside of "", '', --, /* */. CREATE VIEW test_db.DummyViewBronzeComplexDefaultScenario2 (id,col1,col2,col3,col4) AS SELECT id,col1,col2,CONCAT_WS(";",col3) AS col3,col4 FROM test_db.DummyTableBronzeComplexDefaultScenario2 ================================================ FILE: tests/resources/feature/table_manager/create/view/test_view_complex_different_delimiter_scenario.sql ================================================ -- New table manager test view, to check if new parser works as expected and deals well with different delimiters (===). -- The parser must be able to deal with the delimiters that are inside of "", '', --, /* */. CREATE VIEW test_db.DummyViewBronzeComplexDifferentDelimiterScenario1 (id,col1,col2,col3,col4) AS SELECT id,col1,CONCAT_WS(";",col2) AS col2,col3,col4 FROM test_db.DummyTableBronzeComplexDifferentDelimiterScenario1=== -- New table manager test view, to check if new parser works as expected and deals well with different delimiters (===). -- The parser must be able to deal with the delimiters that are inside of "", '', --, /* */. 
CREATE VIEW test_db.DummyViewBronzeComplexDifferentDelimiterScenario2 (id,col1,col2,col3,col4) AS SELECT id,col1,col2,CONCAT_WS(";",col3) AS col3,col4 FROM test_db.DummyTableBronzeComplexDifferentDelimiterScenario2 ================================================ FILE: tests/resources/feature/table_manager/create/view/test_view_simple_split_scenario.sql ================================================ CREATE VIEW test_db.DummyViewBronzeSimpleSplitScenario (id,col1,col2,col3,col4) AS SELECT id,col1,col2,col3,col4 FROM test_db.DummyTableBronzeSimpleSplitScenario ================================================ FILE: tests/resources/feature/table_manager/delete/acon_delete_where_table_simple_split_scenario.json ================================================ { "function": "delete_where", "table_or_view": "test_db.DummyTableBronzeSimpleSplitScenario", "where_clause": "year=2021" } ================================================ FILE: tests/resources/feature/table_manager/describe/acon_describe_simple_split_scenario.json ================================================ { "function": "describe", "table_or_view": "test_db.DummyTableBronzeSimpleSplitScenario" } ================================================ FILE: tests/resources/feature/table_manager/drop/acon_drop_table_simple_split_scenario.json ================================================ { "function": "drop_table", "table_or_view": "test_db.DummyTableBronzeSimpleSplitScenario" } ================================================ FILE: tests/resources/feature/table_manager/drop/acon_drop_view_simple_split_scenario.json ================================================ { "function": "drop_view", "table_or_view": "test_db.DummyViewBronzeSimpleSplitScenario" } ================================================ FILE: tests/resources/feature/table_manager/execute_sql/acon_execute_sql_complex_default_scenario.json ================================================ { "function": "execute_sql", "sql": "/* New table manager 
test view, to check if new parser works as expected and deals well to different delimiters (;).The parser must be able to deal with the delimiters that are inside of \"\", '', --, */ ALTER TABLE test_db.DummyTableBronzeComplexDefaultScenario1 ALTER COLUMN col1 COMMENT 'comment ; for col1'; /* New table manager test view, to check if new parser works as expected and deals well to different delimiters (;). The parser must be able to deal with the delimiters that are inside of \"\", '', --, */ ALTER TABLE test_db.DummyTableBronzeComplexDefaultScenario1 ALTER COLUMN col2 COMMENT 'comment for col2'; /* New table manager test view, to check if new parser works as expected and deals well to different delimiters (;). The parser must be able to deal with the delimiters that are inside of \"\", '', --, */ ALTER TABLE test_db.DummyTableBronzeComplexDefaultScenario2 ALTER COLUMN col1 COMMENT 'comment \" for col1'; /* New table manager test view, to check if new parser works as expected and deals well to different delimiters (;). The parser must be able to deal with the delimiters that are inside of \"\", '', --, */ ALTER TABLE test_db.DummyTableBronzeComplexDefaultScenario2 ALTER COLUMN col2 COMMENT 'comment () <> for col2'", "advanced_parser": "True" } ================================================ FILE: tests/resources/feature/table_manager/execute_sql/acon_execute_sql_complex_different_delimiter_scenario.json ================================================ { "function": "execute_sql", "sql": "/* New table manager test view, to check if new parser works as expected and deals well to different delimiters (===).The parser must be able to deal with the delimiters that are inside of \"\", '', --, */ ALTER TABLE test_db.DummyTableBronzeComplexDefaultScenario1 ALTER COLUMN col1 COMMENT 'comment === for col1'=== /* New table manager test view, to check if new parser works as expected and deals well to different delimiters (===). 
The parser must be able to deal with the delimiters that are inside of \"\", '', --, */ ALTER TABLE test_db.DummyTableBronzeComplexDefaultScenario1 ALTER COLUMN col2 COMMENT 'comment for col2'=== /* New table manager test view, to check if new parser works as expected and deals well to different delimiters (===). The parser must be able to deal with the delimiters that are inside of \"\", '', --, */ ALTER TABLE test_db.DummyTableBronzeComplexDefaultScenario2 ALTER COLUMN col1 COMMENT 'comment \" for col1'=== /* New table manager test view, to check if new parser works as expected and deals well to different delimiters (===). The parser must be able to deal with the delimiters that are inside of \"\", '', --, */ ALTER TABLE test_db.DummyTableBronzeComplexDefaultScenario2 ALTER COLUMN col2 COMMENT 'comment () <> for col2'", "delimiter": "===", "advanced_parser": "True" } ================================================ FILE: tests/resources/feature/table_manager/execute_sql/acon_execute_sql_simple_split_scenario.json ================================================ { "function": "execute_sql", "sql": "ALTER TABLE test_db.DummyTableBronzeSimpleSplitScenario ALTER COLUMN col1 COMMENT 'comment for col1'" } ================================================ FILE: tests/resources/feature/table_manager/get_tbl_pk/get_tbl_pk_simple_split_scenario.json ================================================ { "function": "get_tbl_pk", "table_or_view": "test_db.DummyTableBronzeSimpleSplitScenario" } ================================================ FILE: tests/resources/feature/table_manager/optimize/optimize_location.json ================================================ { "function": "optimize", "path": "file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data", "where_clause": "year >= 2021 and month >= 09 and day > 01", "optimize_zorder_col_list": "col1,col2" } ================================================ FILE: 
tests/resources/feature/table_manager/optimize/optimize_location_simple_split_scenario.json ================================================ { "function": "optimize", "path": "file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data_simple_split_scenario", "where_clause": "year >= 2021 and month >= 09 and day > 01", "optimize_zorder_col_list": "col1,col2" } ================================================ FILE: tests/resources/feature/table_manager/optimize/optimize_table.json ================================================ { "function": "optimize", "table_or_view": "test_db.DummyTableBronze", "where_clause": "year >= 2021 and month >= 09 and day > 01", "optimize_zorder_col_list": "col1,col2" } ================================================ FILE: tests/resources/feature/table_manager/optimize/optimize_table_simple_split_scenario.json ================================================ { "function": "optimize", "table_or_view": "test_db.DummyTableBronzeSimpleSplitScenario", "where_clause": "year >= 2021 and month >= 09 and day > 01", "optimize_zorder_col_list": "col1,col2" } ================================================ FILE: tests/resources/feature/table_manager/show_tbl_properties/show_tbl_properties_simple_split_scenario.json ================================================ { "function": "show_tbl_properties", "table_or_view": "test_db.DummyTableBronzeSimpleSplitScenario" } ================================================ FILE: tests/resources/feature/table_manager/vacuum/acon_vacuum_location.json ================================================ { "function": "vacuum", "path": "file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data", "vacuum_hours": 185 } ================================================ FILE: tests/resources/feature/table_manager/vacuum/acon_vacuum_location_simple_split_scenario.json ================================================ { "function": "vacuum", "path": 
"file:///app/tests/lakehouse/out/feature/table_manager/dummy_table_bronze/data_simple_split_scenario", "vacuum_hours": 185 } ================================================ FILE: tests/resources/feature/table_manager/vacuum/acon_vacuum_table_simple_split_scenario.json ================================================ { "function": "vacuum", "table_or_view": "test_db.DummyTableBronzeSimpleSplitScenario", "vacuum_hours": 168 } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/acons/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/sales_new/" } ], "transform_specs": [ { "spec_id": "incremented_historical", "input_id": "sales_historical", "transformers": [ { "function": "with_literals", "args": { "literals": { "is_historical": true } } } ] }, { "spec_id": "incremented_new", "input_id": "sales_new", "transformers": [ { "function": "with_literals", "args": { "literals": { "is_historical": false } } } ] }, { "spec_id": "union_dataframes", "input_id": "incremented_historical", "transformers": [ { "function": "union", "args": {"union_with": ["incremented_new"]} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", 
"write_type": "append", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/batch/data" } ] } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/acons/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/sales_new/" } ], "transform_specs": [ { "spec_id": "incremented_historical", "input_id": "sales_historical", "transformers": [ { "function": "with_literals", "args": { "literals": { "is_historical": true } } } ] }, { "spec_id": "incremented_new", "input_id": "sales_new", "transformers": [ { "function": "with_literals", "args": { "literals": { "is_historical": false } } } ] }, { "spec_id": "union_dataframes", "input_id": "incremented_historical", "transformers": [ { "function": "union", "args": {"union_with": ["incremented_new"]} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": ["date"], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/streaming/checkpoint" }, "location": 
"file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/streaming/data" } ] } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/acons/streaming_batch.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/sales_new/" }, { "spec_id": "customers", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/customers/" } ], "transform_specs": [ { "spec_id": "incremented_historical", "input_id": "sales_historical", "transformers": [ { "function": "with_literals", "args": { "literals": { "is_historical": true } } } ] }, { "spec_id": "incremented_new", "input_id": "sales_new", "transformers": [ { "function": "with_literals", "args": { "literals": { "is_historical": false } } } ] }, { "spec_id": "union_dataframes", "input_id": "incremented_historical", "transformers": [ { "function": "union", "args": {"union_with": ["incremented_new"]} } ] }, { "spec_id": 
"join_with_customers", "input_id": "union_dataframes", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "join", "args": { "join_with": "customers", "join_type": "left outer", "join_condition": "a.customer = b.customer", "select_cols": ["a.*", "b.name as customer_name"] } }, {"function": "with_row_id"} ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "append", "data_format": "delta", "partitions": ["date"], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/streaming_batch/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/streaming_batch/data" } ] } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/acons/write_streaming_struct_data.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/struct_data_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/struct_data/" } ], "transform_specs": [ { "spec_id": "first_transform", "input_id": "sales_source", "transformers": [ { "function": "cast", "args": { "cols": { "date": "StringType", "amount": "StringType" } } }, { "function": "rename", "args": { "cols": { "date": "date2", "customer": "customer2" } } }, { "function": "with_expressions", "args": { "cols_and_exprs": { "constant": "'just a constant'", "length_customer2": "length(customer2)" } } }, { "function": "from_json", "args": { "input_col": "sample", "schema": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": 
"string", "nullable": true, "metadata": {} }, { "name": "field3", "type": "double", "nullable": true, "metadata": {} }, { "name": "field4", "type": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": "string", "nullable": true, "metadata": {} } ] }, "nullable": true, "metadata": {} } ] } } }, { "function": "to_json", "args": { "in_cols": [ "item", "amount" ], "out_col": "item_amount_json" } }, { "function": "flatten_schema", "args": { "max_level": 1 } } ] }, { "spec_id": "second_transform", "input_id": "first_transform", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "column_filter_exp", "args": { "exp": ["salesorder","item","article","sample_json_field1","sample_json_field4","item_amount_json"] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "second_transform", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/write_streaming_struct_data/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/write_streaming_struct_data/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/acons/write_streaming_struct_data_fail.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/schema/struct_data_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/chain_transformations/source/struct_data/" } ], "transform_specs": [ { "spec_id": "first_transform", "input_id": "sales_source", "force_streaming_foreach_batch_processing": true, 
"transformers": [ { "function": "cast", "args": { "cols": { "date": "StringType", "amount": "StringType" } } }, { "function": "rename", "args": { "cols": { "date": "date2", "customer": "customer2" } } }, { "function": "with_expressions", "args": { "cols_and_exprs": { "constant": "'just a constant'", "length_customer2": "length(customer2)" } } }, { "function": "from_json", "args": { "input_col": "sample", "schema": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": "string", "nullable": true, "metadata": {} }, { "name": "field3", "type": "double", "nullable": true, "metadata": {} }, { "name": "field4", "type": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": "string", "nullable": true, "metadata": {} } ] }, "nullable": true, "metadata": {} } ] } } }, { "function": "to_json", "args": { "in_cols": [ "item", "amount" ], "out_col": "item_amount_json" } }, { "function": "flatten_schema", "args": { "max_level": 1 } } ] }, { "spec_id": "second_transform", "input_id": "first_transform", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "column_filter_exp", "args": { "exp": ["salesorder","item","article","sample_json_field1","sample_json_field4","item_amount_json"] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "second_transform", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/write_streaming_struct_data_fail/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/chain_transformations/write_streaming_struct_data_fail/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/control/chain_control.csv 
================================================ salesorder|item|date|customer|article|amount|is_historical|customer_name|lhe_row_id 0|1|20140601|customer1|article1|1000|true|Anna|0 0|2|20140601|customer1|article2|2000|true|Anna|8589934592 0|3|20140601|customer1|article3|500|true|Anna|1 1|1|20150601|customer1|article1|1000|true|Anna|2 1|2|20150601|customer1|article2|2000|true|Anna|8589934593 1|3|20150601|customer1|article3|500|true|Anna|8589934594 2|1|20160215|customer2|article4|1000|false|John|3 2|2|20160215|customer2|article6|5000|false|John|8589934595 2|3|20160215|customer2|article1|3000|false|John|4 3|1|20160215|customer1|article5|20000|false|Anna|8589934596 6|1|20160218|customer3|article7|100|false|Sarah|5 6|2|20160218|customer3|article9|500|false|Sarah|6 6|3|20160218|customer3|article8|300|false|Sarah|8589934597 7|1|20160218|customer5|article7|2000|false||8589934598 ================================================ FILE: tests/resources/feature/transformations/chain_transformations/control/struct_data.json ================================================ [ { "salesorder": 1, "item": 1, "article": "article1", "amount": "1000", "sample": "{\"field1\": \"value1\", \"field2\": \"value2\", \"field4\": {\"field1\": \"value1\", \"field2\": \"value2\"}}", "date2": "20160601", "customer2": "customer1", "constant": "just a constant", "length_customer2": 9, "sample_json_field1": "value1", "sample_json_field2": "value2", "sample_json_field3": null, "sample_json_field4": {"field1": "value1", "field2": "value2"}, "item_amount_json": "{\"item\":1,\"amount\":\"1000\"}" }, { "salesorder": 1, "item": 2, "article": "article2", "amount": "2000", "sample": "{\"field1\": \"value3\", \"field2\": \"value4\", \"field4\": {\"field1\": \"1value\", \"field2\": \"2value\"}}", "date2": "20160601", "customer2": "customer1", "constant": "just a constant", "length_customer2": 9, "sample_json_field1": "value3", "sample_json_field2": "value4", "sample_json_field3": null, "sample_json_field4": 
{"field1": "1value", "field2": "2value"}, "item_amount_json": "{\"item\":2,\"amount\":\"2000\"}" }, { "salesorder": 1, "item": 3, "article": "article3", "amount": "500", "sample": "{\"field1\": \"value5\", \"field3\": 6.25, \"field4\": {\"field1\": \"1value1\", \"field2\": \"2value2\"}}", "date2": "20160601", "customer2": "customer1", "constant": "just a constant", "length_customer2": 9, "sample_json_field1": "value5", "sample_json_field2": null, "sample_json_field3": 6.25, "sample_json_field4": {"field1": "1value1", "field2": "2value2"}, "item_amount_json": "{\"item\":3,\"amount\":\"500\"}" } ] ================================================ FILE: tests/resources/feature/transformations/chain_transformations/schema/customer_schema.json ================================================ { "type": "struct", "fields": [ { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "name", "type": "string", "nullable": true, "metadata": {} }, { "name": "birth_date", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/schema/sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/schema/struct_data_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": 
"integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "sample", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/chain_transformations/source/customers.csv ================================================ customer|name|birth_date customer1|Anna|01012002 customer2|John|04051980 customer3|Sarah|02051940 ================================================ FILE: tests/resources/feature/transformations/chain_transformations/source/sales_historical.csv ================================================ salesorder|item|date|customer|article|amount 0|1|20140601|customer1|article1|1000 0|2|20140601|customer1|article2|2000 0|3|20140601|customer1|article3|500 1|1|20150601|customer1|article1|1000 1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/chain_transformations/source/sales_new.csv ================================================ salesorder|item|date|customer|article|amount 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 6|1|20160218|customer3|article7|100 6|2|20160218|customer3|article9|500 6|3|20160218|customer3|article8|300 7|1|20160218|customer5|article7|2000 ================================================ FILE: tests/resources/feature/transformations/chain_transformations/source/struct_data.csv ================================================ 
salesorder|item|date|customer|article|amount|sample 1|1|20160601|customer1|article1|1000|{"field1":"value1","field2":"value2","field4":{"field1":"value1","field2":"value2"}} 1|2|20160601|customer1|article2|2000|{"field1":"value3","field2":"value4","field4":{"field1":"1value","field2":"2value"}} 1|3|20160601|customer1|article3|500|{"field1":"value5","field3":6.25,"field4":{"field1":"1value1","field2":"2value2"}} ================================================ FILE: tests/resources/feature/transformations/column_creators/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_creators/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/column_creators/data" } ], "transform_specs": [ { "spec_id": "sales_source", "input_id": "sales_source", "transformers": [ { "function": "with_literals", "args": { "literals": { "dummy_string": "this is a string", "dummy_int": 100, "dummy_double": 10.2, "dummy_boolean": true } } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/column_creators/batch/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/column_creators/batch/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/column_creators/data/control/part-01.json ================================================ [ { "salesorder": 1, "item": 1, "article": "article1", "amount": 1000, "date": 20160601, "customer": "customer1", "dummy_string": "this is a string", "dummy_int": 100, "dummy_double": 10.2, "dummy_boolean": true }, { "salesorder": 1, "item": 2, 
"article": "article2", "amount": 2000, "date": 20160601, "customer": "customer1", "dummy_string": "this is a string", "dummy_int": 100, "dummy_double": 10.2, "dummy_boolean": true }, { "salesorder": 1, "item": 3, "article": "article3", "amount": 500, "date": 20160601, "customer": "customer1", "dummy_string": "this is a string", "dummy_int": 100, "dummy_double": 10.2, "dummy_boolean": true } ] ================================================ FILE: tests/resources/feature/transformations/column_creators/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/column_creators/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/column_creators/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_creators/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/column_creators/data" } ], "transform_specs": [ { "spec_id": "sales_source", 
"input_id": "sales_source", "transformers": [ { "function": "with_literals", "args": { "literals": { "dummy_string": "this is a string", "dummy_int": 100, "dummy_double": 10.2, "dummy_boolean": true } } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/column_creators/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/column_creators/streaming/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/column_reshapers/explode_arrays/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "json", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/data" } ], "transform_specs": [ { "spec_id": "sales_source", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "date": "date2", "customer": "customer2" } } }, { "function": "with_expressions", "args": { "cols_and_exprs": { "constant": "'just a constant'", "length_customer2": "length(customer2)" } } }, { "function": "to_json", "args": { "in_cols": [ "item", "amount" ], "out_col": "item_amount_json" } }, { "function": "explode_columns", "args": { "explode_arrays": true } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/explode_arrays/batch/data" } ] } ================================================ FILE: tests/resources/feature/transformations/column_reshapers/explode_arrays/data/control/part-01.csv 
================================================ salesorder|item|article|amount|manufacturing_countries|sub_articles|date2|customer2|constant|length_customer2|item_amount_json 1|1|article1|1000|Portugal|article101|20220101|customer1|just a constant|9|{"item":1,"amount":1000} 1|1|article1|1000|Portugal|article102|20220101|customer1|just a constant|9|{"item":1,"amount":1000} 1|1|article1|1000|Spain|article101|20220101|customer1|just a constant|9|{"item":1,"amount":1000} 1|1|article1|1000|Spain|article102|20220101|customer1|just a constant|9|{"item":1,"amount":1000} 1|2|article2|1000|Portugal|article201|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 1|2|article2|1000|Portugal|article202|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 1|2|article2|1000|Portugal|article203|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 1|2|article2|1000|Algeria|article201|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 1|2|article2|1000|Algeria|article202|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 1|2|article2|1000|Algeria|article203|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 1|2|article2|1000|Italy|article201|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 1|2|article2|1000|Italy|article202|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 1|2|article2|1000|Italy|article203|20220102|customer2|just a constant|9|{"item":2,"amount":1000} 2|1|article3|1200|Norway|article301|20220102|customer3|just a constant|9|{"item":1,"amount":1200} 2|1|article4|1500|Portugal|article401|20220103|customer2|just a constant|9|{"item":1,"amount":1500} 2|1|article4|1500|Portugal|article402|20220103|customer2|just a constant|9|{"item":1,"amount":1500} 2|1|article4|1500|Portugal|article403|20220103|customer2|just a constant|9|{"item":1,"amount":1500} 2|1|article4|1500|Malaysia|article401|20220103|customer2|just a constant|9|{"item":1,"amount":1500} 
2|1|article4|1500|Malaysia|article402|20220103|customer2|just a constant|9|{"item":1,"amount":1500} 2|1|article4|1500|Malaysia|article403|20220103|customer2|just a constant|9|{"item":1,"amount":1500} 2|1|article4|1500|Germany|article401|20220103|customer2|just a constant|9|{"item":1,"amount":1500} 2|1|article4|1500|Germany|article402|20220103|customer2|just a constant|9|{"item":1,"amount":1500} 2|1|article4|1500|Germany|article403|20220103|customer2|just a constant|9|{"item":1,"amount":1500} ================================================ FILE: tests/resources/feature/transformations/column_reshapers/explode_arrays/data/source/part-01.json ================================================ {"salesorder": 1,"item": 1,"date": 20220101,"customer":"customer1","article": "article1","amount": 1000,"manufacturing_countries": ["Portugal", "Spain"], "sub_articles": ["article101", "article102"]} {"salesorder": 1,"item": 2,"date": 20220102,"customer":"customer2","article": "article2","amount": 1000,"manufacturing_countries": ["Portugal", "Algeria", "Italy"], "sub_articles": ["article201", "article202", "article203"]} {"salesorder": 2,"item": 1,"date": 20220102,"customer":"customer3","article": "article3","amount": 1200,"manufacturing_countries": ["Norway"], "sub_articles": ["article301"]} {"salesorder": 2,"item": 1,"date": 20220103,"customer":"customer2","article": "article4","amount": 1500,"manufacturing_countries": ["Portugal", "Malaysia", "Germany"], "sub_articles": ["article401", "article402", "article403"]} ================================================ FILE: tests/resources/feature/transformations/column_reshapers/explode_arrays/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", 
"type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "manufacturing_countries", "nullable": true, "metadata": {}, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "name": "sub_articles", "nullable": true, "metadata": {}, "type": { "containsNull": true, "elementType": "string", "type": "array" } } ] } ================================================ FILE: tests/resources/feature/transformations/column_reshapers/explode_arrays/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "json", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/data" } ], "transform_specs": [ { "spec_id": "sales_source", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "date": "date2", "customer": "customer2" } } }, { "function": "with_expressions", "args": { "cols_and_exprs": { "constant": "'just a constant'", "length_customer2": "length(customer2)" } } }, { "function": "to_json", "args": { "in_cols": [ "item", "amount" ], "out_col": "item_amount_json" } }, { "function": "explode_columns", "args": { "explode_arrays": true } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/explode_arrays/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/explode_arrays/streaming/checkpoint" } } ] } ================================================ FILE: 
tests/resources/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "json", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/data" } ], "transform_specs": [ { "spec_id": "sales_source", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "date": "date2", "customer": "customer2" } } }, { "function": "with_expressions", "args": { "cols_and_exprs": { "constant": "'just a constant'", "length_customer2": "length(customer2)" } } }, { "function": "from_json", "args": { "input_col": "agg_fields", "schema": { "type": "struct", "fields": [ { "name": "field1", "nullable": true, "metadata": {}, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "name": "field2", "type": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": "string", "nullable": true, "metadata": {} } ] }, "nullable": true, "metadata": {} } ] } } }, { "function": "to_json", "args": { "in_cols": [ "item", "amount" ], "out_col": "item_amount_json" } }, { "function": "flatten_schema", "args": { "max_level": 2 } }, { "function": "explode_columns", "args": { "explode_arrays": true, "map_cols_to_explode": [ "sample" ] } }, { "function": "flatten_schema", "args": { "max_level": 2 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/batch/data" } ] } 
================================================ FILE: tests/resources/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/data/control/part-01.csv ================================================ salesorder|item|article|amount|sub_articles|sample_key|sample_value|agg_fields|date2|customer2|constant|length_customer2|agg_fields_json_field1|agg_fields_json_field2_field1|agg_fields_json_field2_field2|item_amount_json 1|1|article1|1000|article101|field1|value1|{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|Portugal|value1|value2|{"item":1,"amount":1000} 1|1|article1|1000|article101|field2|value2|{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|Portugal|value1|value2|{"item":1,"amount":1000} 1|1|article1|1000|article101|field1|value1|{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|Spain|value1|value2|{"item":1,"amount":1000} 1|1|article1|1000|article101|field2|value2|{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|Spain|value1|value2|{"item":1,"amount":1000} 1|1|article1|1000|article102|field1|value1|{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|Portugal|value1|value2|{"item":1,"amount":1000} 1|1|article1|1000|article102|field2|value2|{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|Portugal|value1|value2|{"item":1,"amount":1000} 1|1|article1|1000|article102|field1|value1|{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|Spain|value1|value2|{"item":1,"amount":1000} 
1|1|article1|1000|article102|field2|value2|{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|Spain|value1|value2|{"item":1,"amount":1000} 1|2|article2|1000|article201|field1|value3|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article201|field2|value4|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article201|field1|value5|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article201|field2|value6|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article202|field1|value3|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article202|field2|value4|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article202|field1|value5|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article202|field2|value6|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article203|field1|value3|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 
1|2|article2|1000|article203|field2|value4|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article203|field1|value5|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 1|2|article2|1000|article203|field2|value6|{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}|20220102|customer2|just a constant|9|Italy|value4|value5|{"item":2,"amount":1000} 2|1|article3|1200|article301|field1|value7|{"field1":["Malaysia","Germany"]}|20220102|customer3|just a constant|9|Malaysia|||{"item":1,"amount":1200} 2|1|article3|1200|article301|field2|value8|{"field1":["Malaysia","Germany"]}|20220102|customer3|just a constant|9|Malaysia|||{"item":1,"amount":1200} 2|1|article3|1200|article301|field1|value7|{"field1":["Malaysia","Germany"]}|20220102|customer3|just a constant|9|Germany|||{"item":1,"amount":1200} 2|1|article3|1200|article301|field2|value8|{"field1":["Malaysia","Germany"]}|20220102|customer3|just a constant|9|Germany|||{"item":1,"amount":1200} 2|1|article4|1500|article401|field1|value9|{"field2":{"field1":"value2","field2":"value3"}}|20220103|customer2|just a constant|9||value2|value3|{"item":1,"amount":1500} 2|1|article4|1500|article401|field2|value10|{"field2":{"field1":"value2","field2":"value3"}}|20220103|customer2|just a constant|9||value2|value3|{"item":1,"amount":1500} 2|1|article4|1500|article402|field1|value9|{"field2":{"field1":"value2","field2":"value3"}}|20220103|customer2|just a constant|9||value2|value3|{"item":1,"amount":1500} 2|1|article4|1500|article402|field2|value10|{"field2":{"field1":"value2","field2":"value3"}}|20220103|customer2|just a constant|9||value2|value3|{"item":1,"amount":1500} 2|1|article4|1500|article403|field1|value9|{"field2":{"field1":"value2","field2":"value3"}}|20220103|customer2|just a 
constant|9||value2|value3|{"item":1,"amount":1500} 2|1|article4|1500|article403|field2|value10|{"field2":{"field1":"value2","field2":"value3"}}|20220103|customer2|just a constant|9||value2|value3|{"item":1,"amount":1500} ================================================ FILE: tests/resources/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/data/source/part-01.json ================================================ {"salesorder":1,"item":1,"date":20220101,"customer":"customer1","article":"article1","amount":1000,"sub_articles":["article101","article102"],"sample":[{"field1":"value1","field2":"value2"}],"agg_fields":{"field1":["Portugal","Spain"],"field2":{"field1":"value1","field2":"value2"}}} {"salesorder":1,"item":2,"date":20220102,"customer":"customer2","article":"article2","amount":1000,"sub_articles":["article201","article202","article203"],"sample":[{"field1":"value3","field2":"value4"},{"field1":"value5","field2":"value6"}],"agg_fields":{"field1":["Italy"],"field2":{"field1":"value4","field2":"value5"}}} {"salesorder":2,"item":1,"date":20220102,"customer":"customer3","article":"article3","amount":1200,"sub_articles":["article301"],"sample":[{"field1":"value7","field2":"value8"}],"agg_fields":{"field1":["Malaysia","Germany"]}} {"salesorder":2,"item":1,"date":20220103,"customer":"customer2","article":"article4","amount":1500,"sub_articles":["article401","article402","article403"],"sample":[{"field1":"value9","field2":"value10"}],"agg_fields":{"field2":{"field1":"value2","field2":"value3"}}} ================================================ FILE: tests/resources/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, 
"metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "sub_articles", "nullable": true, "metadata": {}, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "name": "sample", "nullable": true, "metadata": {}, "type": { "containsNull": true, "elementType": { "keyType": "string", "type": "map", "valueContainsNull": true, "valueType": "string" }, "type": "array" } }, { "name": "agg_fields", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "json", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/data" } ], "transform_specs": [ { "spec_id": "sales_source", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "date": "date2", "customer": "customer2" } } }, { "function": "with_expressions", "args": { "cols_and_exprs": { "constant": "'just a constant'", "length_customer2": "length(customer2)" } } }, { "function": "from_json", "args": { "input_col": "agg_fields", "schema": { "type": "struct", "fields": [ { "name": "field1", "nullable": true, "metadata": {}, "type": { "containsNull": true, "elementType": "string", "type": "array" } }, { "name": "field2", "type": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": 
"string", "nullable": true, "metadata": {} } ] }, "nullable": true, "metadata": {} } ] } } }, { "function": "to_json", "args": { "in_cols": [ "item", "amount" ], "out_col": "item_amount_json" } }, { "function": "flatten_schema", "args": { "max_level": 2 } }, { "function": "explode_columns", "args": { "explode_arrays": true, "map_cols_to_explode": [ "sample" ] } }, { "function": "flatten_schema", "args": { "max_level": 2 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/streaming/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "json", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/data" } ], "transform_specs": [ { "spec_id": "sales_source", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "date": "date2", "customer": "customer2" } } }, { "function": "with_expressions", "args": { "cols_and_exprs": { "constant": "'just a constant'", "length_customer2": "length(customer2)" } } }, { "function": "from_json", "args": { "input_col": "sample", "schema": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": "string", "nullable": true, "metadata": {} }, { "name": "field3", "type": 
"double", "nullable": true, "metadata": {} }, { "name": "field4", "type": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": "string", "nullable": true, "metadata": {} } ] }, "nullable": true, "metadata": {} } ] } } }, { "function": "to_json", "args": { "in_cols": [ "item", "amount" ], "out_col": "item_amount_json" } }, { "function": "flatten_schema", "args": { "max_level": 2 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_schema/batch/data" } ] } ================================================ FILE: tests/resources/feature/transformations/column_reshapers/flatten_schema/data/control/part-01.csv ================================================ salesorder|item|article|amount|sample|date2|customer2|constant|length_customer2|sample_json_field1|sample_json_field2|sample_json_field3|sample_json_field4_field1|sample_json_field4_field2|item_amount_json 1|1|article1|1000|{"field1":"value1","field2":"value2","field4":{"field1":"value1","field2":"value2"}}|20220101|customer1|just a constant|9|value1|value2||value1|value2|{"item":1,"amount":1000} 1|2|article2|1000|{"field1":"value3","field2":"value4","field4":{"field1":"1value","field2":"2value"}}|20220102|customer2|just a constant|9|value3|value4||1value|2value|{"item":2,"amount":1000} 2|1|article3|1200|{"field1":"value5","field3":6.25,"field4":{"field1":"1value1","field2":"2value2"}}|20220102|customer3|just a constant|9|value5||6.25|1value1|2value2|{"item":1,"amount":1200} 2|1|article4|1500|{"field1":"value5","field3":6.25,"field4":{"field1":"1value1","field2":"2value2"}}|20220103|customer2|just a constant|9|value5||6.25|1value1|2value2|{"item":1,"amount":1500} ================================================ FILE: 
tests/resources/feature/transformations/column_reshapers/flatten_schema/data/source/part-01.json ================================================ {"salesorder":1,"item":1,"date":20220101,"customer":"customer1","article":"article1","amount":1000,"sample":{"field1":"value1","field2":"value2","field4":{"field1":"value1","field2":"value2"}}} {"salesorder":1,"item":2,"date":20220102,"customer":"customer2","article":"article2","amount":1000,"sample":{"field1":"value3","field2":"value4","field4":{"field1":"1value","field2":"2value"}}} {"salesorder":2,"item":1,"date":20220102,"customer":"customer3","article":"article3","amount":1200,"sample":{"field1":"value5","field3":6.25,"field4":{"field1":"1value1","field2":"2value2"}}} {"salesorder":2,"item":1,"date":20220103,"customer":"customer2","article":"article4","amount":1500,"sample":{"field1":"value5","field3":6.25,"field4":{"field1":"1value1","field2":"2value2"}}} ================================================ FILE: tests/resources/feature/transformations/column_reshapers/flatten_schema/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "sample", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/column_reshapers/flatten_schema/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "json", "schema_path": 
"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/data" } ], "transform_specs": [ { "spec_id": "sales_source", "input_id": "sales_source", "transformers": [ { "function": "rename", "args": { "cols": { "date": "date2", "customer": "customer2" } } }, { "function": "with_expressions", "args": { "cols_and_exprs": { "constant": "'just a constant'", "length_customer2": "length(customer2)" } } }, { "function": "from_json", "args": { "input_col": "sample", "schema": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": "string", "nullable": true, "metadata": {} }, { "name": "field3", "type": "double", "nullable": true, "metadata": {} }, { "name": "field4", "type": { "type": "struct", "fields": [ { "name": "field1", "type": "string", "nullable": true, "metadata": {} }, { "name": "field2", "type": "string", "nullable": true, "metadata": {} } ] }, "nullable": true, "metadata": {} } ] } } }, { "function": "to_json", "args": { "in_cols": [ "item", "amount" ], "out_col": "item_amount_json" } }, { "function": "flatten_schema", "args": { "max_level": 2 } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_schema/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_schema/streaming/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/data_maskers/data/control/drop_columns.csv ================================================ salesorder|item|date|amount 1|1|20160601|1000 1|2|20160601|2000 1|3|20160601|500 
================================================ FILE: tests/resources/feature/transformations/data_maskers/data/control/hash_masking.csv ================================================ salesorder|item|date|amount|customer|customer_hash|article|article_hash 1|1|20160601|-14577491|customer1|dea26157fa355301663174eac368538cff8939f36681d6712dedba439ab98b70|article1|36b3061d4fb72c32379a2ad0f05ace632371107ce414a1b3d51ef64247f53952 1|2|20160601|1268485177|customer1|dea26157fa355301663174eac368538cff8939f36681d6712dedba439ab98b70|article2|8e3ba57e23105c9aaceb58b2ad0f5de979199a7732a6ee3734404ca7745c6fef 1|3|20160601|-2108627946|customer1|dea26157fa355301663174eac368538cff8939f36681d6712dedba439ab98b70|article3|12717ebdf09ca4f2b2318796b6653e9b96989eda7726da4d94b73a3614476ae6 ================================================ FILE: tests/resources/feature/transformations/data_maskers/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/data_maskers/drop_columns.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/data_maskers/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/data_maskers/data" } ], "transform_specs": [ { "spec_id": "masked_data", "input_id": "sales_source", "transformers": [ { "function": "column_dropper", "args": { "cols": ["customer", "article"] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "masked_data", "write_type": "overwrite", "data_format": "delta", "location": 
"file:///app/tests/lakehouse/out/feature/transformations/data_maskers/drop_columns/data" } ] } ================================================ FILE: tests/resources/feature/transformations/data_maskers/drop_columns_control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/data_maskers/hash_masking.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/data_maskers/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/data_maskers/data" } ], "transform_specs": [ { "spec_id": "masked_data", "input_id": "sales_source", "transformers": [ { "function": "hash_masker", "args": { "cols": ["customer", "article"] } }, { "function": "hash_masker", "args": { "cols": ["amount"], "approach": "MURMUR3", "suffix": "" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "masked_data", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/data_maskers/hash_masking/data" } ] } ================================================ FILE: tests/resources/feature/transformations/data_maskers/hash_masking_control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", 
"nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "customer_hash", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "article_hash", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/data_maskers/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/date_transformers/control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "date", "nullable": true, "metadata": {} }, { "name": "order_date2", "type": "date", "nullable": true, "metadata": {} }, { "name": "order_date3", "type": "string", "nullable": true, "metadata": {} }, { "name": "ship_date", "type": "string", "nullable": true, "metadata": {} }, { "name": "ship_date2", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "order_date2_day", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date2_month", "type": "integer", "nullable": true, 
"metadata": {} }, { "name": "order_date2_week", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date2_quarter", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date2_year", "type": "integer", "nullable": true, "metadata": {} }, { "name": "ship_date2_day", "type": "integer", "nullable": true, "metadata": {} }, { "name": "ship_date2_month", "type": "integer", "nullable": true, "metadata": {} }, { "name": "ship_date2_week", "type": "integer", "nullable": true, "metadata": {} }, { "name": "ship_date2_quarter", "type": "integer", "nullable": true, "metadata": {} }, { "name": "ship_date2_year", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/date_transformers/data/control/part-01.csv ================================================ salesorder|order_date|order_date2|order_date3|ship_date|ship_date2|order_date2_day|order_date2_month|order_date2_week|order_date2_quarter|order_date2_year|ship_date2_day|ship_date2_month|ship_date2_week|ship_date2_quarter|ship_date2_year| 1|2016-06-01|2016-06-01|16-1-6|16-6-2|2016-02-06 23:40:43|1|6|22|2|2016|6|2|5|1|2016| 2|2016-07-03|2016-07-03|16-3-7|16-22-5|2016-05-22 22:12:54|3|7|26|3|2016|22|5|20|2|2016| 3|2017-01-02|2017-01-02|17-2-1|17-1-3|2017-03-01 07:43:11|2|1|1|1|2017|1|3|9|1|2017| ================================================ FILE: tests/resources/feature/transformations/date_transformers/data/source/part-01.csv ================================================ salesorder|order_date|order_date2|order_date3|ship_date|ship_date2 1|2016-06-01|01-06-2016|20160601|2016-06-02 23:40:43|2016-02-06T23:40:43.000Z 2|2016-07-03|03-07-2016|20160703|2016-22-05 22:12:54|2016-05-22T22:12:54.000Z 3|2017-01-02|02-01-2017|20170102|2017-01-03 07:43:11|2017-03-01T07:43:11.000Z ================================================ FILE: 
tests/resources/feature/transformations/date_transformers/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "order_date", "type": "date", "nullable": true, "metadata": {} }, { "name": "order_date2", "type": "string", "nullable": true, "metadata": {} }, { "name": "order_date3", "type": "string", "nullable": true, "metadata": {} }, { "name": "ship_date", "type": "string", "nullable": true, "metadata": {} }, { "name": "ship_date2", "type": "timestamp", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/date_transformers/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/date_transformers/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/date_transformers/data" } ], "transform_specs": [ { "spec_id": "sales_with_new_dates", "input_id": "sales_source", "transformers": [ { "function": "add_current_date", "args": { "output_col": "curr_date" } }, { "function": "convert_to_date", "args": { "cols": ["order_date2"], "source_format": "dd-MM-yyyy" } }, { "function": "convert_to_date", "args": { "cols": ["order_date3"], "source_format": "yyyyMMdd" } }, { "function": "convert_to_timestamp", "args": { "cols": ["ship_date"], "source_format": "yyyy-dd-MM HH:mm:ss" } }, { "function": "format_date", "args": { "cols": ["order_date3", "ship_date"], "target_format": "yy-d-M" } }, { "function": "get_date_hierarchy", "args": { "cols": ["order_date2", "ship_date2"] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_with_new_dates", "write_type": "append", 
"data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/date_transformers/streaming/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/date_transformers/streaming/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/batch.json ================================================ { "input_specs": [ { "spec_id": "orders_source", "read_type": "batch", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/drop_duplicate_rows/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/drop_duplicate_rows/data/part-01.csv" } ], "transform_specs": [ { "spec_id": "orders_duplicate_no_args", "input_id": "orders_source", "transformers": [ { "function": "drop_duplicate_rows" } ] }, { "spec_id": "orders_duplicate_empty", "input_id": "orders_source", "transformers": [ { "function": "drop_duplicate_rows", "args": { "cols": [] } } ] }, { "spec_id": "orders_duplicate", "input_id": "orders_source", "transformers": [ { "function": "drop_duplicate_rows", "args": { "cols": ["order_number","item_number"] } } ] } ], "output_specs": [ { "spec_id": "orders_duplicate_no_args_write", "input_id": "orders_duplicate_no_args", "write_type": "overwrite", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/batch/orders_duplicate_no_args/data" }, { "spec_id": "orders_duplicate_empty_write", "input_id": "orders_duplicate_empty", "write_type": "overwrite", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/batch/orders_duplicate_empty/data" }, { "spec_id": "orders_duplicate_write", "input_id": "orders_duplicate", "write_type": 
"overwrite", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/batch/columns/data" } ] } ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/data/control/batch_distinct.json ================================================ [ { "order_number": 1, "item_number": 1, "article_number": "article1", "amount": 10, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 2, "article_number": "article2", "amount": 20, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 2, "article_number": "article2", "amount": 22, "date": 20220102, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 3, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 4, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 2, "item_number": 1, "article_number": "article1", "amount": 3, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 2, "article_number": "article2", "amount": 300, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 3, "article_number": "article3", "amount": 200, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" } ] ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/data/control/batch_drop_duplicates.json ================================================ [ { 
"order_number": 1, "item_number": 1, "article_number": "article1", "amount": 10, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 2, "article_number": "article2", "amount": 20, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 3, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 4, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 2, "item_number": 1, "article_number": "article1", "amount": 3, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 2, "article_number": "article2", "amount": 300, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 3, "article_number": "article3", "amount": 200, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" } ] ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/data/control/streaming_distinct.json ================================================ [ { "order_number": 1, "item_number": 1, "article_number": "article1", "amount": 10, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 2, "article_number": "article2", "amount": 20, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 2, "article_number": "article2", "amount": 22, "date": 20220102, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 
3, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 4, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 2, "item_number": 1, "article_number": "article1", "amount": 3, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 2, "article_number": "article2", "amount": 300, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 3, "article_number": "article3", "amount": 200, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 3, "item_number": 1, "article_number": "article1", "amount": 10, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 3, "item_number": 2, "article_number": "article2", "amount": 15, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 3, "item_number": 2, "article_number": "article2", "amount": 220, "date": 20220103, "customer_number": "customer3", "country": "portugal", "city": "porto" }, { "order_number": 4, "item_number": 1, "article_number": "article3", "amount": 350, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 5, "item_number": 1, "article_number": "article1", "amount": 3, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 5, "item_number": 1, "article_number": "article2", "amount": 300, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 5, "item_number": 2, "article_number": "article4", "amount": 10, "date": 
20220102, "customer_number": "customer2", "country": "spain", "city": "madrid" } ] ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/data/control/streaming_drop_duplicates.json ================================================ [ { "order_number": 1, "item_number": 1, "article_number": "article1", "amount": 10, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 2, "article_number": "article2", "amount": 20, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 3, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 4, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 2, "item_number": 1, "article_number": "article1", "amount": 3, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 2, "article_number": "article2", "amount": 300, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 3, "article_number": "article3", "amount": 200, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 3, "item_number": 1, "article_number": "article1", "amount": 10, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 3, "item_number": 2, "article_number": "article2", "amount": 15, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 4, "item_number": 1, "article_number": "article3", "amount": 350, "date": 20220101, 
"customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 5, "item_number": 1, "article_number": "article1", "amount": 3, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 5, "item_number": 2, "article_number": "article4", "amount": 10, "date": 20220102, "customer_number": "customer2", "country": "spain", "city": "madrid" } ] ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/data/source/part-01.csv ================================================ order_number|item_number|date|customer_number|country|city|article_number|amount 1|1|20220101|customer1|portugal|porto|article1|10 1|2|20220101|customer1|portugal|porto|article2|20 1|2|20220102|customer1|portugal|porto|article2|22 1|3|20220101|customer1|portugal|porto|article3|120 1|3|20220101|customer1|portugal|porto|article3|120 1|4|20220101|customer1|portugal|porto|article3|120 1|4|20220101|customer1|portugal|porto|article3|120 2|1|20220102|customer2|germany|nuremberg|article1|3 2|2|20220102|customer2|germany|nuremberg|article2|300 2|3|20220102|customer2|germany|nuremberg|article3|200 2|3|20220102|customer2|germany|nuremberg|article3|200 ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/data/source/part-02.csv ================================================ order_number|item_number|date|customer_number|country|city|article_number|amount 3|1|20220101|customer1|portugal|porto|article1|10 3|2|20220101|customer1|portugal|porto|article2|15 3|2|20220103|customer3|portugal|porto|article2|220 4|1|20220101|customer1|portugal|porto|article3|350 4|1|20220101|customer1|portugal|porto|article3|350 5|1|20220102|customer2|germany|nuremberg|article1|3 5|1|20220102|customer2|germany|nuremberg|article2|300 5|2|20220102|customer2|spain|madrid|article4|10 5|2|20220102|customer2|spain|madrid|article4|10 
5|2|20220102|customer2|spain|madrid|article4|10 ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "order_number", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item_number", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "country", "type": "string", "nullable": true, "metadata": {} }, { "name": "city", "type": "string", "nullable": true, "metadata": {} }, { "name": "article_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/drop_duplicate_rows/streaming.json ================================================ { "input_specs": [ { "spec_id": "orders_source", "read_type": "streaming", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/drop_duplicate_rows/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/drop_duplicate_rows/data" } ], "transform_specs": [ { "spec_id": "orders_duplicate_no_args", "input_id": "orders_source", "transformers": [ { "function": "drop_duplicate_rows" } ] }, { "spec_id": "orders_duplicate_empty", "input_id": "orders_source", "transformers": [ { "function": "drop_duplicate_rows", "args": { "cols": [] } } ] }, { "spec_id": "orders_duplicate", "input_id": "orders_source", "transformers": [ { "function": "drop_duplicate_rows", "args": { "cols": ["order_number","item_number"] } } ] } ], "output_specs": [ { "spec_id": "orders_duplicate_no_args_write", 
"input_id": "orders_duplicate_no_args", "write_type": "append", "data_format": "delta", "partitions": ["date"], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/streaming/orders_duplicate_no_args/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/streaming/orders_duplicate_no_args/data" }, { "spec_id": "orders_duplicate_empty_write", "input_id": "orders_duplicate_empty", "write_type": "append", "data_format": "delta", "partitions": ["date"], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/streaming/orders_duplicate_empty/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/streaming/orders_duplicate_empty/data" }, { "spec_id": "orders_duplicate_write", "input_id": "orders_duplicate", "write_type": "append", "data_format": "delta", "partitions": ["date"], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/streaming/columns/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/drop_duplicate_rows/streaming/columns/data" } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/batch.json ================================================ { "input_specs": [ { "spec_id": "sales", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/joiners/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/sales" }, { "spec_id": "customers", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/joiners/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": 
"FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/customers" } ], "transform_specs": [ { "spec_id": "join_with_customers", "input_id": "sales", "transformers": [ { "function": "join", "args": { "join_with": "customers", "join_type": "left outer", "join_condition": "a.customer = b.customer", "select_cols": ["a.*", "b.name as customer_name"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "append", "db_table": "test_db.batch_join", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/transformations/joiners/batch/data" } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/control_scenario_1_and_2_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer_name", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/control_scenario_3_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": 
true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "name", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/customer_schema.json ================================================ { "type": "struct", "fields": [ { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "name", "type": "string", "nullable": true, "metadata": {} }, { "name": "birth_date", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/data/control/control_scenario_1_and_2.csv ================================================ salesorder|item|date|customer|article|amount|customer_name 1|1|20160601|customer1|article1|1000|Anna 1|2|20160601|customer1|article2|2000|Anna 1|3|20160601|customer1|article3|500|Anna 2|1|20170215|customer2|article4|1000|John 2|2|20170215|customer2|article6|5000|John 2|3|20170215|customer2|article1|3000|John 3|1|20170215|customer1|article5|20000|Anna 3|2|20170215|customer1|article2|12000|Anna 3|3|20170215|customer1|article4|9000|Anna 4|1|20170430|customer3|article3|8000|Sarah 4|2|20170430|customer3|article7|7000|Sarah 4|3|20170430|customer3|article1|3000|Sarah 4|4|20170430|customer3|article2|5000|Sarah ================================================ FILE: tests/resources/feature/transformations/joiners/data/control/control_scenario_3.csv ================================================ salesorder|item|date|customer|article|amount|name 1|1|20160601|customer1|article1|1000|Anna 1|2|20160601|customer1|article2|2000|Anna 1|3|20160601|customer1|article3|500|Anna 2|1|20170215|customer2|article4|1000|John 2|2|20170215|customer2|article6|5000|John 2|3|20170215|customer2|article1|3000|John 3|1|20170215|customer1|article5|20000|Anna 3|2|20170215|customer1|article2|12000|Anna 
3|3|20170215|customer1|article4|9000|Anna 4|1|20170430|customer3|article3|8000|Sarah 4|2|20170430|customer3|article7|7000|Sarah 4|3|20170430|customer3|article1|3000|Sarah 4|4|20170430|customer3|article2|5000|Sarah ================================================ FILE: tests/resources/feature/transformations/joiners/data/source/customer-part-01.csv ================================================ customer|name|birth_date customer1|Anna|01012002 customer2|John|04051980 customer3|Sarah|02051940 ================================================ FILE: tests/resources/feature/transformations/joiners/data/source/sales-part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/joiners/data/source/sales-part-02.csv ================================================ salesorder|item|date|customer|article|amount 2|1|20170215|customer2|article4|1000 2|2|20170215|customer2|article6|5000 2|3|20170215|customer2|article1|3000 3|1|20170215|customer1|article5|20000 3|2|20170215|customer1|article2|12000 3|3|20170215|customer1|article4|9000 4|1|20170430|customer3|article3|8000 4|2|20170430|customer3|article7|7000 4|3|20170430|customer3|article1|3000 4|4|20170430|customer3|article2|5000 ================================================ FILE: tests/resources/feature/transformations/joiners/sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, 
"metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/streaming.json ================================================ { "input_specs": [ { "spec_id": "sales", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/joiners/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/sales" }, { "spec_id": "customers", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/joiners/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/customers" } ], "transform_specs": [ { "spec_id": "join_with_customers", "input_id": "sales", "transformers": [ { "function": "join", "args": { "join_with": "customers", "join_type": "left outer", "join_condition": "a.customer = b.customer", "select_cols": ["a.*", "b.name as customer_name"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "append", "db_table": "test_db.streaming_join", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/joiners/streaming/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/joiners/streaming/data" } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/streaming_foreachBatch.json ================================================ { "input_specs": [ { "spec_id": "sales", "read_type": "streaming", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/transformations/joiners/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/sales" }, { "spec_id": "customers", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/joiners/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/customers" } ], "transform_specs": [ { "spec_id": "join_with_customers", "input_id": "sales", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "join", "args": { "join_with": "customers", "join_type": "left outer", "join_condition": "a.customer = b.customer", "select_cols": ["a.*", "b.name as customer_name"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "append", "db_table": "test_db.streaming_join_foreachBatch", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/joiners/streaming_foreachBatch/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/joiners/streaming_foreachBatch/data" } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/streaming_without_broadcast.json ================================================ { "input_specs": [ { "spec_id": "sales", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/joiners/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/sales" }, { "spec_id": "customers", "read_type": "batch", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/transformations/joiners/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/customers" } ], "transform_specs": [ { "spec_id": "join_with_customers", "input_id": "sales", "transformers": [ { "function": "join", "args": { "join_with": "customers", "join_type": "left outer", "join_condition": "a.customer = b.customer", "select_cols": ["a.*", "b.name as customer_name"], "broadcast_join": false } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "append", "db_table": "test_db.streaming_without_broadcast", "data_format": "delta", "partitions": [ "customer", "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/joiners/streaming_without_broadcast/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/joiners/streaming_without_broadcast/data" } ] } ================================================ FILE: tests/resources/feature/transformations/joiners/streaming_without_column_rename.json ================================================ { "input_specs": [ { "spec_id": "sales", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/joiners/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/sales" }, { "spec_id": "customers", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/joiners/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/joiners/data/customers" } ], "transform_specs": [ { "spec_id": "join_with_customers", "input_id": "sales", 
"transformers": [ { "function": "join", "args": { "join_with": "customers", "join_type": "left outer", "join_condition": "a.customer = b.customer", "select_cols": ["a.*", "b.name"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "append", "db_table": "test_db.streaming_join_without_column_rename", "data_format": "delta", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/joiners/streaming_without_column_rename/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/joiners/streaming_without_column_rename/data" } ] } ================================================ FILE: tests/resources/feature/transformations/multiple_transform/batch.json ================================================ { "input_specs": [ { "spec_id": "orders_source", "read_type": "batch", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/multiple_transform/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/multiple_transform/data" } ], "transform_specs": [ { "spec_id": "orders_customer_cols", "input_id": "orders_source", "transformers": [ { "function": "column_filter_exp", "args": { "exp": ["date", "country", "customer_number"] } } ] }, { "spec_id": "orders_kpi_cols", "input_id": "orders_source", "transformers": [ { "function": "column_filter_exp", "args": { "exp": ["date", "city", "amount"] } } ] } ], "output_specs": [ { "spec_id": "orders_bronze_customer_cols", "input_id": "orders_customer_cols", "write_type": "overwrite", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/transformations/multiple_transform/batch/orders_customer_cols/data", "options": { "checkpointLocation": 
"file:///app/tests/lakehouse/out/feature/transformations/multiple_transform/batch/orders_customer_cols/checkpoint" } }, { "spec_id": "orders_bronze_kpi_cols", "input_id": "orders_kpi_cols", "write_type": "overwrite", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/transformations/multiple_transform/batch/orders_kpi_cols/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/multiple_transform/batch/orders_kpi_cols/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/multiple_transform/data/control/part-01.json ================================================ [ { "order_number": 1, "item_number": 1, "article_number": "article1", "amount": 10, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 2, "article_number": "article2", "amount": 20, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 1, "item_number": 3, "article_number": "article3", "amount": 120, "date": 20220101, "customer_number": "customer1", "country": "portugal", "city": "porto" }, { "order_number": 2, "item_number": 1, "article_number": "article1", "amount": 3, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 2, "article_number": "article2", "amount": 300, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" }, { "order_number": 2, "item_number": 3, "article_number": "article3", "amount": 200, "date": 20220102, "customer_number": "customer2", "country": "germany", "city": "nuremberg" } ] ================================================ FILE: tests/resources/feature/transformations/multiple_transform/data/source/part-01.csv ================================================ 
order_number|item_number|date|customer_number|country|city|article_number|amount 1|1|20220101|customer1|portugal|porto|article1|10 1|2|20220101|customer1|portugal|porto|article2|20 1|3|20220101|customer1|portugal|porto|article3|120 2|1|20220102|customer2|germany|nuremberg|article1|3 2|2|20220102|customer2|germany|nuremberg|article2|300 2|3|20220102|customer2|germany|nuremberg|article3|200 ================================================ FILE: tests/resources/feature/transformations/multiple_transform/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "order_number", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item_number", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "country", "type": "string", "nullable": true, "metadata": {} }, { "name": "city", "type": "string", "nullable": true, "metadata": {} }, { "name": "article_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/null_handlers/control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": false, "metadata": {} }, { "name": "customer", "type": "string", "nullable": false, "metadata": {} }, { "name": "amount", "type": "float", "nullable": false, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/null_handlers/data/control/replace_nulls.csv ================================================ salesorder|customer|amount 1|customer1|-999 -999|customer2|200.5 3|UNKNOWN|100.0 ================================================ FILE: 
tests/resources/feature/transformations/null_handlers/data/control/replace_nulls_col_subset.csv ================================================ salesorder|customer|amount 1|customer1|-999 |customer2|200.5 3||100.0 ================================================ FILE: tests/resources/feature/transformations/null_handlers/data/source/part-01.csv ================================================ salesorder|customer|amount 1|customer1| |customer2|200.50 3||100.00 ================================================ FILE: tests/resources/feature/transformations/null_handlers/replace_nulls.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/null_handlers/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/null_handlers/data" } ], "transform_specs": [ { "spec_id": "sales_without_nulls", "input_id": "sales_source", "transformers": [ { "function": "replace_nulls" } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_without_nulls", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/null_handlers/replace_nulls/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/null_handlers/replace_nulls/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/null_handlers/replace_nulls_col_subset.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "streaming", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/null_handlers/source_schema.json", "location": 
"file:///app/tests/lakehouse/in/feature/transformations/null_handlers/data" } ], "transform_specs": [ { "spec_id": "sales_without_nulls", "input_id": "sales_source", "transformers": [ { "function": "replace_nulls", "args": { "subset_cols": ["amount"] } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "sales_without_nulls", "write_type": "append", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/null_handlers/replace_nulls_col_subset/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/null_handlers/replace_nulls_col_subset/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/transformations/null_handlers/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "float", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/optimizers/data/source/part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/regex_transformers/with_regex_value/batch.json ================================================ { "input_specs": [ { "spec_id": "sales_source", "read_type": "batch", "data_format": "csv", "with_filepath": true, "options": { "mode": "FAILFAST", "header": true, "delimiter": "|", "inferSchema": true }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/regex_transformers/with_regex_value/source_schema.json", "location": 
"file:///app/tests/lakehouse/in/feature/transformations/regex_transformers/with_regex_value/data" } ], "transform_specs": [ { "spec_id": "with_extraction_date", "input_id": "sales_source", "transformers": [ { "function": "with_regex_value", "args": { "input_col": "lhe_extraction_filepath", "output_col": "extraction_date", "drop_input_col": true, "regex": ".*WE_SO_SCL_(\\d+).csv" } } ] } ], "output_specs": [ { "spec_id": "sales_bronze", "input_id": "with_extraction_date", "write_type": "overwrite", "data_format": "delta", "location": "file:///app/tests/lakehouse/out/feature/transformations/regex_transformers/with_regex_value/data" } ] } ================================================ FILE: tests/resources/feature/transformations/regex_transformers/with_regex_value/control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "extraction_date", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/regex_transformers/with_regex_value/data/control/part-01.csv ================================================ salesorder|item|date|customer|article|amount|extraction_date 1|1|20160601|customer1|article1|1000|202108111400000029 1|2|20160601|customer1|article2|2000|202108111400000029 1|3|20160601|customer1|article3|500|202108111400000029 ================================================ FILE: 
tests/resources/feature/transformations/regex_transformers/with_regex_value/data/source/WE_SO_SCL_202108111400000029.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20160601|customer1|article1|1000 1|2|20160601|customer1|article2|2000 1|3|20160601|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/regex_transformers/with_regex_value/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/unions/batch_union.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/sales-historical-part-01.csv" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/sales-new-part-01.csv" } ], "transform_specs": [ { "spec_id": "union_dataframes", 
"input_id": "sales_historical", "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/batch_union/data" } ] } ================================================ FILE: tests/resources/feature/transformations/unions/batch_unionByName.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/sales-historical-part-01.csv" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/sales-new-part-01.csv" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union_by_name", "args": { "union_with": ["sales_new"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/batch_unionByName/data" } ] } ================================================ FILE: tests/resources/feature/transformations/unions/batch_unionByName_diff_schema.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", 
"data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/sales-historical-part-01.csv" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/sales-new-part-01.csv" }, { "spec_id": "sales_shipment", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_shipment_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_shipment/sales-shipment-part-01.csv" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union_by_name", "args": { "union_with": ["sales_new", "sales_shipment"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/batch_unionByName_diff_schema/data" } ] } ================================================ FILE: tests/resources/feature/transformations/unions/batch_unionByName_diff_schema_error.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, 
"location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/sales-historical-part-01.csv" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/sales-new-part-01.csv" }, { "spec_id": "sales_shipment", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_shipment_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_shipment/sales-shipment-part-01.csv" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union_by_name", "args": { "union_with": ["sales_new", "sales_shipment"], "allow_missing_columns": false } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/batch_unionByName_diff_schema_error/data" } ] } ================================================ FILE: tests/resources/feature/transformations/unions/batch_union_diff_schema.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/sales-historical-part-01.csv" }, { "spec_id": "sales_new", 
"read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/sales-new-part-01.csv" }, { "spec_id": "sales_shipment", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_shipment_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_shipment/sales-shipment-part-01.csv" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union", "args": { "union_with": ["sales_new", "sales_shipment"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/batch_union_diff_schema/data" } ] } ================================================ FILE: tests/resources/feature/transformations/unions/data/control/control_sales.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20150601|customer1|article1|1000 1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 ================================================ FILE: tests/resources/feature/transformations/unions/data/control/control_sales_shipment.csv ================================================ salesorder|item|date|customer|article|amount|ship_date 1|1|20150601|customer1|article1|1000| 1|2|20150601|customer1|article2|2000| 
1|3|20150601|customer1|article3|500| 2|1|20160215|customer2|article4|1000| 2|2|20160215|customer2|article6|5000| 2|3|20160215|customer2|article1|3000| 3|1|20160215|customer1|article5|20000| 4|1|20170215|customer2|article4|1000|20170216 4|2|20170215|customer2|article6|5000|20170216 5|1|20170215|customer1|article5|20000|20170216 5|3|20170215|customer2|article1|3000|20170216 ================================================ FILE: tests/resources/feature/transformations/unions/data/control/control_sales_shipment_streaming.csv ================================================ salesorder|item|date|customer|article|amount|ship_date 0|1|20140601|customer1|article1|1000| 0|2|20140601|customer1|article2|2000| 0|3|20140601|customer1|article3|500| 1|1|20150601|customer1|article1|1000| 1|2|20150601|customer1|article2|2000| 1|3|20150601|customer1|article3|500| 2|1|20160215|customer2|article4|1000| 2|2|20160215|customer2|article6|5000| 2|3|20160215|customer2|article1|3000| 3|1|20160215|customer1|article5|20000| 4|1|20170215|customer2|article4|1000|20170216 4|2|20170215|customer2|article6|5000|20170216 5|1|20170215|customer1|article5|20000|20170216 5|3|20170215|customer2|article1|3000|20170216 6|1|20160218|customer3|article7|100| 6|2|20160218|customer3|article9|500| 6|3|20160218|customer3|article8|300| 7|1|20160218|customer5|article7|2000| 8|1|20190215|customer2|article4|1000|20190216 8|2|20190215|customer2|article6|5000|20190216 9|3|20190215|customer2|article1|3000|20190216 9|1|20190215|customer1|article5|20000|20190216 ================================================ FILE: tests/resources/feature/transformations/unions/data/control/control_sales_shipment_streaming_foreachBatch.csv ================================================ salesorder|item|date|customer|article|amount|ship_date 0|1|20140601|customer1|article1|1000| 0|2|20140601|customer1|article2|2000| 0|3|20140601|customer1|article3|500| 1|1|20150601|customer1|article1|1000| 1|2|20150601|customer1|article2|2000| 
1|3|20150601|customer1|article3|500| 2|1|20160215|customer2|article4|1000| 2|2|20160215|customer2|article6|5000| 2|3|20160215|customer2|article1|3000| 3|1|20160215|customer1|article5|20000| 4|1|20170215|customer2|article4|1000|20170216 4|2|20170215|customer2|article6|5000|20170216 5|1|20170215|customer1|article5|20000|20170216 5|3|20170215|customer2|article1|3000|20170216 6|1|20160218|customer3|article7|100| 6|2|20160218|customer3|article9|500| 6|3|20160218|customer3|article8|300| 7|1|20160218|customer5|article7|2000| 8|1|20190215|customer2|article4|1000|20190216 8|2|20190215|customer2|article6|5000|20190216 9|3|20190215|customer2|article1|3000|20190216 9|1|20190215|customer1|article5|20000|20190216 ================================================ FILE: tests/resources/feature/transformations/unions/data/control/control_sales_streaming.csv ================================================ salesorder|item|date|customer|article|amount 0|1|20140601|customer1|article1|1000 0|2|20140601|customer1|article2|2000 0|3|20140601|customer1|article3|500 1|1|20150601|customer1|article1|1000 1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 6|1|20160218|customer3|article7|100 6|2|20160218|customer3|article9|500 6|3|20160218|customer3|article8|300 7|1|20160218|customer5|article7|2000 ================================================ FILE: tests/resources/feature/transformations/unions/data/control/control_sales_streaming_foreachBatch.csv ================================================ salesorder|item|date|customer|article|amount 0|1|20140601|customer1|article1|1000 0|2|20140601|customer1|article2|2000 0|3|20140601|customer1|article3|500 1|1|20150601|customer1|article1|1000 1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 2|1|20160215|customer2|article4|1000 
2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 6|1|20160218|customer3|article7|100 6|2|20160218|customer3|article9|500 6|3|20160218|customer3|article8|300 7|1|20160218|customer5|article7|2000 ================================================ FILE: tests/resources/feature/transformations/unions/data/source/sales-historical-part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20150601|customer1|article1|1000 1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/unions/data/source/sales-historical-part-02.csv ================================================ salesorder|item|date|customer|article|amount 0|1|20140601|customer1|article1|1000 0|2|20140601|customer1|article2|2000 0|3|20140601|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/unions/data/source/sales-new-part-01.csv ================================================ salesorder|item|date|customer|article|amount 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 ================================================ FILE: tests/resources/feature/transformations/unions/data/source/sales-new-part-02.csv ================================================ salesorder|item|date|customer|article|amount 6|1|20160218|customer3|article7|100 6|2|20160218|customer3|article9|500 6|3|20160218|customer3|article8|300 7|1|20160218|customer5|article7|2000 ================================================ FILE: tests/resources/feature/transformations/unions/data/source/sales-shipment-part-01.csv ================================================ salesorder|item|date|customer|article|amount|ship_date 4|1|20170215|customer2|article4|1000|20170216 
4|2|20170215|customer2|article6|5000|20170216 5|3|20170215|customer2|article1|3000|20170216 5|1|20170215|customer1|article5|20000|20170216 ================================================ FILE: tests/resources/feature/transformations/unions/data/source/sales-shipment-part-02.csv ================================================ salesorder|item|date|customer|article|amount|ship_date 8|1|20190215|customer2|article4|1000|20190216 8|2|20190215|customer2|article6|5000|20190216 9|3|20190215|customer2|article1|3000|20190216 9|1|20190215|customer1|article5|20000|20190216 ================================================ FILE: tests/resources/feature/transformations/unions/sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/unions/sales_shipment_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "ship_date", "type": "integer", "nullable": true, "metadata": {} } ] } 
================================================ FILE: tests/resources/feature/transformations/unions/streaming_union.json ================================================ { "input_specs": [ { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/" }, { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_new", "transformers": [ { "function": "union", "args": { "union_with": ["sales_historical"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/unions/streaming_union/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/streaming_union/data" } ] } ================================================ FILE: tests/resources/feature/transformations/unions/streaming_unionByName_diff_schema.json ================================================ { "input_specs": [ { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/" }, { 
"spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/" }, { "spec_id": "sales_shipment", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_shipment_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_shipment/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_new", "transformers": [ { "function": "union_by_name", "args": { "union_with": ["sales_historical", "sales_shipment"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/unions/streaming_unionByName_diff_schema/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/streaming_unionByName_diff_schema/data" } ] } ================================================ FILE: tests/resources/feature/transformations/unions/streaming_unionByName_diff_schema_foreachBatch.json ================================================ { "input_specs": [ { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/" }, { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/" }, { "spec_id": "sales_shipment", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_shipment_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_shipment/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_new", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "union_by_name", "args": { "union_with": ["sales_historical", "sales_shipment"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/unions/streaming_unionByName_diff_schema_foreachBatch/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/streaming_unionByName_diff_schema_foreachBatch/data" } ] } ================================================ FILE: tests/resources/feature/transformations/unions/streaming_union_foreachBatch.json ================================================ { "input_specs": [ { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_new/" }, { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/transformations/unions/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/unions/data/sales/sales_historical/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_new", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "union", "args": { "union_with": ["sales_historical"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/unions/streaming_union_foreachBatch/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/unions/streaming_union_foreachBatch/data" } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates/data/control/streaming_drop_duplicates.csv ================================================ order_number|item_number|date|customer_number|country|city|article_number|amount 1|2|2017-05-10 01:01:01|customer1|portugal|porto|article2|20 2|1|2017-05-10 01:01:01|customer2|germany|nuremberg|article1|3 2|2|2017-05-10 01:01:01|customer2|germany|nuremberg|article2|300 3|1|2017-05-12 01:01:01|customer1|portugal|porto|article1|10 3|2|2017-05-12 01:01:01|customer1|portugal|porto|article2|15 3|2|2017-05-12 01:01:01|customer3|portugal|porto|article2|220 1|1|2017-05-10 01:01:01|customer1|portugal|porto|article1|10 1|2|2017-05-10 01:01:01|customer1|portugal|porto|article2|22 1|3|2017-05-10 01:01:01|customer1|portugal|porto|article3|120 1|4|2017-05-10 01:01:01|customer1|portugal|porto|article3|120 2|3|2017-05-10 01:01:01|customer2|germany|nuremberg|article3|200 4|1|2017-05-12 01:01:01|customer1|portugal|porto|article3|350 5|1|2017-05-12 
01:01:01|customer2|germany|nuremberg|article1|3 5|1|2017-05-12 01:01:01|customer2|germany|nuremberg|article2|300 5|2|2017-05-12 01:01:01|customer2|spain|madrid|article4|10 5|2|2017-05-10 10:01:12|customer2|spain|madrid|article4|10 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates/data/source/part-01.csv ================================================ order_number|item_number|date|customer_number|country|city|article_number|amount 1|1|2017-05-10 01:01:01.000|customer1|portugal|porto|article1|10 1|2|2017-05-10 01:01:01.000|customer1|portugal|porto|article2|20 1|2|2017-05-10 01:01:01.000|customer1|portugal|porto|article2|22 1|3|2017-05-10 01:01:01.000|customer1|portugal|porto|article3|120 1|3|2017-05-10 01:01:01.000|customer1|portugal|porto|article3|120 1|4|2017-05-10 01:01:01.000|customer1|portugal|porto|article3|120 1|4|2017-05-10 01:01:01.000|customer1|portugal|porto|article3|120 2|1|2017-05-10 01:01:01.000|customer2|germany|nuremberg|article1|3 2|2|2017-05-10 01:01:01.000|customer2|germany|nuremberg|article2|300 2|3|2017-05-10 01:01:01.000|customer2|germany|nuremberg|article3|200 2|3|2017-05-10 01:01:01.000|customer2|germany|nuremberg|article3|200 5|2|2017-05-10 10:01:12.000|customer2|spain|madrid|article4|10 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates/data/source/part-02.csv ================================================ order_number|orders_duplicate_no_args|date|customer_number|country|city|article_number|amount 3|1|2017-05-12 01:01:01.000|customer1|portugal|porto|article1|10 3|2|2017-05-12 01:01:01.000|customer1|portugal|porto|article2|15 3|2|2017-05-12 01:01:01.000|customer3|portugal|porto|article2|220 4|1|2017-05-12 01:01:01.000|customer1|portugal|porto|article3|350 4|1|2017-05-12 01:01:01.000|customer1|portugal|porto|article3|350 5|1|2017-05-12 
01:01:01.000|customer2|germany|nuremberg|article1|3 5|1|2017-05-12 01:01:01.000|customer2|germany|nuremberg|article2|300 5|2|2017-05-12 01:01:01.000|customer2|spain|madrid|article4|10 5|2|2017-05-12 01:01:01.000|customer2|spain|madrid|article4|10 5|2|2017-05-06 10:01:12.000|customer2|spain|madrid|article4|10 5|2|2017-05-04 10:01:12.000|customer2|spain|madrid|article4|1000 1|1|2017-05-10 01:01:01.000|customer1|portugal|porto|article1|10 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "order_number", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item_number", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "country", "type": "string", "nullable": true, "metadata": {} }, { "name": "city", "type": "string", "nullable": true, "metadata": {} }, { "name": "article_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates/streaming_drop_duplicates.json ================================================ { "input_specs": [ { "spec_id": "orders_source", "read_type": "streaming", "data_format": "csv", "options": { "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_drop_duplicates/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_drop_duplicates/data" } ], "transform_specs": [ { "spec_id": "orders_duplicate_no_args", "input_id": 
"orders_source", "transformers": [ { "function": "drop_duplicate_rows", "args": { "watermarker": {"col": "date", "watermarking_time":"2 days"} } } ] } ], "dq_specs": [ { "spec_id": "dq_validator", "input_id": "orders_duplicate_no_args", "dq_type": "validator", "store_backend": "file_system", "local_fs_root_dir": "/app/tests/lakehouse/out/feature/transformations/watermarker/streaming_drop_duplicates/dq", "result_sink_db_table": "test_db.validator_full_overwrite", "result_sink_explode": true, "result_sink_extra_columns": ["validation_results.result.*"], "source": "orders_source", "dq_functions": [ { "function": "expect_column_to_exist", "args": { "column": "date" } }, { "function": "expect_table_row_count_to_be_between", "args": { "min_value": 0, "max_value": 20 } } ] } ], "output_specs": [ { "spec_id": "orders_duplicate_no_args_write", "input_id": "orders_duplicate_no_args", "write_type": "append", "data_format": "delta", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_drop_duplicates/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_drop_duplicates/data" } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/data/control/streaming_drop_duplicates_overall_watermark.csv ================================================ order_number|item_number|date|customer_number|country|city|article_number|amount 1|1|2017-05-10 01:01:01|customer1|portugal|porto|article1|10 1|2|2017-05-10 01:01:01|customer1|portugal|porto|article2|22 1|4|2017-05-10 01:01:01|customer1|portugal|porto|article3|120 2|1|2017-05-10 01:01:01|customer2|germany|nuremberg|article1|3 2|2|2017-05-10 01:01:01|customer2|germany|nuremberg|article2|300 2|3|2017-05-10 01:01:01|customer2|germany|nuremberg|article3|200 3|1|2017-05-12 01:01:01|customer1|portugal|porto|article1|10 3|2|2017-05-12 
01:01:01|customer1|portugal|porto|article2|15 3|2|2017-05-12 01:01:01|customer3|portugal|porto|article2|220 4|1|2017-05-12 01:01:01|customer1|portugal|porto|article3|350 5|1|2017-05-12 01:01:01|customer2|germany|nuremberg|article1|3 1|3|2017-05-10 01:01:01|customer1|portugal|porto|article3|120 5|2|2017-05-10 10:01:12|customer2|spain|madrid|article4|10 5|2|2017-05-12 01:01:03|customer2|spain|madrid|article4|10 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/data/source/part-01.csv ================================================ order_number|item_number|date|customer_number|country|city|article_number|amount 1|1|2017-05-10 01:01:01.000|customer1|portugal|porto|article1|10 1|2|2017-05-10 01:01:01.000|customer1|portugal|porto|article2|20 1|2|2017-05-10 01:01:01.000|customer1|portugal|porto|article2|22 1|3|2017-05-10 01:01:01.000|customer1|portugal|porto|article3|120 1|3|2017-05-10 01:01:01.000|customer1|portugal|porto|article3|120 1|4|2017-05-10 01:01:01.000|customer1|portugal|porto|article3|120 1|4|2017-05-10 01:01:01.000|customer1|portugal|porto|article3|120 2|1|2017-05-10 01:01:01.000|customer2|germany|nuremberg|article1|3 2|2|2017-05-10 01:01:01.000|customer2|germany|nuremberg|article2|300 2|3|2017-05-10 01:01:01.000|customer2|germany|nuremberg|article3|200 2|3|2017-05-10 01:01:01.000|customer2|germany|nuremberg|article3|200 5|2|2017-05-10 10:01:12.000|customer2|spain|madrid|article4|10 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/data/source/part-02.csv ================================================ order_number|orders_duplicate_no_args|date|customer_number|country|city|article_number|amount 3|1|2017-05-12 01:01:01.000|customer1|portugal|porto|article1|10 3|2|2017-05-12 01:01:01.000|customer1|portugal|porto|article2|15 3|2|2017-05-12 
01:01:01.000|customer3|portugal|porto|article2|220 4|1|2017-05-12 01:01:01.000|customer1|portugal|porto|article3|350 4|1|2017-05-12 01:01:01.000|customer1|portugal|porto|article3|350 5|1|2017-05-12 01:01:01.000|customer2|germany|nuremberg|article1|3 5|1|2017-05-12 01:01:01.000|customer2|germany|nuremberg|article2|300 5|2|2017-05-12 01:01:01.000|customer2|spain|madrid|article4|10 5|2|2017-05-12 01:01:01.000|customer2|spain|madrid|article4|10 5|2|2017-05-12 01:01:02.000|customer2|spain|madrid|article4|10 5|2|2017-05-12 01:01:03.000|customer2|spain|madrid|article4|10 5|2|2017-05-06 10:01:12.000|customer23|spain|madrid|article4|10 5|2|2017-05-04 10:01:12.000|customer22|spain|madrid|article4|1000 1|1|2017-05-10 01:01:01.000|customer1|portugal|porto|article1|10 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/source_schema.json ================================================ { "type": "struct", "fields": [ { "name": "order_number", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item_number", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "country", "type": "string", "nullable": true, "metadata": {} }, { "name": "city", "type": "string", "nullable": true, "metadata": {} }, { "name": "article_number", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/streaming_drop_duplicates_overall_watermark.json ================================================ { "input_specs": [ { "spec_id": "orders_source", "read_type": "streaming", "data_format": "csv", "options": 
{ "mode": "FAILFAST", "header": true, "delimiter": "|" }, "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/source_schema.json", "location": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/data" } ], "transform_specs": [ { "spec_id": "watermarking_orders", "input_id": "orders_source", "transformers": [ { "function": "with_watermark", "args" : {"watermarker_column": "date", "watermarker_time":"2 days"} }, {"function":"drop_duplicate_rows"}, { "function": "group_and_rank", "args": { "group_key": [ "order_number", "item_number", "customer_number", "city" ], "ranking_key": [ "date" ] } } ] } ], "output_specs": [ { "spec_id": "orders_duplicate_no_args_write", "input_id": "watermarking_orders", "write_type": "append", "data_format": "delta", "partitions": ["date"], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/data" } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_inner_join/customer_schema.json ================================================ { "type": "struct", "fields": [ { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "name", "type": "string", "nullable": true, "metadata": {} }, { "name": "birth_date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_inner_join/data/control/streaming_inner_join.csv ================================================ 
salesorder|item|date|customer|article|amount|customer_name 3|1|2017-05-12 01:01:01|customer1|article5|20000|Anna 3|2|2017-05-12 01:01:01|customer1|article2|12000|Anna 3|3|2017-05-12 01:01:01|customer1|article4|9000|Anna 4|1|2017-05-12 01:01:01|customer3|article3|8000|Sarah 4|2|2017-05-12 01:01:01|customer3|article7|7000|Sarah 4|3|2017-05-12 01:01:01|customer3|article1|3000|Sarah 4|4|2017-05-12 01:01:01|customer3|article2|5000|Sarah 2|1|2017-05-12 01:01:01|customer2|article4|1000|John 2|2|2017-05-12 01:01:01|customer2|article6|5000|John 2|3|2017-05-12 01:01:01|customer2|article1|3000|John 1|1|2017-05-10 01:01:01|customer1|article1|1000|Anna 1|2|2017-05-10 01:01:01|customer1|article2|2000|Anna 1|3|2017-05-10 01:01:01|customer1|article3|500|Anna ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_inner_join/data/source/customer-part-01.csv ================================================ customer|name|birth_date|date customer1|Anna|01012002|2017-05-10 01:01:01.000 customer2|John|04051980|2017-05-10 01:01:01.000 customer3|Sarah|02051940|2017-05-10 01:01:01.000 customer7|George|02051940|2017-05-10 01:01:01.000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_inner_join/data/source/sales-part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|2017-05-10 01:01:01.000|customer1|article1|1000 1|2|2017-05-10 01:01:01.000|customer1|article2|2000 1|3|2017-05-10 01:01:01.000|customer1|article3|500 1|3|2017-05-10 01:01:01.000|customer10|article3|500 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_inner_join/data/source/sales-part-02.csv ================================================ salesorder|item|date|customer|article|amount 2|1|2017-05-12 01:01:01.000|customer2|article4|1000 2|2|2017-05-12 
01:01:01.000|customer2|article6|5000 2|3|2017-05-12 01:01:01.000|customer2|article1|3000 3|1|2017-05-12 01:01:01.000|customer1|article5|20000 3|2|2017-05-12 01:01:01.000|customer1|article2|12000 3|3|2017-05-12 01:01:01.000|customer1|article4|9000 4|1|2017-05-12 01:01:01.000|customer3|article3|8000 4|2|2017-05-12 01:01:01.000|customer3|article7|7000 4|3|2017-05-12 01:01:01.000|customer3|article1|3000 4|4|2017-05-12 01:01:01.000|customer3|article2|5000 4|4|2017-05-07 01:01:01.000|customer3|article2|5000 1|3|2017-05-14 01:01:01.000|customer100|article3|500 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_inner_join/sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_inner_join/streaming_inner_join.json ================================================ { "input_specs": [ { "spec_id": "sales", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_inner_join/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_inner_join/data/sales" }, { "spec_id": "customers", "read_type": "streaming", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_inner_join/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_inner_join/data/customers" } ], "transform_specs": [ { "spec_id": "join_with_customers", "input_id": "sales", "transformers": [ { "function": "join", "args": { "join_with": "customers", "join_type": "inner", "join_condition": "a.customer = b.customer and a.date between b.date and b.date + interval 4 days", "select_cols": ["a.*", "b.name as customer_name"], "watermarker": {"a":{"col": "date", "watermarking_time": "2 days"}, "b": {"col": "date", "watermarking_time": "2 days"}} } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "append", "db_table": "test_db.streaming_inner_join", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_inner_join/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_inner_join/data" } ], "exec_env": { "spark.sql.streaming.stateStore.stateSchemaCheck": false } } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_inner_join/streaming_inner_join_control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": 
"customer_name", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/customer_schema.json ================================================ { "type": "struct", "fields": [ { "name": "customerId", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customerClickTime", "type": "timestamp", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/control/streaming_left_outer_join.csv ================================================ customerId|customerBuyTime|customerClickTime 0|2018-03-06 04:32:09.076|2018-03-06 04:32:31.941 1|2018-03-06 04:32:09.276| 2|2018-03-06 04:32:09.476| 3|2018-03-06 04:32:10.676|2018-03-06 04:32:31.941 3|2018-03-06 07:54:10.876|2018-03-06 07:54:10.876 4|2018-03-06 04:32:10.876| 5|2018-03-06 04:32:10.076|2018-03-06 04:32:32.341 10|2018-03-06 04:32:00| 10|2018-03-06 04:53:10.676|2018-03-06 04:53:10.676 11|2018-03-06 04:53:10.876|2018-03-06 04:54:00.676 11|2018-03-06 04:53:10.876|2018-03-06 04:54:00.676 15|2018-03-06 04:32:05.676| 21|2018-03-03 04:53:10.876| ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/customer-part-01.csv ================================================ customerId|customerClickTime 0|2018-03-06T04:32:31.941+0000 3|2018-03-06T04:32:31.941+0000 5|2018-03-06T04:32:32.341+0000 8|2018-03-06T04:32:32.941+0000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/customer-part-02.csv ================================================ customerId|customerClickTime 10|2018-03-06T04:53:10.676+0000 ================================================ FILE: 
tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/customer-part-03.csv ================================================ customerId|customerClickTime 11|2018-03-06T04:54:00.676+0000 0|2018-03-06T07:53:10.876+0000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/customer-part-04.csv ================================================ customerId|customerClickTime ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/customer-part-05.csv ================================================ customerId|customerClickTime 3|2018-03-06T07:54:10.876+0000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/sales-part-01.csv ================================================ customerId|customerBuyTime 0|2018-03-06T04:32:09.076+0000 1|2018-03-06T04:32:09.276+0000 2|2018-03-06T04:32:09.476+0000 21|2018-03-03T04:53:10.876+0000 10|2018-03-06T04:32:00.000+0000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/sales-part-02.csv ================================================ customerId|customerBuyTime 3|2018-03-06T04:32:10.676+0000 4|2018-03-06T04:32:10.876+0000 5|2018-03-06T04:32:10.076+0000 15|2018-03-06T04:32:05.676+0000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/sales-part-03.csv ================================================ customerId|customerBuyTime 10|2018-03-06T04:53:10.676+0000 11|2018-03-06T04:53:10.876+0000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/sales-part-04.csv 
================================================ customerId|customerBuyTime 11|2018-03-06T04:53:10.876+0000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/data/source/sales-part-05.csv ================================================ customerId|customerBuyTime 3|2018-03-06T07:54:10.876+0000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "customerId", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customerBuyTime", "type": "timestamp", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/streaming_left_outer_join.json ================================================ { "input_specs": [ { "spec_id": "sales", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_left_outer_join/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_left_outer_join/data/sales" }, { "spec_id": "customers", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_left_outer_join/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_left_outer_join/data/customers" } ], "transform_specs": [ { "spec_id": "join_with_customers", "input_id": "sales", "transformers": [ { "function": "join", "args": { "left_df_alias": "df", "right_df_alias": "join_with", "join_with": 
"customers", "join_type": "left outer", "join_condition": "df.customerId = join_with.customerId and join_with.customerClickTime BETWEEN df.customerBuyTime AND df.customerBuyTime + INTERVAL 1 MINUTE", "select_cols": ["df.*", "join_with.customerClickTime"], "watermarker": {"df":{"col": "customerBuyTime", "watermarking_time": "10 seconds"}, "join_with": {"col": "customerClickTime", "watermarking_time": "20 seconds"}} } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "append", "data_format": "delta", "partitions": [ "customerId" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_left_outer_join/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_left_outer_join/data" } ], "exec_env": { "spark.sql.streaming.stateStore.stateSchemaCheck": false } } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_left_outer_join/streaming_left_outer_join_control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "customerId", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customerBuyTime", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customerClickTime", "type": "timestamp", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_right_outer_join/customer_schema.json ================================================ { "type": "struct", "fields": [ { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "name", "type": "string", "nullable": true, "metadata": {} }, { "name": "birth_date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", "nullable": true, "metadata": {} } ] } 
================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_right_outer_join/data/control/streaming_right_outer_join.csv ================================================ salesorder|item|date|customer|article|amount|customer_name 3|1|2017-05-12 01:01:01|customer1|article5|20000|Anna 3|2|2017-05-12 01:01:01|customer1|article2|12000|Anna 3|3|2017-05-12 01:01:01|customer1|article4|9000|Anna 4|1|2017-05-12 01:01:01|customer3|article3|8000|Sarah 2|1|2017-05-12 01:01:01|customer2|article4|1000|John 2|2|2017-05-12 01:01:01|customer2|article6|5000|John 2|3|2017-05-12 01:01:01|customer2|article1|3000|John 1|1|2017-05-12 00:00:01|customer1|article1|1000|Anna 1|2|2017-05-12 00:00:01|customer1|article2|2000|Anna 1|3|2017-05-12 00:00:01|customer1|article3|500|Anna| ||||||Fran| ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_right_outer_join/data/source/customer-part-01.csv ================================================ customer|name|birth_date|date customer1|Anna|01012002|2017-05-12 23:01:01.000 customer2|John|04051980|2017-05-12 23:01:01.000 customer3|Sarah|02051940|2017-05-12 23:01:01.000 customer5|Fran|02051940|2017-05-05 00:01:01.000 customer6|Nuno|02051940|2017-05-12 00:01:01.000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_right_outer_join/data/source/sales-part-01.csv ================================================ salesorder|item|date|customer|article|amount 1|1|2017-05-12 00:00:01.000|customer1|article1|1000 1|2|2017-05-12 00:00:01.000|customer1|article2|2000 1|3|2017-05-12 00:00:01.000|customer1|article3|500 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_right_outer_join/data/source/sales-part-02.csv ================================================ salesorder|item|date|customer|article|amount 
2|1|2017-05-12 01:01:01.000|customer2|article4|1000 2|2|2017-05-12 01:01:01.000|customer2|article6|5000 2|3|2017-05-12 01:01:01.000|customer2|article1|3000 3|1|2017-05-12 01:01:01.000|customer1|article5|20000 3|2|2017-05-12 01:01:01.000|customer1|article2|12000 3|3|2017-05-12 01:01:01.000|customer1|article4|9000 4|1|2017-05-12 01:01:01.000|customer3|article3|8000 4|3|2017-05-12 01:01:01.000|customer800|article1|3000 4|4|2017-05-05 01:01:01.000|customer3|article2|5000 4|4|2017-05-07 01:01:01.000|customer800|article2|5000 ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_right_outer_join/sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_right_outer_join/streaming_right_outer_join.json ================================================ { "input_specs": [ { "spec_id": "sales", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_right_outer_join/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_right_outer_join/data/sales" }, { "spec_id": "customers", "read_type": "streaming", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_right_outer_join/customer_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/transformations/watermarker/streaming_right_outer_join/data/customers" } ], "transform_specs": [ { "spec_id": "join_with_customers", "input_id": "sales", "transformers": [ { "function": "join", "args": { "left_df_alias": "df", "right_df_alias": "join_with", "join_with": "customers", "join_type": "right outer", "join_condition": "df.customer = join_with.customer and join_with.date >= df.date AND join_with.date <= df.date + interval 1 days", "select_cols": ["df.*", "join_with.name as customer_name"], "watermarker": {"df":{"col": "date", "watermarking_time": "2 days"}, "join_with": {"col": "date", "watermarking_time": "2 days"}} } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "join_with_customers", "write_type": "merge", "db_table": "test_db.streaming_outer_join", "data_format": "delta", "partitions": [ "date" ], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_right_outer_join/checkpoint" }, "merge_opts": { "merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.customer_name == new.customer_name", "update_predicate": "new.date >= current.date" }, "location": "file:///app/tests/lakehouse/out/feature/transformations/watermarker/streaming_right_outer_join/data" } ] } ================================================ FILE: tests/resources/feature/transformations/watermarker/streaming_right_outer_join/streaming_right_outer_join_control_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "timestamp", 
"nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer_name", "type": "string", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_batch_console.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } }, { "function": "coalesce", "args": { "num_partitions": 1 } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "console", "options": { "limit": 8, "truncate": false, "vertical": false } } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_batch_dataframe.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, 
"location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } }, { "function": "coalesce", "args": { "num_partitions": 1 } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "dataframe" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_batch_files.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": ["date"], "location": "file:///app/tests/lakehouse/out/feature/writers/write_batch_files/data" } ] } 
================================================ FILE: tests/resources/feature/writers/acons/write_batch_jdbc.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} }, {"function": "coalesce", "args": {"num_partitions": 1} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "jdbc", "partitions": ["date"], "options":{ "url": "jdbc:sqlite:/app/tests/lakehouse/out/feature/writers/write_batch_jdbc/test.db", "dbtable": "write_batch_jdbc", "driver": "org.sqlite.JDBC" } } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_batch_rest_api.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} }, {"function": "with_literals", "args": {"literals": {"payload": "{\"a\": \"a value\"}"}} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "rest_api", "options": { "rest_api_url": "https://www.dummy-url.local/dummy-endpoint", "rest_api_method": "post", "rest_api_header": {"Authorization": "Bearer dummytoken"} } } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_batch_table.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} }, {"function": "coalesce", "args": {"num_partitions": 1} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": ["date"], "db_table": "test_db.write_batch_table", 
"location": "file:///app/tests/lakehouse/out/feature/writers/write_batch_table/data" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_console.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "console" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_dataframe.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": 
"file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "dataframe" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_df_with_checkpoint.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST", "maxFilesPerTrigger": "1" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST", "maxFilesPerTrigger": "1" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "dataframe", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_df_with_checkpoint/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_files.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": 
"file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": ["date"], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_files/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_files/data" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_foreachBatch_console.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": 
"sales_historical", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "console" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_foreachBatch_dataframe.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "dataframe" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_foreachBatch_df_with_checkpoint.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST", "maxFilesPerTrigger": "1" }, "location": 
"file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "force_streaming_foreach_batch_processing": true, "transformers": [ { "function": "union", "args": { "union_with": ["sales_new"] } } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "dataframe", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_foreachBatch_df_with_checkpoint/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_foreachBatch_files.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "force_streaming_foreach_batch_processing": true, "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": 
"union_dataframes", "write_type": "append", "data_format": "delta", "partitions": ["date"], "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_foreachBatch_files/checkpoint" }, "location": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_foreachBatch_files/data" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_foreachBatch_jdbc.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "force_streaming_foreach_batch_processing": true, "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} }, {"function": "coalesce", "args": {"num_partitions": 1} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "jdbc", "partitions": ["date"], "options":{ "url": "jdbc:sqlite:/app/tests/lakehouse/out/feature/writers/write_streaming_foreachBatch_jdbc/test.db", "dbtable": "write_streaming_foreachBatch_jdbc", "driver": "org.sqlite.JDBC", "checkpointLocation": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_foreachBatch_jdbc/checkpoint" } } ] } ================================================ FILE: 
tests/resources/feature/writers/acons/write_streaming_foreachBatch_table.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "batch", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "force_streaming_foreach_batch_processing": true, "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} }, {"function": "coalesce", "args": {"num_partitions": 1} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": ["date"], "db_table": "test_db.write_streaming_foreachBatch_table", "location": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_foreachBatch_table/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_foreachBatch_table/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_multiple_dfs.json ================================================ { "input_specs": [ { "spec_id": "bronze_sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": 
"file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "bronze_sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "output_specs": [ { "spec_id": "sales_historical", "input_id": "bronze_sales_historical", "data_format": "dataframe" }, { "spec_id": "sales_new", "input_id": "bronze_sales_new", "data_format": "dataframe" } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_rest_api.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} }, {"function": "with_literals", "args": {"literals": {"payload": "{\"a\": \"a value\"}"}} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "data_format": "rest_api", "options": { "rest_api_url": "https://www.dummy-url.local/dummy-endpoint", "rest_api_method": "put", "rest_api_basic_auth_username": "dummy_user", "rest_api_basic_auth_password": 
"dummy_password" } } ] } ================================================ FILE: tests/resources/feature/writers/acons/write_streaming_table.json ================================================ { "input_specs": [ { "spec_id": "sales_historical", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_historical/" }, { "spec_id": "sales_new", "read_type": "streaming", "data_format": "csv", "schema_path": "file:///app/tests/lakehouse/in/feature/writers/schema/sales_schema.json", "options": { "header": true, "delimiter": "|", "mode": "FAILFAST" }, "location": "file:///app/tests/lakehouse/in/feature/writers/source/sales_new/" } ], "transform_specs": [ { "spec_id": "union_dataframes", "input_id": "sales_historical", "transformers": [ {"function": "union", "args": {"union_with": ["sales_new"]} }, {"function": "coalesce", "args": {"num_partitions": 1} } ] } ], "output_specs": [ { "spec_id": "sales", "input_id": "union_dataframes", "write_type": "append", "data_format": "delta", "partitions": ["date"], "db_table": "test_db.write_streaming_table", "location": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_table/data", "options": { "checkpointLocation": "file:///app/tests/lakehouse/out/feature/writers/write_streaming_table/checkpoint" } } ] } ================================================ FILE: tests/resources/feature/writers/control/writers_control.csv ================================================ salesorder|item|date|customer|article|amount 0|1|20140601|customer1|article1|1000 0|2|20140601|customer1|article2|2000 0|3|20140601|customer1|article3|500 1|1|20150601|customer1|article1|1000 1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 
2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 6|1|20160218|customer3|article7|100 6|2|20160218|customer3|article9|500 6|3|20160218|customer3|article8|300 7|1|20160218|customer5|article7|2000 ================================================ FILE: tests/resources/feature/writers/control/writers_control_streaming_dataframe_1.csv ================================================ salesorder|item|date|customer|article|amount 0|1|20140601|customer1|article1|1000 0|2|20140601|customer1|article2|2000 0|3|20140601|customer1|article3|500 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 ================================================ FILE: tests/resources/feature/writers/control/writers_control_streaming_dataframe_2.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20150601|customer1|article1|1000 1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 6|1|20160218|customer3|article7|100 6|2|20160218|customer3|article9|500 6|3|20160218|customer3|article8|300 7|1|20160218|customer5|article7|2000 ================================================ FILE: tests/resources/feature/writers/control/writers_control_streaming_dataframe_foreachBatch_1.csv ================================================ salesorder|item|date|customer|article|amount 0|1|20140601|customer1|article1|1000 0|2|20140601|customer1|article2|2000 0|3|20140601|customer1|article3|500 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 ================================================ FILE: tests/resources/feature/writers/control/writers_control_streaming_dataframe_foreachBatch_2.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20150601|customer1|article1|1000 
1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 3|1|20160215|customer1|article5|20000 6|1|20160218|customer3|article7|100 6|2|20160218|customer3|article9|500 6|3|20160218|customer3|article8|300 7|1|20160218|customer5|article7|2000 ================================================ FILE: tests/resources/feature/writers/schema/sales_schema.json ================================================ { "type": "struct", "fields": [ { "name": "salesorder", "type": "integer", "nullable": true, "metadata": {} }, { "name": "item", "type": "integer", "nullable": true, "metadata": {} }, { "name": "date", "type": "integer", "nullable": true, "metadata": {} }, { "name": "customer", "type": "string", "nullable": true, "metadata": {} }, { "name": "article", "type": "string", "nullable": true, "metadata": {} }, { "name": "amount", "type": "integer", "nullable": true, "metadata": {} } ] } ================================================ FILE: tests/resources/feature/writers/source/sales_historical_1.csv ================================================ salesorder|item|date|customer|article|amount 0|1|20140601|customer1|article1|1000 0|2|20140601|customer1|article2|2000 0|3|20140601|customer1|article3|500 ================================================ FILE: tests/resources/feature/writers/source/sales_historical_2.csv ================================================ salesorder|item|date|customer|article|amount 1|1|20150601|customer1|article1|1000 1|2|20150601|customer1|article2|2000 1|3|20150601|customer1|article3|500 ================================================ FILE: tests/resources/feature/writers/source/sales_new_1.csv ================================================ salesorder|item|date|customer|article|amount 2|1|20160215|customer2|article4|1000 2|2|20160215|customer2|article6|5000 2|3|20160215|customer2|article1|3000 
3|1|20160215|customer1|article5|20000 ================================================ FILE: tests/resources/feature/writers/source/sales_new_2.csv ================================================ salesorder|item|date|customer|article|amount 6|1|20160218|customer3|article7|100 6|2|20160218|customer3|article9|500 6|3|20160218|customer3|article8|300 7|1|20160218|customer5|article7|2000 ================================================ FILE: tests/resources/unit/custom_configs/custom_engine_config.yaml ================================================ notif_disallowed_email_servers: - dummy.file.server ================================================ FILE: tests/resources/unit/heartbeat/heartbeat_acon_creation/setup/column_list/heartbeat_sensor_control_table.json ================================================ { "sensor_source": "string", "sensor_id": "string", "sensor_read_type": "string", "asset_description": "string", "upstream_key": "string", "preprocess_query": "string", "latest_event_fetched_timestamp": "timestamp", "trigger_job_id": "string", "trigger_job_name": "string", "status": "string", "status_change_timestamp": "timestamp", "job_start_timestamp": "timestamp", "job_end_timestamp": "timestamp", "job_state": "string", "dependency_flag": "string" } ================================================ FILE: tests/resources/unit/heartbeat/heartbeat_acon_creation/setup/column_list/sensor_table.json ================================================ { "sensor_id": "string", "assets": "array", "status": "string", "status_change_timestamp": "timestamp", "checkpoint_location": "string", "upstream_key": "string", "upstream_value": "string" } ================================================ FILE: tests/resources/unit/heartbeat/heartbeat_anchor_job/setup/column_list/heartbeat_sensor_control_table.json ================================================ { "sensor_source": "string", "sensor_id": "string", "sensor_read_type": "string", "asset_description": "string", 
"upstream_key": "string", "preprocess_query": "string", "latest_event_fetched_timestamp": "timestamp", "trigger_job_id": "string", "trigger_job_name": "string", "status": "string", "status_change_timestamp": "timestamp", "job_start_timestamp": "timestamp", "job_end_timestamp": "timestamp", "job_state": "string", "dependency_flag": "string" } ================================================ FILE: tests/resources/unit/heartbeat/heartbeat_anchor_job/setup/column_list/sensor_table.json ================================================ { "sensor_id": "string", "assets": "array", "status": "string", "status_change_timestamp": "timestamp", "checkpoint_location": "string", "upstream_key": "string", "upstream_value": "string" } ================================================ FILE: tests/resources/unit/sharepoint_reader/data/sample_ok.csv ================================================ col_a,col_b 1,2 ================================================ FILE: tests/resources/unit/sharepoint_reader/data/sample_other_delim.csv ================================================ col_a;col_b 1;2 ================================================ FILE: tests/unit/__init__.py ================================================ """Tests utilities.""" ================================================ FILE: tests/unit/test_acon_validation.py ================================================ """Unit tests for ACON validators.""" import pytest @pytest.mark.parametrize( "scenario", [ { "name": "Validate delete objects function", "acon": { "operations": [ { "manager": "file", "function": "delete_objects", "bucket": "example-bucket", "object_paths": ["path/to/delete/"], "dry_run": True, } ], }, }, { "name": "Validate copy objects function with missing parameters", "acon": { "operations": [ { "manager": "file", "function": "copy_objects", "bucket": "example-bucket", "source_object": ["path/to/copy/"], } ] }, "exception": """Errors found during validation: Missing mandatory parameters for file manager 
def test_manager_validation(scenario: dict) -> None:
    """Check the manager ACON validation against a scenario's expected outcome.

    Args:
        scenario: scenario to test. It carries the ACON under the "acon" key
            and, optionally, the full expected error message under the
            "exception" key.
    """
    from lakehouse_engine.engine import validate_manager_list

    manager_acon = scenario["acon"]
    expected_error = scenario.get("exception")

    # Happy path: the ACON is expected to validate without raising anything.
    if not expected_error:
        validate_manager_list(manager_acon)
        return

    # Failure path: the raised message must match the expected text exactly.
    with pytest.raises(Exception) as raised:
        validate_manager_list(manager_acon)
    assert str(raised.value) == expected_error
def test_custom_config() -> None:
    """Test overriding the default engine configuration.

    The engine configuration is first overridden through a dictionary and then
    through a yaml file; in both cases the disallowed email servers must differ
    from the defaults. Finally, the configuration is reset and must match the
    defaults again.
    """
    default_configs = exec_env.ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers
    LOGGER.info(f"Default disallowed email server: {default_configs}")

    try:
        # Testing custom configurations using a dictionary
        exec_env.ExecEnv.set_default_engine_config(
            custom_configs_dict={
                "notif_disallowed_email_servers": ["dummy.server.test"]
            },
        )
        dict_custom_configs = (
            exec_env.ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers
        )
        LOGGER.info(
            f"Custom disallowed email server using dictionary: {dict_custom_configs}"
        )
        assert default_configs != dict_custom_configs

        # Testing custom configurations using a file
        exec_env.ExecEnv.set_default_engine_config(
            custom_configs_file_path=f"{TEST_RESOURCES}/custom_engine_config.yaml",
        )
        file_custom_configs = (
            exec_env.ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers
        )
        LOGGER.info(
            f"Custom disallowed email server using configuration file: "
            f"{file_custom_configs}"
        )
        assert default_configs != file_custom_configs
    finally:
        # Resetting to the default configurations. ENGINE_CONFIG is shared
        # class-level state, so the reset must happen even when one of the
        # assertions above fails; otherwise the custom configuration would
        # leak into the tests that run afterwards.
        exec_env.ExecEnv.set_default_engine_config(package="tests.configs")

    reset_configs = exec_env.ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers
    LOGGER.info(f"Reset disallowed email server: {reset_configs}")
    assert default_configs == reset_configs
"213245431", "usagePolicyId": "4567890", } CONTROL_DATA = { "run_id": "76890", "job_id": "657890", "job_name": "sadp-template-dummy_job", "workspace_id": "213245431", "policy_id": "4567890", "dp_name": "sadp-template", "environment": "dev", } def test_get_usage_context_for_serverless() -> None: """Test for get_usage_context_for_serverless method in DatabricksUtils.""" # Create a fake module and function fake_module = types.ModuleType("dbruntime.databricks_repl_context") fake_module.get_context = MagicMock( # type: ignore[attr-defined] return_value=MagicMock() ) sys.modules["dbruntime"] = types.ModuleType("dbruntime") sys.modules["dbruntime.databricks_repl_context"] = fake_module mock_context = MagicMock(**CONTEXT_KEYS) # Patch get_context to return our mock context with patch( "dbruntime.databricks_repl_context.get_context", return_value=mock_context ): with patch( "lakehouse_engine.core.exec_env.ExecEnv.get_environment", return_value="dev" ): usage_stats: dict = {} DatabricksUtils.get_usage_context_for_serverless(usage_stats) assert ( usage_stats == CONTROL_DATA ), f"Expected usage_stats to be {CONTROL_DATA}, but got {usage_stats}" # Clean up after test del sys.modules["dbruntime.databricks_repl_context"] del sys.modules["dbruntime"] ================================================ FILE: tests/unit/test_failure_notification_creation.py ================================================ """Unit tests for the creation of failure notifications.""" import re import time import pytest from lakehouse_engine.core.definitions import TerminatorSpec from lakehouse_engine.terminators.notifier_factory import NotifierFactory from lakehouse_engine.utils.logging_handler import LoggingHandler from tests.utils.smtp_server import SMTPServer LOGGER = LoggingHandler(__name__).get_logger() @pytest.mark.parametrize( "scenario", [ { "name": "Email notification creation using a template.", "spec": [ TerminatorSpec( function="notify", args={ "server": "localhost", "port": "1025", "type": 
"email", "template": "failure_notification_email", "from": "test-email@email.com", "to": ["test-email1@email.com", "test-email2@email.com"], "on_failure": True, }, ), ], "server": "localhost", "port": "1025", "expected": """ Job local in workspace local has failed with the exception: Test exception""", }, ], ) def test_failure_notification_creation(scenario: dict) -> None: """Testing notification creation. Args: scenario: scenario to test. """ expected_output = scenario["expected"] try: port = scenario["port"] server = scenario["server"] smtp_server = SMTPServer(server, port) smtp_server.start() # We sleep so the subprocess has time to start the debug smtp server time.sleep(2) NotifierFactory.generate_failure_notification( scenario["spec"], ValueError("Test exception") ) message = _parse_email_output(smtp_server.get_last_message().as_string()) assert message == expected_output finally: smtp_server.stop() def _parse_email_output(mail_content: str) -> str: """Parse the mail that was received in the debug smtp server. The regex is fetching the data between the encoding's field 'bit' and the next boundary of the email. Example notification content: Content-Type: multipart/mixed; boundary="===============1362798268250904879==" MIME-Version: 1.0 From: test-email@email.com To: test-email1@email.com, test-email2@email.com CC: BCC: Subject: Service Failure Importance: normal X-Peer: ('::1', 49472, 0, 0) X-MailFrom: test-email@email.com X-RcptTo: test-email1@email.com, test-email2@email.com --===============1362798268250904879== Content-Type: text/text; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Job local in workspace local has failed with the exception: Test exception --===============1362798268250904879==-- Args: mail_content: The content of the email to parse. Returns: The parsed email message. 
""" message = re.search("(?<=bit\n).*?(?=--=)", mail_content, re.S).group()[1:-1] return str(message) ================================================ FILE: tests/unit/test_heartbeat_acon_creation.py ================================================ """Module that tests the Acon creation function from the heartbeat module.""" from unittest.mock import Mock, patch import pytest from pyspark.sql import DataFrame from lakehouse_engine.algorithms.sensors.heartbeat import Heartbeat from lakehouse_engine.core.definitions import HeartbeatConfigSpec from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.logging_handler import LoggingHandler from lakehouse_engine.utils.schema_utils import SchemaUtils from tests.conftest import LAKEHOUSE, UNIT_RESOURCES from tests.utils.dataframe_helpers import DataframeHelpers from tests.utils.local_storage import LocalStorage TEST_NAME = "heartbeat_acon_creation" FEATURE_TEST_RESOURCES = f"{UNIT_RESOURCES}/heartbeat/{TEST_NAME}" _LOGGER = LoggingHandler(__name__).get_logger() _SETUP_DELTA_TABLES = ["heartbeat_sensor_control_table", "sensor_table"] def _create_heartbeat_table() -> None: """Create the necessary tables required for using Heartbeat.""" _LOGGER.info("Creating heartbeat tables") for table in _SETUP_DELTA_TABLES: DataframeHelpers.create_delta_table( cols=SchemaUtils.from_file_to_dict( f"file:///{FEATURE_TEST_RESOURCES}/setup/column_list/{table}.json" ), table=table, ) def _select_all(table: str) -> DataFrame: """Select all records from the specified table. Args: table (str): The name of the table. """ return ExecEnv.SESSION.sql(f"SELECT * FROM {table} ORDER BY sensor_id") # nosec def _check_acon(heartbeat_table: str, acon: dict, acon_result_list: dict) -> None: """Validates the generated ACON. Args: heartbeat_table (str): The name of the heartbeat control table. acon (dict): The initial ACON that feeds the heartbeat algorithm. acon_result_list (dict): The expected ACON configuration. 
""" _LOGGER.info("Checking acon creation.") for control_table_row in _select_all(heartbeat_table).collect(): result = Heartbeat._get_sensor_acon_from_heartbeat( HeartbeatConfigSpec.create_from_acon(acon), control_table_row ) print(result) assert result == acon_result_list[control_table_row["sensor_id"]] @pytest.mark.parametrize( "scenario", [ { "use_case_name": "delta_table", "rows_to_add": { "heartbeat": """ ("delta_table","dummy_order","batch", "delta_table_order_events",NULL,NULL,NULL, "9274610384726150","dummy_order_events","COMPLETED", NULL,NULL,NULL,"UNPAUSED","TRUE") """, }, "results": { "dummy_order": { "sensor_id": "dummy_order_9274610384726150", "assets": ["delta_table_order_events_9274610384726150"], "control_db_table_name": "test_db.sensor_table", "input_spec": { "spec_id": "sensor_upstream", "read_type": "batch", "data_format": "delta", "db_table": "dummy_order", "options": None, "location": None, "schema": None, }, "preprocess_query": None, "base_checkpoint_location": None, "fail_on_empty_result": False, }, }, }, { "use_case_name": "kafka", "rows_to_add": { "heartbeat": """ ("kafka", "sales: sales.dummy_deliveries", "batch","delta_table_order_events",NULL,NULL,NULL, "1847362093847561","dummy_order_events","COMPLETED", NULL,NULL,NULL,"UNPAUSED","TRUE") """, }, "results": { "sales: sales.dummy_deliveries": { "sensor_id": "sales__sales_dummy_deliveries_1847362093847561", "assets": ["delta_table_order_events_1847362093847561"], "control_db_table_name": "test_db.sensor_table", "input_spec": { "spec_id": "sensor_upstream", "read_type": "batch", "data_format": "kafka", "db_table": None, "options": { "kafka.bootstrap.servers": ["server1", "server2"], "subscribe": "sales.dummy_deliveries", "startingOffsets": "earliest", "kafka.security.protocol": "SSL", "kafka.ssl.truststore.location": "trust_store_location", "kafka.ssl.truststore.password": "key", "kafka.ssl.keystore.location": "keystore_location", "kafka.ssl.keystore.password": "key", }, "location": None, 
"schema": None, }, "preprocess_query": None, "base_checkpoint_location": None, "fail_on_empty_result": False, } }, }, { "use_case_name": "sap_b4", "rows_to_add": { "heartbeat": """ ("sap_b4","SAP_DUMMY_ID","batch", "dummy_tables","LOAD_DATE",NULL,NULL, "6039184726153847","dummy_order_events","COMPLETED", NULL,NULL,NULL,"UNPAUSED","FALSE"), ("sap_b4","SAP_DUMMY_ID2","batch", "dummy_tables","LOAD_DATE",NULL,NULL, "7482910364728193","dummy_order_events","COMPLETED", NULL,NULL,NULL,"UNPAUSED","FALSE") """, }, "results": { "SAP_DUMMY_ID": { "sensor_id": "SAP_DUMMY_ID_6039184726153847", "assets": ["dummy_tables_6039184726153847"], "control_db_table_name": "test_db.sensor_table", "input_spec": { "spec_id": "sensor_upstream", "read_type": "batch", "data_format": "sap_b4", "db_table": None, "options": { "prepareQuery": ( "WITH sensor_new_data AS (SELECT CHAIN_ID, " "CONCAT(DATUM, ZEIT) AS LOAD_DATE, ANALYZED_STATUS " "FROM sap_table " "WHERE UPPER(CHAIN_ID) = UPPER('SAP_DUMMY_ID') " "AND UPPER(ANALYZED_STATUS) = UPPER('G'))" ), "query": ( "SELECT COUNT(1) as count, " "'LOAD_DATE' as UPSTREAM_KEY, " "max(LOAD_DATE) as UPSTREAM_VALUE FROM sensor_new_data " "WHERE LOAD_DATE > '19000101000000' HAVING COUNT(1) > 0" ), }, "location": None, "schema": None, }, "preprocess_query": None, "base_checkpoint_location": None, "fail_on_empty_result": False, }, "SAP_DUMMY_ID2": { "sensor_id": "SAP_DUMMY_ID2_7482910364728193", "assets": ["dummy_tables_7482910364728193"], "control_db_table_name": "test_db.sensor_table", "input_spec": { "spec_id": "sensor_upstream", "read_type": "batch", "data_format": "sap_b4", "db_table": None, "options": { "prepareQuery": ( "WITH sensor_new_data AS (SELECT CHAIN_ID, " "CONCAT(DATUM, ZEIT) AS LOAD_DATE, ANALYZED_STATUS " "FROM sap_table " "WHERE " "UPPER(CHAIN_ID) = UPPER('SAP_DUMMY_ID2') " "AND UPPER(ANALYZED_STATUS) = UPPER('G'))" ), "query": ( "SELECT COUNT(1) as count, " "'LOAD_DATE' as UPSTREAM_KEY, " "max(LOAD_DATE) as UPSTREAM_VALUE FROM 
def test_get_sensor_acon(mock_get_db_utils: Mock, scenario: dict) -> None:
    """Test the acon creation.

    Args:
        mock_get_db_utils (Mock): The mocked object.
        scenario (dict): The test scenario to execute.

    Scenarios:
        1- For delta tables source.
        2- For kafka topics source.
        3- For SAP sources. In this scenario we have two records that will
            yield two different acons.
    """
    scenario_name = scenario["use_case_name"]
    records = scenario["rows_to_add"].get("heartbeat")
    acon_result_list = scenario["results"]

    heartbeat_table = "test_db.heartbeat_sensor_control_table"
    sensor_table = "test_db.sensor_table"

    acon = {
        "sensor_source": scenario_name,
        "data_format": "delta",
        "heartbeat_sensor_db_table": heartbeat_table,
        "lakehouse_engine_sensor_db_table": sensor_table,
        "token": "my-token",
        "domain": "adidas-domain.cloud.databricks.com",
    }

    _LOGGER.info(f"Scenario: {scenario_name}")

    if scenario_name == "sap_b4":
        _LOGGER.info("Adding the SAP B4 settings to the acon.")
        acon.update(
            {
                "data_format": "sap_b4",
                "jdbc_db_table": "sap_table",
                "options": {
                    "prepareQuery": "",
                    "query": "",
                },
            }
        )

    if scenario_name == "kafka":
        acon.update(
            {
                "data_format": "kafka",
                "kafka_configs": {
                    "sales": {
                        "kafka_bootstrap_servers_list": ["server1", "server2"],
                        "kafka_ssl_truststore_location": "trust_store_location",
                        "kafka_ssl_keystore_location": "keystore_location",
                        "truststore_pwd_secret_key": "trust_store_key",
                        "keystore_pwd_secret_key": "keystore_pwd_secret_key",
                    }
                },
            }
        )
        # Any secret requested through the mocked db utils resolves to "key".
        mock_db_utils = Mock()
        mock_secrets = Mock()
        mock_secrets.get.return_value = "key"
        mock_db_utils.secrets = mock_secrets
        mock_get_db_utils.return_value = mock_db_utils

    _create_heartbeat_table()

    try:
        _LOGGER.info("Inserting records in heartbeat table.")
        ExecEnv.SESSION.sql(
            f"""INSERT INTO {heartbeat_table} VALUES {records}"""  # nosec
        )

        _check_acon(heartbeat_table, acon, acon_result_list)
    finally:
        # Always remove the tables, even when an assertion fails; otherwise
        # the rows of this scenario would still be there when the next
        # parametrized scenario inserts its own records.
        for table in _SETUP_DELTA_TABLES:
            LocalStorage.clean_folder(f"{LAKEHOUSE}{table}")
            ExecEnv.SESSION.sql(f"""DROP TABLE IF EXISTS test_db.{table}""")  # nosec
"3849201756384721","events_orders","NEW_EVENT_AVAILABLE", NULL,NULL,NULL,"UNPAUSED","TRUE"), ("delta_table","dummy_test","batch", "delta_table_order_events",NULL,NULL,NULL, "7601938475620193","events_orders","NEW_EVENT_AVAILABLE", NULL,NULL,NULL,"UNPAUSED","TRUE"), ("delta_table","dummy_test2","batch", "delta_table_order_events",NULL,NULL,NULL, "7601938475620193","events_orders","NEW_EVENT_AVAILABLE", NULL,NULL,NULL,"UNPAUSED","TRUE") """, }, "jobs_triggered_count": 2, "job_id": ["3849201756384721", "7601938475620193"], }, { "use_case_name": "kafka_trigger_1_job", "sensor_source": "kafka", "trigger_jobs_records": { "heartbeat": """ ("kafka","dummy_test3","batch", "delta_table_order_events",NULL,NULL,NULL, "5918374620193847","events_orders","COMPLETE", NULL,NULL,NULL,"UNPAUSED","FALSE"), ("kafka","dummy_test4","batch", "delta_table_order_events",NULL,NULL,NULL, "5918374620193847","events_orders","NEW_EVENT_AVAILABLE", NULL,NULL,NULL,"UNPAUSED","TRUE") """, }, "jobs_triggered_count": 1, "job_id": ["5918374620193847"], }, { "use_case_name": "sap_b4_no_trigger", "sensor_source": "sap_b4", "trigger_jobs_records": { "heartbeat": """ ("sap_b4","dummy_test3","batch", "delta_table_order_events",NULL,NULL,NULL, "8203746159283746","events_orders","NEW_EVENT_AVAILABLE", NULL,NULL,NULL,"PAUSED","FALSE"), ("sap_b4","dummy_test4","batch", "delta_table_order_events",NULL,NULL,NULL, "8203746159283746","events_orders","COMPLETE", NULL,NULL,NULL,"UNPAUSED","TRUE") """ }, "jobs_triggered_count": 0, }, ], ) @patch( "lakehouse_engine.core.sensor_manager.SensorJobRunManager.run_job", return_value=("run_id", None), ) def test_anchor_job(mock_run_job: Mock, scenario: dict) -> None: """Test the number of jobs triggered. Args: mock_run_job (Mock): The mocked object. scenario: The test scenario to execute. Scenarios: 1- 2 different jobs id's each one with two hard dependencies. From the 4 records in the table, only two should trigger a job. 
2- 1 job id with two records that can trigger the job. Only 1 comply with the specifications to trigger a job. 3- 1 job id with two records that can trigger the job. None comply with the specifications to trigger a job. """ scenario_name = scenario["use_case_name"] sensor_source = scenario["sensor_source"] records = scenario["trigger_jobs_records"].get("heartbeat") jobs_triggered_count = scenario["jobs_triggered_count"] heartbeat_table = "test_db.heartbeat_sensor_control_table" sensor_table = "test_db.sensor_table" acon = { "heartbeat_sensor_db_table": heartbeat_table, "lakehouse_engine_sensor_db_table": sensor_table, "data_format": "delta", "sensor_source": sensor_source, "token": "my-token", "domain": "adidas-domain.cloud.databricks.com", } _LOGGER.info(f"Scenario: {scenario_name}") _create_heartbeat_table() ExecEnv.SESSION.sql( f"""INSERT INTO {heartbeat_table} VALUES {records}""" # nosec ) trigger_heartbeat_sensor_jobs(acon=acon) assert mock_run_job.call_count == jobs_triggered_count if jobs_triggered_count > 0: triggered_job_id = scenario["job_id"] for call_args in mock_run_job.call_args_list: assert call_args[0][0] in triggered_job_id for table in _SETUP_DELTA_TABLES: LocalStorage.clean_folder(f"{LAKEHOUSE}{table}") ExecEnv.SESSION.sql(f"""DROP TABLE IF EXISTS test_db.{table}""") # nosec ================================================ FILE: tests/unit/test_log_filter_sensitive_data.py ================================================ """Unit tests focusing on the logging filter FilterSensitiveData.""" import logging from typing import Any from lakehouse_engine.utils.logging_handler import LoggingHandler STR_MSGS_TO_LOG = [ { # Sample acon being logged, password has comma and double quotes "original_log": "Read Algorithm Configuration: {'input_specs': [{'spec_id': " "'source', 'read_type': 'batch', 'data_format': 'sap_bw', 'options': " "{'driver': 'org.sqlite.JDBC', 'user': 'user', 'password': 'p,w\"d', " "'url': 'jdbc:url', 'dbtable': 'table', 
'numPartitions': 2, 'extraction_type': " "'delta', 'partitionColumn': 'item', 'lowerBound': 1, 'upperBound': 3}}], " "'output_specs': [{'spec_id': 'bronze', 'input_id': 'source', 'write_type': " "'append', 'data_format': 'delta', 'partitions': ['actrequest_timestamp'], " "'location': 'file:////path'}]}", "masked_log": "Read Algorithm Configuration: {'input_specs': [{'spec_id': " "'source', 'read_type': 'batch', 'data_format': 'sap_bw', 'options': " "{'driver': 'org.sqlite.JDBC', 'user': 'user', 'masked_cred': '******', " "'url': 'jdbc:url', 'dbtable': 'table', 'numPartitions': 2, 'extraction_type': " "'delta', 'partitionColumn': 'item', 'lowerBound': 1, 'upperBound': 3}}], " "'output_specs': [{'spec_id': 'bronze', 'input_id': 'source', 'write_type': " "'append', 'data_format': 'delta', 'partitions': ['actrequest_timestamp'], " "'location': 'file:////path'}]}", }, { # no single neither double quotes "original_log": "prop1: prop2, password: pwd, secret: secret", "masked_log": "prop1: prop2, masked_cred: ******, " "masked_cred: ******, ", }, { # double quotes, password has single quotes and comma, ends with secret and space # and additional log "original_log": '"prop1": "prop2", "password": "p,w\'d", ' '"secret": "secret" other logs', "masked_log": '"prop1": "prop2", "masked_cred": "******", ' '"masked_cred": "******", other logs', }, { "original_log": "Read Algorithm Configuration: {'input_specs': [{'spec_id': " "'source', 'read_type': 'streaming', 'data_format': 'kafka', 'options': " "{'kafka.ssl.truststore.password': 'p,w\"d', 'kafka.ssl.keystore.password': " "'p,w\"d'}}], 'output_specs': [{'spec_id': 'bronze', 'input_id': 'source', " "'write_type': 'append', 'data_format': 'delta', 'partitions': " "['actrequest_timestamp'], 'location': 'file:////path'}]}", "masked_log": "Read Algorithm Configuration: {'input_specs': [{'spec_id': " "'source', 'read_type': 'streaming', 'data_format': 'kafka', 'options': " "{'masked_cred': '******', 'masked_cred': '******', }], " 
def test_log_filter_sensitive_data(caplog: Any) -> None:
    """Check that FilterSensitiveData masks credentials in logged messages.

    Every fixture entry is logged through its `original_log` value and the
    captured log text must then contain the matching `masked_log` value.
    String fixtures are processed first, followed by the dict fixtures.

    :param caplog: captures the log.
    """
    all_messages = STR_MSGS_TO_LOG + DICT_MSGS_TO_LOG

    with caplog.at_level(logging.INFO):
        for message in all_messages:
            LOGGER.info(message["original_log"])
            expected_masked = message["masked_log"]
            assert expected_masked in caplog.text
TerminatorSpec( function="notify", args={ "server": "localhost", "port": "1025", "type": "email", "template": "failure_notification_email", "from": "test-email@email.com", "to": ["test-email1@email.com", "test-email2@email.com"], "exception": "test-exception", }, ), "expected": """ Job local in workspace local has failed with the exception: test-exception""", }, { "name": "Error: missing template", "spec": TerminatorSpec( function="notify", args={ "server": "localhost", "port": "1025", "type": "email", "template": "missing template", "from": "test-email@email.com", "to": ["test-email1@email.com", "test-email2@email.com"], "exception": "test-exception", }, ), "expected": "Template missing template does not exist", }, { "name": "Error: Malformed acon", "spec": TerminatorSpec( function="notify", args={ "server": "localhost", "port": "1025", "type": "email", "from": "test-email@email.com", "to": ["test-email1@email.com", "test-email2@email.com"], "exception": "test-exception", }, ), "expected": "Malformed Notification Definition", }, ], ) def test_notification_creation(scenario: dict) -> None: """Testing notification creation. Args: scenario: scenario to test. 
""" notifier = NotifierFactory.get_notifier(scenario["spec"]) if "Error: " in scenario["name"]: with pytest.raises( ( NotifierTemplateNotFoundException, NotifierConfigException, NotifierTemplateConfigException, ) ) as e: notifier.create_notification() assert str(e.value) == scenario["expected"] else: notifier.create_notification() assert notifier.notification["message"] == scenario["expected"] @pytest.mark.parametrize( "scenario", [ TerminatorSpec( function="notify", args={ "server": "localhost", "port": "1025", "type": "email", "from": "test-email@email.com", "to": ["test-email1@email.com", "test-email2@email.com"], "subject": "test-subject", "message": "test-message", }, ), TerminatorSpec( function="notify", args={ "server": "localhost", "port": "1025", "type": "email", "from": "test-email@email.com", "cc": ["test-email1@email.com", "test-email2@email.com"], "bcc": ["test-email3@email.com", "test-email4@email.com"], "mimetype": "html", "subject": "test-subject", "message": "test-message", "attachments": [ f"{TEST_ATTACHEMENTS_PATH}test_attachement.txt", f"{TEST_ATTACHEMENTS_PATH}test_image.png", ], }, ), ], ) def test_office365_notification_creation(scenario: TerminatorSpec) -> None: """Testing Office 365 notification creation.""" notifier = EmailNotifier(scenario) body = notifier._create_graph_api_email_body() for recipient, test_recipient in zip( body.message.to_recipients, scenario.args.get("to", []) ): assert recipient.email_address.address == test_recipient for recipient, test_recipient in zip( body.message.cc_recipients, scenario.args.get("cc", []) ): assert recipient.email_address.address == test_recipient for recipient, test_recipient in zip( body.message.bcc_recipients, scenario.args.get("bcc", []) ): assert recipient.email_address.address == test_recipient if body.message.attachments: for attachment, test_attachment in zip( body.message.attachments, scenario.args.get("attachments") ): assert attachment.name == test_attachment.split("/")[-1] with 
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "name": "Error: wrong type of notifier",
            "spec": TerminatorSpec(
                function="notify",
                args={
                    "server": "localhost",
                    "port": "1025",
                    "type": "snailmail",
                    "template": "failure_notification_email",
                    "from": "test-email@email.com",
                    "to": ["test-email1@email.com", "test-email2@email.com"],
                },
            ),
            "expected": "The requested notification format snailmail is not supported.",
        },
        {
            "name": "Creation of email",
            "spec": TerminatorSpec(
                function="notify",
                args={
                    "server": "localhost",
                    "port": "1025",
                    "type": "email",
                    "template": "failure_notification_email",
                    "from": "test-email@email.com",
                    "to": ["test-email1@email.com", "test-email2@email.com"],
                },
            ),
            "expected": "email",
        },
    ],
)
def test_notification_factory(scenario: dict) -> None:
    """Check which notifier the factory builds for a terminator spec.

    Scenarios whose name contains "Error: " must raise
    NotifierNotFoundException with the expected message; the remaining
    scenarios must return a notifier whose type equals the expected value.

    Args:
        scenario: scenario to test.
    """
    terminator_spec = scenario["spec"]
    expected = scenario["expected"]
    expects_failure = "Error: " in scenario["name"]

    if not expects_failure:
        created = NotifierFactory.get_notifier(terminator_spec)
        assert created.type == expected
        return

    with pytest.raises(NotifierNotFoundException) as raised:
        NotifierFactory.get_notifier(terminator_spec)

    assert expected == str(raised.value)
{ "dq_rule_id": "rule_2", "execution_point": "in_motion", "schema": "test_db", "table": "dummy_sales", "column": "", "dimension": "", "filters": "", "note": "Test Notes", }, }, }, { "function": "expect_column_to_exist", "args": { "column": "test_column", "meta": { "dq_rule_id": "rule_1", "execution_point": "in_motion", "schema": "test_db", "table": "dummy_sales", "column": "", "dimension": "", "filters": "", "note": "Test Notes", }, }, }, { "function": "expect_column_to_exist", "args": { "column": "test_column", "meta": { "dq_rule_id": "rule_2", "execution_point": "in_motion", "schema": "test_db", "table": "dummy_sales", "column": "", "dimension": "", "filters": "", "note": "Test Notes", }, }, }, ], }, }, ], ) def test_prisma_manual_function_definition(scenario: dict) -> None: """Test the manual definition of dq functions when using prisma dq framework. Args: scenario (dict): The test scenario. """ dq_functions = [ DQFunctionSpec(function=dq_function["function"], args=dq_function["args"]) for dq_function in scenario["dq_spec"]["dq_functions"] ] dq_spec_list = [ DQSpec( spec_id=scenario["spec_id"], input_id=scenario["name"], dq_type=DQType.PRISMA.value, dq_functions=dq_functions, ) ] if "Error: " in scenario["name"]: error = PrismaUtils.validate_rule_id_duplication(specs=dq_spec_list) expected_error = {"dq_spec_id: spec_with_duplicates": "rule_2; rule_1; rule_2"} _LOGGER.critical( f"A duplicate dq_rule_id was found!!!" 
"Please verify the following list:" f"{error}" ) assert error == expected_error else: PrismaUtils.validate_rule_id_duplication(specs=dq_spec_list) ================================================ FILE: tests/unit/test_prisma_function_definition.py ================================================ """Test the manual definition of dq functions when using prisma dq framework.""" import pytest from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.dq_processors.exceptions import DQSpecMalformedException from lakehouse_engine.utils.dq_utils import DQUtils @pytest.mark.parametrize( "scenario", [ { "name": "Error: missing meta parameters", "dq_spec": { "dq_functions": [ { "function": "expect_column_to_exist", "args": { "column": "test_column", "meta": { "table": "test_table", "execution_point": "in_motion", }, }, }, ], }, "expected": "The dq function meta field must contain all the " "fields defined" ": ['dq_rule_id', 'execution_point', 'filters', 'schema', " "'table', 'column', 'dimension'].\n" "Found fields: ['table', 'execution_point'].\n" "Diff: ['column', 'dimension', 'dq_rule_id', 'filters', 'schema']", }, { "name": "Error: missing meta", "dq_spec": { "dq_functions": [ { "function": "expect_column_to_exist", "args": { "column": "test_column", }, }, ], }, "expected": "The dq function must have a meta field containing all the " "fields defined: ['dq_rule_id', " "'execution_point', 'filters', 'schema', 'table', 'column', " "'dimension'].", }, { "name": "Definition of DQ Functions", "dq_spec": { "dq_functions": [ { "function": "expect_column_to_exist", "args": { "column": "test_column", "meta": { "dq_rule_id": "rule_2", "execution_point": "in_motion", "schema": "test_db", "table": "dummy_sales", "column": "", "dimension": "", "filters": "", }, }, }, ], }, "expected": None, }, { "name": "Definition of DQ Functions with extra params", "dq_spec": { "dq_functions": [ { "function": "expect_column_to_exist", "args": { "column": "test_column", "meta": { 
@patch(
    "lakehouse_engine.io.writers.rest_api_writer.execute_api_request",
    return_value=RestResponse(status_code=200, text="ok"),
)
def test_send_payload_to_rest_api_simple_params(_: Any, caplog: Any) -> None:
    """Test if the REST API payload creation process is correct w/ simple params.

    The row payload is sent through the function returned by the writer
    (with the API request itself patched) and the logged final payload is
    checked.

    Args:
        _: ignored patch.
        caplog: captures the log.
    """
    output_spec = OutputSpec(
        spec_id="test_output",
        input_id="test_input",
        write_type="overwrite",
        data_format="rest_api",
        options={
            "rest_api_url": "https://www.dummy-url.local/dummy-endpoint",
            "rest_api_method": "post",
            "rest_api_header": {"Authorization": "Bearer dummytoken"},
        },
    )

    row = Row(payload='{"dummy_payload":"dummy value"}')

    func = RestApiWriter._get_func_to_send_payload_to_rest_api(output_spec)

    # The capture level only applies to records emitted inside the context,
    # so the call that logs must run within it.
    with caplog.at_level(logging.DEBUG):
        func(row)

    str_to_assert = "Final payload: {'dummy_payload': 'dummy value'}"

    assert str_to_assert in caplog.text


@patch(
    "lakehouse_engine.io.writers.rest_api_writer.execute_api_request",
    return_value=RestResponse(status_code=200, text="ok"),
)
def test_send_payload_to_rest_api_with_file_params(_: Any, caplog: Any) -> None:
    """Test if the REST API payload creation process is correct with file params.

    Same flow as the simple-params test, with the file payload options and
    an extra json payload set in the output spec options.

    Args:
        _: ignored patch.
        caplog: captures the log.
    """
    output_spec = OutputSpec(
        spec_id="test_output",
        input_id="test_input",
        write_type="overwrite",
        data_format="rest_api",
        options={
            "rest_api_url": "https://www.dummy-url.local/dummy-endpoint",
            "rest_api_method": "post",
            "rest_api_header": {"Authorization": "Bearer dummytoken"},
            "rest_api_is_file_payload": True,
            "rest_api_file_payload_name": "anotherFileName",
            "rest_api_extra_json_payload": {"a": "b"},
        },
    )

    row = Row(payload='{"dummy_payload":"dummy value"}')

    func = RestApiWriter._get_func_to_send_payload_to_rest_api(output_spec)

    # The capture level only applies to records emitted inside the context,
    # so the call that logs must run within it.
    with caplog.at_level(logging.DEBUG):
        func(row)

    str_to_assert = (
        "Final payload: {'anotherFileName': "
        "'{\"dummy_payload\":\"dummy value\"}', 'a': 'b'}"
    )

    assert str_to_assert in caplog.text
@pytest.mark.parametrize(
    "scenario",
    [
        {
            "scenario_name": "create_sensor",
            "sensor_data": {
                "sensor_id": "sensor_id_1",
                "assets": ["asset_1"],
                "control_db_table_name": "control_sensor_table_name",
                "input_spec": {
                    "spec_id": "input_spec",
                    "read_type": ReadType.STREAMING.value,
                    "data_format": InputFormat.CSV.value,
                },
                "fail_on_empty_result": False,
                "base_checkpoint_location": "s3://dummy-bucket",
            },
            "sensor_already_exists": False,
            "expected_result": SensorSpec(
                sensor_id="sensor_id_1",
                assets=["asset_1"],
                control_db_table_name="control_sensor_table_name",
                input_spec=InputSpec(
                    spec_id="input_spec",
                    read_type=ReadType.STREAMING.value,
                    data_format=InputFormat.CSV.value,
                ),
                preprocess_query=None,
                checkpoint_location="s3://dummy-bucket"
                "/lakehouse_engine/sensors/sensor_id_1",
                fail_on_empty_result=False,
            ),
        },
        {
            "scenario_name": "raise_exception_sensor_already_exists",
            "sensor_data": {
                "sensor_id": "sensor_id_1",
                "assets": ["asset_1"],
                "control_db_table_name": "control_sensor_table_name",
                "input_spec": {
                    "spec_id": "input_spec",
                    "read_type": ReadType.STREAMING.value,
                    "data_format": InputFormat.CSV.value,
                },
                "fail_on_empty_result": False,
                "base_checkpoint_location": "s3://dummy-bucket",
            },
            "sensor_already_exists": True,
            "expected_result": "There's already a sensor registered "
            "with same id or assets!",
        },
    ],
)
def test_create_sensor(scenario: dict, capsys: Any) -> None:
    """Test Sensor creation.

    We will raise an exception if we try to create a Sensor that already
    exists, otherwise we will create successfully.

    Args:
        scenario: scenario to test.
        capsys: capture stdout and stderr.
    """
    # The context manager applies the patch on entry and reverts it on exit;
    # the replacement MagicMock needs no explicit start()/stop().
    with patch.object(
        Sensor,
        "_check_if_sensor_already_exists",
        new=MagicMock(return_value=scenario["sensor_already_exists"]),
    ):
        if scenario["scenario_name"] == "raise_exception_sensor_already_exists":
            with pytest.raises(SensorAlreadyExistsException) as exception:
                Sensor(scenario["sensor_data"])

            assert scenario["expected_result"] == str(exception.value)
        else:
            subject = Sensor(scenario["sensor_data"])

            assert subject.spec == scenario["expected_result"]
"create_non_existing_sensor_with_sensor_id_and_assets", "sensor_data": { "sensor_id": "sensor_id_1", "assets": ["asset_1"], "control_db_table_name": "control_sensor_table_name", "input_spec": { "spec_id": "input_spec", "read_type": ReadType.STREAMING.value, "data_format": InputFormat.CSV.value, }, "fail_on_empty_result": False, "base_checkpoint_location": "s3://dummy-bucket", }, "control_db_sensor_data": Row( sensor_id="sensor_id_1", assets=["asset_1"], status=SensorStatus.ACQUIRED_NEW_DATA.value, status_change_timestamp=datetime(2023, 5, 26, 14, 38, 16, 676508), checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", ), "expected_result": False, }, { "scenario_name": "raise_exception_as_sensor_" "already_exist_with_same_id_and_different_asset", "sensor_data": { "sensor_id": "sensor_id_1", "assets": ["asset_1"], "control_db_table_name": "control_sensor_table_name", "input_spec": { "spec_id": "input_spec", "read_type": ReadType.STREAMING.value, "data_format": InputFormat.CSV.value, }, "fail_on_empty_result": False, "base_checkpoint_location": "s3://dummy-bucket", }, "control_db_sensor_data": Row( sensor_id="sensor_id_1", assets=["asset_2"], status=SensorStatus.ACQUIRED_NEW_DATA.value, status_change_timestamp=datetime(2023, 5, 26, 14, 38, 16, 676508), checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", ), "expected_result": "There's already a sensor " "registered with same id or assets!", }, { "scenario_name": "raise_exception_as_sensor_" "already_exist_with_same_asset_and_different_id", "sensor_data": { "sensor_id": "sensor_id_1", "assets": ["asset_1"], "control_db_table_name": "control_sensor_table_name", "input_spec": { "spec_id": "input_spec", "read_type": ReadType.STREAMING.value, "data_format": InputFormat.CSV.value, }, "fail_on_empty_result": False, "base_checkpoint_location": "s3://dummy-bucket", }, "control_db_sensor_data": Row( sensor_id="sensor_id_2", assets=["asset_1"], status=SensorStatus.ACQUIRED_NEW_DATA.value, 
class TestExecuteSensor:
    """Test suite containing tests for the Sensor execute method.

    The sensor-existence check is patched to return False for the whole
    suite, so every test can instantiate a Sensor from its scenario data.
    """

    # Patcher object (not yet active); started/stopped by the class hooks.
    _sensor_already_exists_mock = patch.object(
        Sensor,
        "_check_if_sensor_already_exists",
        new=MagicMock(return_value=False),
    )

    @classmethod
    def setup_class(cls) -> None:
        """Start mock for all test methods in this suite."""
        cls._sensor_already_exists_mock.start()

    @classmethod
    def teardown_class(cls) -> None:
        """Clean mock after all test methods in this suite."""
        cls._sensor_already_exists_mock.stop()

    @pytest.mark.parametrize(
        "scenario",
        [
            {
                "scenario_name": "execute_stream_sensor",
                "sensor_data": {
                    "sensor_id": "sensor_id_1",
                    "assets": ["asset_1"],
                    "control_db_table_name": "control_sensor_table_name",
                    "input_spec": {
                        "spec_id": "input_spec",
                        "read_type": ReadType.STREAMING.value,
                        "data_format": InputFormat.CSV.value,
                    },
                    "fail_on_empty_result": False,
                    "base_checkpoint_location": "s3://dummy-bucket",
                },
                "expected_result": True,
            }
        ],
    )
    def test_execute_stream_sensor(self, scenario: dict, capsys: Any) -> None:
        """Test streaming Sensor execution.

        Args:
            scenario: scenario to test.
            capsys: capture stdout and stderr.
        """
        # Each patch is applied on entry and reverted on exit of the `with`;
        # the replacement mocks need no explicit start()/stop().
        with patch.object(
            SensorControlTableManager,
            "check_if_sensor_has_acquired_data",
            new=MagicMock(return_value=scenario["expected_result"]),
        ), patch.object(
            SensorUpstreamManager,
            "read_new_data",
            new=MagicMock(
                return_value=DataframeHelpers.create_empty_dataframe(StructType([]))
            ),
        ), patch.object(
            Sensor,
            "_run_streaming_sensor",
            new=MagicMock(return_value=scenario["expected_result"]),
        ):
            subject = Sensor(scenario["sensor_data"]).execute()

            assert subject == scenario["expected_result"]

    @pytest.mark.parametrize(
        "scenario",
        [
            {
                "scenario_name": "execute_batch_sensor",
                "sensor_data": {
                    "sensor_id": "sensor_id_1",
                    "assets": ["asset_1"],
                    "control_db_table_name": "control_sensor_table_name",
                    "input_spec": {
                        "spec_id": "input_spec",
                        "read_type": ReadType.BATCH.value,
                        "data_format": InputFormat.JDBC.value,
                    },
                },
                "expected_result": True,
            },
        ],
    )
    def test_execute_batch_sensor(self, scenario: dict, capsys: Any) -> None:
        """Test batch Sensor execution.

        Args:
            scenario: scenario to test.
            capsys: capture stdout and stderr.
        """
        with patch.object(
            SensorControlTableManager,
            "check_if_sensor_has_acquired_data",
            new=MagicMock(return_value=scenario["expected_result"]),
        ), patch.object(
            SensorUpstreamManager,
            "read_new_data",
            new=MagicMock(
                return_value=DataframeHelpers.create_empty_dataframe(StructType([]))
            ),
        ), patch.object(
            Sensor,
            "_run_batch_sensor",
            new=MagicMock(return_value=scenario["expected_result"]),
        ):
            subject = Sensor(scenario["sensor_data"]).execute()

            assert subject == scenario["expected_result"]

    @pytest.mark.parametrize(
        "scenario",
        [
            {
                "scenario_name": "raise_exception_sensor_"
                "input_spec_format_not_implemented",
                "sensor_data": {
                    "sensor_id": "sensor_id_1",
                    "assets": ["asset_1"],
                    "control_db_table_name": "control_sensor_table_name",
                    "input_spec": {
                        "spec_id": "input_spec",
                        "read_type": ReadType.BATCH.value,
                        "data_format": InputFormat.DATAFRAME.value,
                    },
                    "base_checkpoint_location": "s3://dummy-bucket",
                },
                "expected_result": "A sensor has not been implemented yet for "
                "this data format or, this data format is not available for "
                "the read_type batch. Check the allowed combinations of "
                "read_type and data_formats: {'streaming': ['kafka', 'avro', "
                "'json', 'parquet', 'csv', 'delta', "
                "'cloudfiles'], 'batch': ['delta', 'jdbc']}",
            },
            {
                "scenario_name": "raise_exception_sensor_"
                "input_spec_format_doesnt_exists",
                "sensor_data": {
                    "sensor_id": "sensor_id_1",
                    "assets": ["asset_1"],
                    "control_db_table_name": "control_sensor_table_name",
                    "input_spec": {
                        "spec_id": "input_spec",
                        "db_table": "test_db.test_table",
                        "read_type": ReadType.BATCH.value,
                        "data_format": "databricks",
                    },
                    "base_checkpoint_location": "s3://dummy-bucket",
                },
                "expected_result": "Data format databricks isn't implemented yet.",
            },
        ],
    )
    def test_execute_sensor_raise_no_input_spec_format_implemented(
        self, scenario: dict, capsys: Any
    ) -> None:
        """Expect to raise exception for input spec format not implemented.

        Args:
            scenario: scenario to test.
            capsys: capture stdout and stderr.
        """
        with pytest.raises(NotImplementedError) as exception:
            Sensor(scenario["sensor_data"]).execute()

        assert scenario["expected_result"] == str(exception.value)

    @pytest.mark.parametrize(
        "scenario",
        [
            {
                "scenario_name": "raise_no_new_data_exception",
                "sensor_data": {
                    "sensor_id": "sensor_id_1",
                    "assets": ["asset_1"],
                    "control_db_table_name": "control_sensor_table_name",
                    "input_spec": {
                        "spec_id": "input_spec",
                        "read_type": ReadType.STREAMING.value,
                        "data_format": InputFormat.KAFKA.value,
                    },
                    "base_checkpoint_location": "s3://dummy-bucket",
                    "fail_on_empty_result": True,
                },
                "expected_result": "No data was acquired by sensor_id_1 sensor.",
            },
        ],
    )
    def test_execute_sensor_raise_no_new_data_exception(
        self, scenario: dict, capsys: Any
    ) -> None:
        """Expect to raise exception for empty data.

        When we pass the flag `fail_on_empty_result` equals to `True`.

        Args:
            scenario: scenario to test.
            capsys: capture stdout and stderr.
        """
        with patch.object(
            SensorControlTableManager,
            "check_if_sensor_has_acquired_data",
            new=MagicMock(return_value=False),
        ), patch.object(
            SensorUpstreamManager,
            "read_new_data",
            new=MagicMock(
                return_value=DataframeHelpers.create_empty_dataframe(StructType([]))
            ),
        ), patch.object(
            Sensor, "_run_streaming_sensor", new=MagicMock(return_value=False)
        ):
            with pytest.raises(NoNewDataException) as exception:
                Sensor(scenario["sensor_data"]).execute()

            assert scenario["expected_result"] == str(exception.value)
"should_add_multiple_fields_to_update_set", "assets": ["asset_1"], "checkpoint_location": "s3://dummy-bucket/sensors/sensor_id_1", "upstream_key": "dummy_column", "upstream_value": "dummy_value", "updated_set_to_add": { "sensors.assets": "updates.assets", "sensors.checkpoint_location": "updates.checkpoint_location", "sensors.upstream_key": "updates.upstream_key", "sensors.upstream_value": "updates.upstream_value", }, }, ], ) def test_sensor_update_set(scenario: dict, capsys: Any) -> None: """Test sensor update set adding multiple fields based in the items to add. Args: scenario: scenario to test. capsys: capture stdout and stderr. """ expected_default_update_set = { "sensors.sensor_id": "updates.sensor_id", "sensors.status": "updates.status", "sensors.status_change_timestamp": "updates.status_change_timestamp", } subject = SensorControlTableManager._get_sensor_update_set( assets=scenario.get("assets"), checkpoint_location=scenario.get("checkpoint_location"), upstream_key=scenario.get("upstream_key"), upstream_value=scenario.get("upstream_value"), ) assert subject == {**expected_default_update_set, **scenario["updated_set_to_add"]} @pytest.mark.parametrize( "scenario", [ { "scenario_name": "true_when_table_data_and_status_acquired_new_data", "sensor_id": "sensor_id_1", "assets": ["asset_1"], "status": SensorStatus.ACQUIRED_NEW_DATA.value, "status_change_timestamp": datetime.now(), "checkpoint_location": "s3://dummy-bucket/sensors/sensor_id_1", "upstream_key": "dummy_column", "upstream_value": "dummy_value", }, ], ) def test_sensor_data(scenario: dict, capsys: Any) -> None: """Test Sensor data construction. Args: scenario: scenario to test. capsys: capture stdout and stderr. 
""" subject = SensorControlTableManager._convert_sensor_to_data( spec=SensorSpec( sensor_id=scenario["sensor_id"], assets=scenario["assets"], control_db_table_name=None, checkpoint_location=scenario["checkpoint_location"], preprocess_query=None, input_spec=None, ), status=scenario["status"], upstream_key=scenario["upstream_key"], upstream_value=scenario["upstream_value"], status_change_timestamp=scenario["status_change_timestamp"], ) assert subject == [ { "sensor_id": scenario["sensor_id"], "assets": scenario["assets"], "status": scenario["status"], "status_change_timestamp": scenario["status_change_timestamp"], "checkpoint_location": scenario["checkpoint_location"], "upstream_key": scenario["upstream_key"], "upstream_value": scenario["upstream_value"], } ] @pytest.mark.parametrize( "scenario", [ { "scenario_name": "true_when_table_data_and_status_acquired_new_data", "sensor_id": "sensor_id_1", "control_db_table_name": "sensor_control_db_table", "sensor_data": Row( sensor_id="sensor_id_1", assets=["asset_1"], status=SensorStatus.ACQUIRED_NEW_DATA.value, status_change_timestamp=TEST_DEFAULT_DATETIME, checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", ), "expected_result": True, }, { "scenario_name": "false_when_table_data_is_absent", "sensor_id": "sensor_id_1", "control_db_table_name": "sensor_control_db_table", "sensor_data": None, "expected_result": False, }, { "scenario_name": "false_when_table_data_is_present_and_" "status_different_than_acquired_new_data", "sensor_id": "sensor_id_1", "control_db_table_name": "sensor_control_db_table", "sensor_data": Row( sensor_id="sensor_id_1", assets=["asset_1"], status=SensorStatus.PROCESSED_NEW_DATA.value, status_change_timestamp=TEST_DEFAULT_DATETIME, checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", ), "expected_result": False, }, ], ) def test_check_if_sensor_has_acquired_data(scenario: dict, capsys: Any) -> None: """Test if Sensor has acquired data. Args: scenario: scenario to test. 
capsys: capture stdout and stderr. """ with patch.object( SensorControlTableManager, "read_sensor_table_data", new=MagicMock(return_value=scenario["sensor_data"]), ) as sensor_table_data_mock: sensor_table_data_mock.start() subject = SensorControlTableManager.check_if_sensor_has_acquired_data( sensor_id=scenario["sensor_id"], control_db_table_name=scenario["control_db_table_name"], ) assert subject == scenario["expected_result"] sensor_table_data_mock.stop() @pytest.fixture def control_table_fixture() -> DataFrame: """Return a dummy dataframe in the Sensor control table schema.""" schema = StructType( [ StructField("sensor_id", StringType(), False), StructField("assets", ArrayType(StringType(), False), True), StructField("status", StringType(), False), StructField("status_change_timestamp", TimestampType(), False), StructField("checkpoint_location", StringType(), True), ] ) return ExecEnv.SESSION.createDataFrame( [ [ "sensor_id_1", [], SensorStatus.ACQUIRED_NEW_DATA.value, TEST_DEFAULT_DATETIME, "s3://dummy-bucket/sensors/sensor_id_1", ], [ "sensor_id_2", ["asset_2"], SensorStatus.PROCESSED_NEW_DATA.value, TEST_DEFAULT_DATETIME, "s3://dummy-bucket/sensors/sensor_id_2", ], [ "sensor_id_3", ["asset_3"], SensorStatus.ACQUIRED_NEW_DATA.value, TEST_DEFAULT_DATETIME, "s3://dummy-bucket/sensors/sensor_id_3", ], ], schema, ) @pytest.mark.parametrize( "scenario", [ { "scenario_name": "sensor_id_is_present", "sensor_id": "sensor_id_1", "control_db_table_name": "sensor_control_db_table", "assets": None, "expected_result": { "sensor_id": "sensor_id_1", "assets": [], "status": SensorStatus.ACQUIRED_NEW_DATA.value, "status_change_timestamp": TEST_DEFAULT_DATETIME, "checkpoint_location": "s3://dummy-bucket/sensors/sensor_id_1", }, }, { "scenario_name": "sensor_id_is_absent_and_assets_is_present", "sensor_id": None, "control_db_table_name": "sensor_control_db_table", "assets": ["asset_2"], "expected_result": { "sensor_id": "sensor_id_2", "assets": ["asset_2"], "status": 
SensorStatus.PROCESSED_NEW_DATA.value, "status_change_timestamp": TEST_DEFAULT_DATETIME, "checkpoint_location": "s3://dummy-bucket/sensors/sensor_id_2", }, }, { "scenario_name": "sensor_id_and_sensor_asset_are_absent", "sensor_id": None, "control_db_table_name": "sensor_control_db_table", "assets": None, "expected_result": "Either sensor_id or assets " "need to be provided as arguments.", }, ], ) def test_read_sensor_table_data( scenario: dict, capsys: Any, control_table_fixture: DataFrame ) -> None: """Test read data from Sensor control table. Args: scenario: scenario to test. capsys: capture stdout and stderr. control_table_fixture: fixture representing the control table as DataFrame. """ expected_result = scenario["expected_result"] with patch.object(DeltaTable, "forName", MagicMock()) as delta_table_for_name_mock: delta_table_for_name_mock.start() with patch.object( delta_table_for_name_mock.return_value, "toDF", MagicMock(return_value=control_table_fixture), ) as delta_table_for_to_df_mock: delta_table_for_to_df_mock.start() if scenario["scenario_name"] == "sensor_id_and_sensor_asset_are_absent": with pytest.raises(ValueError) as exception: SensorControlTableManager.read_sensor_table_data( sensor_id=scenario["sensor_id"], control_db_table_name=scenario["control_db_table_name"], assets=scenario["assets"], ) assert expected_result in str(exception.value) else: subject = SensorControlTableManager.read_sensor_table_data( sensor_id=scenario["sensor_id"], control_db_table_name=scenario["control_db_table_name"], assets=scenario["assets"], ) assert subject.asDict() == expected_result delta_table_for_to_df_mock.stop() delta_table_for_name_mock.stop() @pytest.mark.parametrize( "scenario", [ { "scenario_name": "test_if_has_new_data", "empty_df": False, "expected_result": True, }, { "scenario_name": "test_if_has_not_new_data", "empty_df": True, "expected_result": False, }, ], ) def test_has_new_data(scenario: dict, capsys: Any) -> None: """Test if checking for new data 
works correctly where there is new data. Args: scenario: scenario to test. capsys: capture stdout and stderr. """ new_data_df = _prepare_new_data_tests(return_empty_df=scenario["empty_df"]) has_new_data = SensorUpstreamManager.get_new_data(new_data_df) is not None assert has_new_data == scenario["expected_result"] @pytest.mark.parametrize( "scenario", [ { "scenario_name": "sensor_db_table_and_default_dummy_value", "sensor": { "sensor_id": "sensor_id_1", "filter_exp": "?upstream_key > '?upstream_value'", "control_db_table_name": "test_jdbc_sensor_default_dummy_value", "upstream_key": "dummy_time", "upstream_value": None, }, "sensor_data": Row( sensor_id="sensor_id_1", assets=["asset_1"], status=SensorStatus.ACQUIRED_NEW_DATA.value, status_change_timestamp=TEST_DEFAULT_DATETIME, checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", upstream_key="dummy_time", upstream_value=None, ), "expected_result": "SELECT COUNT(1) as count, " "'dummy_time' as UPSTREAM_KEY, " "max(dummy_time) as UPSTREAM_VALUE " "FROM sensor_new_data " "WHERE dummy_time > '-2147483647' " "HAVING COUNT(1) > 0", }, { "scenario_name": "sensor_db_table_with_custom_value", "sensor": { "sensor_id": "sensor_id_1", "filter_exp": "?upstream_key > '?upstream_value'", "control_db_table_name": "test_jdbc_sensor_custom_value", "upstream_key": "dummy_time", "upstream_value": "3333333333", }, "sensor_data": Row( sensor_id="sensor_id_1", assets=["asset_1"], status=SensorStatus.ACQUIRED_NEW_DATA.value, status_change_timestamp=TEST_DEFAULT_DATETIME, checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", upstream_key="dummy_time", upstream_value="3333333333", ), "expected_result": "SELECT COUNT(1) as count, " "'dummy_time' as UPSTREAM_KEY, " "max(dummy_time) as UPSTREAM_VALUE " "FROM sensor_new_data " "WHERE dummy_time > '3333333333' " "HAVING COUNT(1) > 0", }, { "scenario_name": "filter_exp_preprocess_query", "sensor": { "sensor_id": "sensor_id_1", "filter_exp": "my_column > 'my_value'", 
"control_db_table_name": None, "upstream_key": None, "upstream_value": None, }, "sensor_data": Row( sensor_id="sensor_id_1", assets=["asset_1"], status=SensorStatus.ACQUIRED_NEW_DATA.value, status_change_timestamp=TEST_DEFAULT_DATETIME, checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", upstream_key=None, upstream_value=None, ), "expected_result": "SELECT COUNT(1) as count " "FROM sensor_new_data " "WHERE my_column > 'my_value' " "HAVING COUNT(1) > 0", }, { "scenario_name": "filter_exp_preprocess_query_from_upstream_table_name", "sensor": { "sensor_id": "sensor_id_1", "filter_exp": "?upstream_key > '?upstream_value'", "control_db_table_name": "test_jdbc_sensor_default_dummy_value", "upstream_key": "dummy_time", "upstream_value": "3333333333", "upstream_table_name": "test_db.dummy_table", }, "sensor_data": Row( sensor_id="sensor_id_1", assets=["asset_1"], status=SensorStatus.ACQUIRED_NEW_DATA.value, status_change_timestamp=TEST_DEFAULT_DATETIME, checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", upstream_key="dummy_time", upstream_value="3333333333", ), "expected_result": "SELECT COUNT(1) as count, " "'dummy_time' as UPSTREAM_KEY, " "max(dummy_time) as UPSTREAM_VALUE " "FROM test_db.dummy_table " "WHERE dummy_time > '3333333333' " "HAVING COUNT(1) > 0", }, { "scenario_name": "raise_exception_db_name_is_defined_and_upstream_key_not", "sensor": { "sensor_id": "sensor_id_1", "filter_exp": "my_column > 'my_value'", "control_db_table_name": "test_jdbc_sensor_raise_exception", "upstream_key": None, "upstream_value": None, }, "expected_result": "If control_db_table_name is defined, " "upstream_key should " "also be defined!", }, ], ) def test_if_generate_filter_exp_preprocess_query(scenario: dict, capsys: Any) -> None: """Test filter expression for preprocess query gen. Args: scenario: scenario to test. capsys: capture stdout and stderr. 
""" sensor_data = scenario["sensor"] expected_result = scenario["expected_result"] db_table = sensor_data.get("control_db_table_name") if ( scenario["scenario_name"] == "raise_exception_db_name_is_defined_and_upstream_key_not" ): with pytest.raises(ValueError) as exception: SensorUpstreamManager.generate_filter_exp_query( sensor_data.get("sensor_id"), sensor_data.get("filter_exp"), f"test_db.{db_table}" if db_table else None, sensor_data.get("upstream_key"), sensor_data.get("upstream_value"), ) assert expected_result in str(exception.value) else: with patch.object( SensorControlTableManager, "read_sensor_table_data", new=MagicMock(return_value=scenario["sensor_data"]), ) as sensor_table_data_mock: sensor_table_data_mock.start() subject = SensorUpstreamManager.generate_filter_exp_query( sensor_data.get("sensor_id"), sensor_data.get("filter_exp"), f"test_db.{db_table}" if db_table else None, sensor_data.get("upstream_key"), sensor_data.get("upstream_value"), sensor_data.get("upstream_table_name"), ) assert subject == expected_result sensor_table_data_mock.stop() @pytest.mark.parametrize( "scenario", [ { "scenario_name": "generate_sensor_table_preprocess_query", "sensor_id": "sensor_id_1", "expected_result": "SELECT * " # nosec "FROM sensor_new_data " "WHERE" " _change_type in ('insert', 'update_postimage')" " and sensor_id = 'sensor_id_1'" f" and status = '{SensorStatus.PROCESSED_NEW_DATA.value}'", } ], ) def test_generate_sensor_table_preprocess_query(scenario: dict, capsys: Any) -> None: """Test if we are generating correctly the preprocess query. Args: scenario: scenario to test. capsys: capture stdout and stderr. 
""" subject = SensorUpstreamManager.generate_sensor_table_preprocess_query( scenario["sensor_id"] ) assert subject == scenario["expected_result"] @pytest.fixture def dataframe_fixture() -> DataFrame: """Return a dummy dataframe to be used in our tests.""" schema = StructType([StructField("dummy_field", StringType(), True)]) return ExecEnv.SESSION.createDataFrame( [["a"], ["b"], ["c"]], schema, ) @pytest.mark.parametrize( "scenario", [ { "scenario_name": "read_new_data", "preprocess_query": None, "expected_result": 3, }, { "scenario_name": "read_new_data_with_preprocess_query", "preprocess_query": "SELECT *" "FROM sensor_new_data " "WHERE dummy_field = 'b' ", "expected_result": 1, }, ], ) def test_read_new_data( scenario: dict, capsys: Any, dataframe_fixture: DataFrame ) -> None: """Test if we execute the preprocess query when reading new data. Args: scenario: scenario to test. capsys: capture stdout and stderr. dataframe_fixture: fixture representing a dummy dataframe to be used as mock return. 
""" with patch.object( ReaderFactory, "get_data", MagicMock(return_value=dataframe_fixture) ) as reader_factory_mock: reader_factory_mock.start() new_data = SensorUpstreamManager.read_new_data( sensor_spec=SensorSpec( sensor_id="sensor_id_1", assets=["asset_1"], control_db_table_name="test_db.sensor_control_table", input_spec=None, preprocess_query=scenario["preprocess_query"], checkpoint_location="s3://dummy-bucket/sensors/sensor_id_1", ) ) assert new_data.count() == scenario["expected_result"] reader_factory_mock.stop() @pytest.mark.parametrize( "scenario", [ { "scenario_name": "generate_sap_logchain_query", "chain_id": "MY_SAP_CHAIN_ID", "expected_result": "WITH sensor_new_data AS (" "SELECT " "CHAIN_ID, " "CONCAT(DATUM, ZEIT) AS LOAD_DATE, " "ANALYZED_STATUS " "FROM SAPPHA.RSPCLOGCHAIN " "WHERE " "UPPER(CHAIN_ID) = UPPER('MY_SAP_CHAIN_ID') " "AND UPPER(ANALYZED_STATUS) = UPPER('G')" ")", # nosec }, { "scenario_name": "generate_sap_logchain_query_dbtable", "chain_id": "MY_SAP_CHAIN_ID", "dbtable": "test_db.test_table", "expected_result": "WITH sensor_new_data AS (" "SELECT " "CHAIN_ID, " "CONCAT(DATUM, ZEIT) AS LOAD_DATE, " "ANALYZED_STATUS " "FROM test_db.test_table " "WHERE " "UPPER(CHAIN_ID) = UPPER('MY_SAP_CHAIN_ID') " "AND UPPER(ANALYZED_STATUS) = UPPER('G')" ")", # nosec }, { "scenario_name": "generate_sap_logchain_query_status", "chain_id": "MY_SAP_CHAIN_ID", "status": "A", "expected_result": "WITH sensor_new_data AS (" "SELECT " "CHAIN_ID, " "CONCAT(DATUM, ZEIT) AS LOAD_DATE, " "ANALYZED_STATUS " "FROM SAPPHA.RSPCLOGCHAIN " "WHERE " "UPPER(CHAIN_ID) = UPPER('MY_SAP_CHAIN_ID') " "AND UPPER(ANALYZED_STATUS) = UPPER('A')" ")", # nosec }, { "scenario_name": "generate_sap_logchain_query_engine_table", "chain_id": "MY_SAP_CHAIN_ID", "engine_table_name": "test_SAPTABLE", "expected_result": "WITH test_SAPTABLE AS (" "SELECT " "CHAIN_ID, " "CONCAT(DATUM, ZEIT) AS LOAD_DATE, " "ANALYZED_STATUS " "FROM SAPPHA.RSPCLOGCHAIN " "WHERE " "UPPER(CHAIN_ID) = 
UPPER('MY_SAP_CHAIN_ID') " "AND UPPER(ANALYZED_STATUS) = UPPER('G')" ")", # nosec }, { "scenario_name": "generate_sap_logchain_query_full_custom", "chain_id": "MY_SAP_CHAIN_ID", "dbtable": "test_db.test_table", "status": "A", "engine_table_name": "test_SAPTABLE", "expected_result": "WITH test_SAPTABLE AS (" "SELECT " "CHAIN_ID, " "CONCAT(DATUM, ZEIT) AS LOAD_DATE, " "ANALYZED_STATUS " "FROM test_db.test_table " "WHERE " "UPPER(CHAIN_ID) = UPPER('MY_SAP_CHAIN_ID') " "AND UPPER(ANALYZED_STATUS) = UPPER('A')" ")", # nosec }, { "scenario_name": "raise_exception_chain_id_is_not_defined", "chain_id": None, "expected_result": "To query on log chain SAP table the chain id " "should be defined!", }, ], ) def test_generate_sensor_sap_logchain_query(scenario: dict, capsys: Any) -> None: """Test if we are generating correctly the sap logchain query. Args: scenario: scenario to test. capsys: capture stdout and stderr. """ if scenario["scenario_name"] == "raise_exception_chain_id_is_not_defined": with pytest.raises(ValueError) as exception: SensorUpstreamManager.generate_sensor_sap_logchain_query( scenario["chain_id"], scenario.get("dbtable"), scenario.get("status"), scenario.get("engine_table_name"), ) assert scenario["expected_result"] in str(exception.value) else: if scenario["scenario_name"] == "generate_sap_logchain_query": subject = SensorUpstreamManager.generate_sensor_sap_logchain_query( scenario.get("chain_id"), ) elif scenario["scenario_name"] == "generate_sap_logchain_query_dbtable": subject = SensorUpstreamManager.generate_sensor_sap_logchain_query( scenario.get("chain_id"), dbtable=scenario.get("dbtable"), ) elif scenario["scenario_name"] == "generate_sap_logchain_query_status": subject = SensorUpstreamManager.generate_sensor_sap_logchain_query( scenario.get("chain_id"), status=scenario.get("status"), ) elif scenario["scenario_name"] == "generate_sap_logchain_query_engine_table": subject = SensorUpstreamManager.generate_sensor_sap_logchain_query( 
scenario.get("chain_id"), engine_table_name=scenario.get("engine_table_name"), ) else: subject = SensorUpstreamManager.generate_sensor_sap_logchain_query( scenario.get("chain_id"), scenario.get("dbtable"), scenario.get("status"), scenario.get("engine_table_name"), ) assert subject == scenario["expected_result"] def _prepare_new_data_tests(return_empty_df: bool = False) -> DataFrame: schema = StructType([StructField("dummy_field", StringType(), True)]) if return_empty_df: return ExecEnv.SESSION.createDataFrame( [], schema, ) else: return ExecEnv.SESSION.createDataFrame( [["a"], ["b"], ["c"]], schema, ) ================================================ FILE: tests/unit/test_sharepoint_csv_reader.py ================================================ """Test Sharepoint CSV reader. Unit tests for delimiter detection and Spark CSV option resolution in `SharepointCsvReader`. """ from __future__ import annotations from typing import Any, Dict, cast from lakehouse_engine.io.readers.sharepoint_reader import SharepointCsvReader class DummySharepointOptions: """Minimal Sharepoint options stub used to build a `SharepointCsvReader`. Args: local_options: Dictionary of local CSV read options (for example, header, delimiter, sep). """ def __init__(self, local_options: Dict[str, Any]) -> None: """Initialize the dummy options with the provided local options.""" self.local_options = local_options class DummyInputSpec: """Minimal input spec stub that exposes `sharepoint_opts` as expected by the reader. Args: sharepoint_options: Instance containing `local_options`. """ def __init__(self, sharepoint_options: DummySharepointOptions) -> None: """Initialize the dummy input spec with the provided Sharepoint options.""" self.sharepoint_opts = sharepoint_options def create_csv_reader(local_options: Dict[str, Any]) -> SharepointCsvReader: """Create a `SharepointCsvReader` instance without calling its constructor. Args: local_options: Dictionary of local CSV read options. 
Returns: SharepointCsvReader: A partially-initialized reader instance. """ csv_reader: SharepointCsvReader = SharepointCsvReader.__new__(SharepointCsvReader) csv_reader._input_spec = cast( Any, DummyInputSpec(DummySharepointOptions(local_options)) ) return csv_reader def test_detect_delimiter_uses_user_provided_delimiter() -> None: """It should always return the explicitly provided delimiter.""" csv_reader: SharepointCsvReader = SharepointCsvReader.__new__(SharepointCsvReader) detected: str = csv_reader.detect_delimiter( file_content=b"column_a;column_b\n1;2\n", provided_delimiter="|", expected_columns=None, ) assert detected == "|" def test_detect_delimiter_autodetects_semicolon() -> None: """It should infer the delimiter from the file content when none is provided.""" csv_reader: SharepointCsvReader = SharepointCsvReader.__new__(SharepointCsvReader) detected: str = csv_reader.detect_delimiter( file_content=b"column_a;column_b\n1;2\n", provided_delimiter=None, expected_columns=None, ) assert detected == ";" def test_detect_delimiter_defaults_to_comma_on_decode_error() -> None: """It should fall back to comma when content cannot be decoded for sniffing.""" csv_reader: SharepointCsvReader = SharepointCsvReader.__new__(SharepointCsvReader) detected: str = csv_reader.detect_delimiter( file_content=b"\xff\xfe", provided_delimiter=None, expected_columns=None, ) assert detected == "," def test_resolve_csv_options_prefers_sep_over_delimiter() -> None: """`sep` should take precedence over `delimiter`, and `delimiter` should be removed. 
Args: None Returns: None """ csv_reader: SharepointCsvReader = create_csv_reader( {"sep": "|", "delimiter": ",", "header": True} ) spark_options: Dict[str, Any] = csv_reader.resolve_spark_csv_options( b"column_a,column_b\n1,2\n" ) assert spark_options["sep"] == "|" assert "delimiter" not in spark_options def test_resolve_spark_csv_options_uses_delimiter_when_sep_missing() -> None: """If `sep` is missing, `delimiter` should be mapped into `sep` and removed.""" csv_reader: SharepointCsvReader = create_csv_reader( {"delimiter": ";", "header": True} ) spark_options: Dict[str, Any] = csv_reader.resolve_spark_csv_options( b"column_a,column_b\n1,2\n" ) assert spark_options["sep"] == ";" assert "delimiter" not in spark_options def test_resolve_spark_csv_options_autodetects_when_no_delimiter_provided() -> None: """If neither `sep` nor `delimiter` is provided, it should autodetect from content. Args: None Returns: None """ csv_reader: SharepointCsvReader = create_csv_reader({"header": True}) spark_options: Dict[str, Any] = csv_reader.resolve_spark_csv_options( b"column_a|column_b\n1|2\n" ) assert spark_options["sep"] == "|" def test_resolve_spark_csv_options_warns_when_expected_columns_names_mismatch( caplog: Any, ) -> None: """Warn when expected column names do not match the header. Args: caplog: Pytest log capture fixture. Returns: None. """ csv_reader: SharepointCsvReader = create_csv_reader( { "header": True, "expected_columns": ["col_a", "col_b"], } ) # Header uses semicolon, delimiter should be detected as ';', but names mismatch. file_content: bytes = b"wrong_a;wrong_b\n1;2\n" with caplog.at_level("WARNING"): csv_reader.resolve_spark_csv_options(file_content) assert "Expected columns don't match CSV header" in caplog.text def test_resolve_spark_csv_options_warns_when_expected_columns_validation_fails( caplog: Any, ) -> None: """Warn when validation against the header cannot be performed. Args: caplog: Pytest log capture fixture. Returns: None. 
""" csv_reader: SharepointCsvReader = create_csv_reader( { "header": True, "expected_columns": ["col_a", "col_b"], } ) # Force decode failure inside the expected_columns validation block. file_content: bytes = b"\xff\xfe" with caplog.at_level("WARNING"): csv_reader.resolve_spark_csv_options(file_content) assert "Failed to validate expected_columns against CSV header" in caplog.text ================================================ FILE: tests/unit/test_spark_session.py ================================================ """Test if a new spark session returns the same object as current session.""" from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.utils.logging_handler import LoggingHandler LOGGER = LoggingHandler(__name__).get_logger() def test_spark_session() -> None: """Test if a new spark session returns the same object as current session.""" old_session = ExecEnv.SESSION.getActiveSession() ExecEnv.get_or_create() new_session = ExecEnv.SESSION.getActiveSession() assert old_session is new_session, ( "Sessions pointing to different objects." f"{new_session} is different than {old_session}" ) LOGGER.info( f"New session ({new_session}) is the same as previously " f"created session ({old_session})." 
) ================================================ FILE: tests/unit/test_version.py ================================================ """Test if the correct version of the lib is being read.""" import re from lakehouse_engine.utils.configs.config_utils import ConfigUtils def test_version() -> None: """Test if ConfigUtils is reading the correct version from pyproject.toml.""" configUtils = ConfigUtils() current_version = re.search( r"(?<=version = \").*(?=\")", open("pyproject.toml").read() ).group() assert current_version == configUtils.get_engine_version() ================================================ FILE: tests/utils/__init__.py ================================================ """Tests utilities.""" ================================================ FILE: tests/utils/dataframe_helpers.py ================================================ """Module with helper functions to interact with test dataframes.""" import random import string from typing import Optional, OrderedDict from pyspark.sql import DataFrame from pyspark.sql.types import StructType from lakehouse_engine.core.definitions import ( InputFormat, InputSpec, OutputFormat, OutputSpec, ReadType, WriteType, ) from lakehouse_engine.core.exec_env import ExecEnv from lakehouse_engine.io.readers.file_reader import FileReader from lakehouse_engine.io.readers.jdbc_reader import JDBCReader from lakehouse_engine.io.readers.table_reader import TableReader from lakehouse_engine.io.writers.jdbc_writer import JDBCWriter from lakehouse_engine.utils.logging_handler import LoggingHandler class DataframeHelpers(object): """Class with helper functions to interact with test dataframes.""" _logger = LoggingHandler(__name__).get_logger() @classmethod def has_diff( cls, df: DataFrame, another_df: DataFrame, group_and_order: bool = True ) -> bool: """Check if a dataframe has differences comparing to another dataframe. Note: the order of the columns and rows are not considered as differences by default. Args: df: one dataframe. 
another_df: another dataframe. group_and_order: whether to group and order the DFs or not. Returns: True if it has a difference, false otherwise. """ def print_diff(desc: str, diff_df: DataFrame) -> None: cls._logger.debug(desc) for row in diff_df.collect(): cls._logger.debug(row) cls._logger.debug("Checking if Dataframes have diff...") cols_to_group = df.columns if group_and_order: df = df.select(*cols_to_group).orderBy(*cols_to_group) another_df = another_df.select(*cols_to_group).orderBy(*cols_to_group) diff_1 = df.exceptAll(another_df) diff_2 = another_df.exceptAll(df) if diff_1.isEmpty() is False or diff_2.isEmpty() is False: df.show(100, False) another_df.show(100, False) cls._logger.debug("Dataframes have diff...") print_diff("Diff 1:", diff_1) print_diff("Diff 2:", diff_2) return True else: return False @staticmethod def read_from_file( location: str, file_format: str = InputFormat.CSV.value, schema: Optional[dict] = None, options: Optional[dict] = None, ) -> DataFrame: """Read data from a file into a dataframe. Args: location: location of the file(s). file_format: file(s) format. schema: schema of the files (only works with spark schema StructType for now). options: options (e.g., spark options) to read data. Returns: The dataframe that was read. """ if options is None and file_format == InputFormat.CSV.value: options = {"header": True, "delimiter": "|", "inferSchema": True} spec = InputSpec( spec_id=random.choice(string.ascii_letters), # nosec read_type=ReadType.BATCH.value, data_format=file_format, location=location, schema=schema, options=options, ) return FileReader(input_spec=spec).read() @staticmethod def read_from_table(db_table: str, options: Optional[dict] = None) -> DataFrame: """Read data from a table into a dataframe. Args: db_table: `database.table_name`. options: options (e.g., spark options) to read data. Returns: DataFrame: the dataframe that was read. 
""" spec = InputSpec( spec_id=random.choice(string.ascii_letters), # nosec read_type=ReadType.BATCH.value, db_table=db_table, options=options, ) return TableReader(input_spec=spec).read() @staticmethod def read_from_jdbc( uri: str, db_table: str, driver: str = "org.sqlite.JDBC" ) -> DataFrame: """Read data from jdbc into a dataframe. Args: uri: uri for the jdbc connection. db_table: `database.table_name`. driver: driver class. Returns: DataFrame: the dataframe that was read. """ spec = InputSpec( spec_id=random.choice(string.ascii_letters), # nosec db_table=db_table, read_type=ReadType.BATCH.value, options={"url": uri, "dbtable": db_table, "driver": driver}, ) return JDBCReader(input_spec=spec).read() @staticmethod def write_into_jdbc_table( df: DataFrame, uri: str, db_table: str, write_type: str = WriteType.APPEND.value, driver: str = "org.sqlite.JDBC", data: OrderedDict = None, ) -> None: """Write data into a jdbc table. Args: df: dataframe containing the data to append. uri: uri for the jdbc connection. db_table: `database.table_name`. write_type: type of writer to use for writing into the destination driver: driver class. data: list of all dfs generated on previous steps before writer. """ spec = OutputSpec( spec_id=random.choice(string.ascii_letters), # nosec input_id=random.choice(string.ascii_letters), # nosec write_type=write_type, data_format=OutputFormat.JDBC.value, options={"url": uri, "dbtable": db_table, "driver": driver}, ) JDBCWriter(output_spec=spec, df=df.coalesce(1), data=data).write() @staticmethod def create_empty_dataframe(struct_type: StructType) -> DataFrame: """Create an empty DataFrame. Args: struct_type: dict containing a spark schema structure. [Check here]( https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/StructType.html). 
def _create_dq_functions_source_table(
    test_resources_path: str,
    lakehouse_in_path: str,
    lakehouse_out_path: str,
    test_name: str,
    scenario: str,
    table_name: str,
) -> None:
    """Create and populate the dq functions source table used by a test.

    The csv holding the dq functions is copied from the test resources into
    the lakehouse in folder of the scenario, the delta table is created (if
    it does not exist yet) on the lakehouse out folder of the scenario, and
    the table is finally overwritten with the content of the copied csv.

    Args:
        test_resources_path: path to the test resources.
        lakehouse_in_path: path to the lakehouse in.
        lakehouse_out_path: path to the lakehouse out.
        test_name: name of the test.
        scenario: name of the test scenario.
        table_name: name of the test table.
    """
    # Folder (inside lakehouse in) that receives the csv of this scenario.
    scenario_in_dir = f"{lakehouse_in_path}/{test_name}/{scenario}/dq_functions"

    LocalStorage.copy_file(
        f"{test_resources_path}/{test_name}/data/dq_functions/{table_name}.csv",
        f"{scenario_in_dir}/",
    )

    ExecEnv.SESSION.sql(
        f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            dq_rule_id STRING,
            dq_check_type STRING,
            dq_tech_function STRING,
            execution_point STRING,
            schema STRING,
            table STRING,
            column STRING,
            filters STRING,
            arguments STRING,
            expected_technical_expression STRING,
            dimension STRING
        )
        USING delta
        LOCATION '{lakehouse_out_path}/{test_name}/{scenario}/dq_functions'
        TBLPROPERTIES(
            'lakehouse.primary_key'='dq_rule_id',
            'delta.enableChangeDataFeed'='false'
        )
        """
    )

    csv_reader = ExecEnv.SESSION.read.option("delimiter", "|").option("header", True)
    dq_functions = csv_reader.csv(f"{scenario_in_dir}/{table_name}.csv")
    dq_functions.write.saveAsTable(name=table_name, format="delta", mode="overwrite")
"net.sourceforge.f2j:arpack_combined_all", "spark.sql.sources.parallelPartitionDiscovery.parallelism": "2", "spark.sql.legacy.charVarcharAsString": True, }, ) @classmethod def set_exec_env_config(cls, key: str, value: str) -> None: """Set any execution environment (e.g., spark) session setting.""" ExecEnv.SESSION.conf.set(key, value) @classmethod def reset_default_spark_session_configs(cls) -> None: """Reset spark session configs.""" cls.set_exec_env_config( "spark.databricks.delta.schema.autoMerge.enabled", "false" ) cls.set_exec_env_config("spark.sql.streaming.schemaInference", "false") cls.set_exec_env_config( "spark.sql.sources.partitionColumnTypeInference.enabled", "true" ) ================================================ FILE: tests/utils/local_storage.py ================================================ """Utilities to interact with the local file system used in the tests.""" import glob from os import makedirs, path, remove from pathlib import Path from shutil import copy, copytree, rmtree from lakehouse_engine.utils.logging_handler import LoggingHandler _LOGGER = LoggingHandler(__name__).get_logger() class LocalStorage(object): """Helper class to support local storage operations in tests.""" @staticmethod def copy_file(from_path: str, to_path: str) -> None: """Copy files (supports regex) into target file or folder. :param str from_path: path from where to copy files from (supports regex). :param str to_path: path to where to copy files to. """ makedirs(path.dirname(to_path), exist_ok=True) for file in glob.glob(from_path): copy(file, to_path) @staticmethod def clean_folder(folder_path: str) -> None: """Clean a folder content. :param str folder_path: path of the folder to clean. """ if Path(folder_path).is_dir(): rmtree(folder_path) @staticmethod def delete_file(file_path: str) -> None: """Delete a file. :param str file_path: path of the file(s) to delete (supports regex). 
""" for file in glob.glob(file_path): if Path(file).exists(): remove(file) @staticmethod def read_file(file_path: str) -> str: """Read file from directory. Args: file_path: path of the file to be read. """ with open(file_path, "r") as f: result = f.read() return result @staticmethod def copy_dir(source: str, destination: str) -> None: """Copy all files in a directory. Args: source: string with the source location. destination: string with the destination location. """ copytree(source, destination, dirs_exist_ok=True) ================================================ FILE: tests/utils/mocks.py ================================================ """Module to hold utilities Mocks tests.""" from __future__ import annotations from typing import Any, Optional from unittest.mock import MagicMock class MockRESTResponse: """Mock Rest Responses for tests.""" def __init__( self, status_code: int, json_data: Optional[dict[str, Any]] = None, content: bytes = b"", ) -> None: """Construct MockRESTResponse instances. :param status_code: status code. :param json_data: json response. :param content: raw response content. """ self.status_code: int = status_code self.json_data: Optional[dict[str, Any]] = json_data self.content: bytes = content self.text: str = content.decode("utf-8", errors="ignore") if content else "" self.raise_for_status: MagicMock = MagicMock() def json(self) -> Optional[dict[str, Any]]: """Get json response. :return dict: json response. 
""" return self.json_data def __enter__(self) -> MockRESTResponse: """Allow use as a context manager.""" return self def __exit__( self, exc_type: type[BaseException] | None, exc: BaseException | None, tb: Any, ) -> None: """Context manager exit.""" return None ================================================ FILE: tests/utils/smtp_server.py ================================================ """A simple SMTP server for testing purposes.""" from logging import Logger from typing import Any from aiosmtpd import controller from aiosmtpd.handlers import Message from lakehouse_engine.utils.logging_handler import LoggingHandler class SMTPHandler(Message): """Custom handler to capture emails during testing.""" def __init__(self) -> None: """Initialize the SMTP handler.""" super().__init__() self.messages: list = [] def handle_message(self, message: Any) -> None: """Handle incoming messages and store them for verification. Args: message: The incoming email message. Returns: A string indicating the result of the message handling. """ self.messages.append(message) class SMTPServer: """Test SMTP server for unit testing.""" _LOGGER: Logger = LoggingHandler(__name__).get_logger() def __init__(self, host: str, port: int) -> None: """Initialize the SMTP server. Args: host: The hostname of the SMTP server. port: The port number of the SMTP server. 
""" self.host = host self.port = port self.handler = SMTPHandler() self.controller: controller.Controller | None = None def start(self) -> None: """Start the SMTP server.""" self.controller = controller.Controller( self.handler, hostname=self.host, port=self.port ) self.controller.start() self._LOGGER.info(f"Test SMTP server started on {self.host}:{self.port}") def stop(self) -> None: """Stop the SMTP server.""" if self.controller: self.controller.stop() self._LOGGER.info("Test SMTP server stopped") def get_messages(self) -> list: """Get all captured messages.""" return self.handler.messages def clear_messages(self) -> None: """Clear all captured messages.""" self.handler.messages.clear() def get_last_message(self) -> Any: """Get the last received message.""" return self.handler.messages[-1] if self.handler.messages else None