Use Case Setup
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC The Global Asset Builder (GAB) has been developed to help you automate the creation of aggregate tables for
# MAGIC dashboards on top of base fact tables. It reduces the effort and time to production for new aggregate tables,
# MAGIC so users don't need to create a separate pipeline for each such case.
# MAGIC
# MAGIC This notebook has been developed to help users create their use case configurations easily.
# MAGIC
# MAGIC There is some mandatory information that must be completed for the use case to work correctly:
# MAGIC
# MAGIC **Use case name:** This parameter must not contain spaces or special characters.
# MAGIC The suggestion is to use lowercase alphanumeric characters separated by underscores.
# MAGIC
# MAGIC **Market:** Related to the job schedule, for example GLOBAL starting at 07 AM UTC.
# MAGIC This ensures complete coverage of the last day for the market.
# MAGIC - GLOBAL - 07AM UTC
# MAGIC
# MAGIC **Reference date:** Reference date of the use case. The parameter should be the column name.
# MAGIC The selected column should have the date/datetime format.
# MAGIC
# MAGIC **To date:** This parameter is used in the template; by default its value must be "to_date".
# MAGIC You can change it if you have handled this in your SQL files.
# MAGIC The values stored in this column depend on the use case behaviour:
# MAGIC - If snapshots are enabled, it will contain the snapshot end day.
# MAGIC - If snapshots are not enabled, it will contain the last day of the cadence.
# MAGIC The snapshot behaviour is set in the reconciliation step.
# MAGIC
# MAGIC **How many dimensions?** An integer input of the number of dimensions (columns) expected in the use case.
# MAGIC Do not consider the reference date or metrics here, as they have their own parameters.
# MAGIC
# MAGIC **Time Offset:** The time zone offset that you want to apply to the reference date column.
# MAGIC It should be the number of hours to add to or subtract from the date (e.g., -8 or 8). The default value is zero,
# MAGIC which means that no time zone transformation will be applied to the date.
# MAGIC
# MAGIC **Week start:** The start of the business week of the use case. Two options are available: SUNDAY or MONDAY.
# MAGIC
# MAGIC **Is Active:** Flag to make the use case active or not. Default value is "Y".
# MAGIC
# MAGIC **How many views?** Defines how many consumption views you want to have for the use case.
# MAGIC You can have as many as you want. However, they will all have exactly the same structure
# MAGIC (metrics, columns, timelines, etc.); the only difference will be the filter applied to them.
# MAGIC The default value is 1.
# MAGIC
# MAGIC **Complexity:** Defines the complexity of your use case. You should mainly consider the volume of data.
# MAGIC This parameter directly affects the number of workers that will be spun up to execute the use case:
# MAGIC - High (10 workers)
# MAGIC - Medium (6 workers)
# MAGIC - Low (4 workers)
# MAGIC
# MAGIC **SQL File Names:** Names of the SQL files used in the use case.
# MAGIC You can combine different layers of dependencies between them as shown in the example below,
# MAGIC where the "2_combined.sql" file depends on the "1_product_category.sql" file.
# MAGIC The file names should follow the pattern x_file_name (where x is an integer digit) and be separated by commas
# MAGIC (e.g.: 1_first_query.sql, 2_second_query.sql).
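# MAGIC As an illustration of a valid value combining two dependency layers (hypothetical file names,
# MAGIC matching the example mentioned above): `1_product_category.sql, 2_combined.sql`.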
# MAGIC
# MAGIC **DEV - Database Schema Name:** Refers to the name of the development environment database where the
# MAGIC "lkp_query_builder" table resides. This parameter is used at the end of the notebook to insert data into
# MAGIC the "lkp_query_builder" table.
# COMMAND ----------
dbutils.widgets.removeAll()
dbutils.widgets.text(name="usecase_name", defaultValue="", label="Use Case Name")
dbutils.widgets.dropdown(
name="market", defaultValue="GLOBAL", label="Market", choices=["APAC", "GLOBAL", "NAM", "NIGHTLY"]
)
dbutils.widgets.text(name="from_date", defaultValue="", label="Reference Date")
dbutils.widgets.text(name="to_date", defaultValue="to_date", label="Snapshot End Date")
dbutils.widgets.text(name="num_dimensions", defaultValue="", label="How many dimensions?")
dbutils.widgets.text(name="time_offset", defaultValue="0", label="Time Offset")
dbutils.widgets.dropdown(name="week_start", defaultValue="MONDAY", label="Week start", choices=["SUNDAY", "MONDAY"])
dbutils.widgets.dropdown(name="is_active", defaultValue="Y", label="Is Active", choices=["Y", "N"])
dbutils.widgets.text(name="num_of_views", defaultValue="1", label="How many views?")
dbutils.widgets.dropdown(
name="complexity", defaultValue="Medium", label="Complexity", choices=["Low", "Medium", "High"]
)
dbutils.widgets.text(name="sql_files", defaultValue="", label="SQL File Names")
dbutils.widgets.text(name="db_schema", defaultValue="", label="DEV - Database Schema Name")
# COMMAND ----------
# MAGIC %md
# MAGIC Set configurations and validate.
# COMMAND ----------
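# The helper object used throughout this notebook. This instantiation is a sketch and assumes the
# QueryBuilderUtils class (from assets/gab/utils/query_builder_utils.py) is already in scope,
# e.g. made available by running that utility notebook beforehand.
QUERY_BUILDER_UTILS = QueryBuilderUtils()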
usecase_name = dbutils.widgets.get("usecase_name").lower().strip()
market = dbutils.widgets.get("market")
from_date = dbutils.widgets.get("from_date")
to_date = dbutils.widgets.get("to_date")
num_dimensions = dbutils.widgets.get("num_dimensions")
time_offset = dbutils.widgets.get("time_offset")
week_start = dbutils.widgets.get("week_start")
is_active = dbutils.widgets.get("is_active")
num_of_views = dbutils.widgets.get("num_of_views")
complexity = dbutils.widgets.get("complexity")
sql_files = dbutils.widgets.get("sql_files").replace(".sql", "")
db_schema = dbutils.widgets.get("db_schema")
num_of_metrics = ""
QUERY_BUILDER_UTILS.check_config_inputs(
usecase_name, from_date, num_dimensions, sql_files, num_of_views, to_date, time_offset, db_schema
)
# COMMAND ----------
# MAGIC %md
# MAGIC Set Dimensions.
# MAGIC
# MAGIC In this step you will have to map the dimension columns with their respective order.
# MAGIC The options available in the widgets to fill are based on the number of dimensions previously defined.
# MAGIC For example, if you have two dimensions to analyze, such as country and category,
# MAGIC values must be set to D1 and D2.
# MAGIC For example:
# MAGIC D1. Dimension name = country
# MAGIC D2. Dimension name = category
# COMMAND ----------
QUERY_BUILDER_UTILS.set_dimensions(num_dimensions)
# COMMAND ----------
dimensions = QUERY_BUILDER_UTILS.get_dimensions(num_dimensions)
# COMMAND ----------
QUERY_BUILDER_UTILS.print_definitions(
usecase_name=usecase_name,
market=market,
from_date=from_date,
to_date=to_date,
dimensions=dimensions,
time_offset=time_offset,
week_start=week_start,
is_active=is_active,
num_of_views=num_of_views,
complexity=complexity,
sql_files=sql_files,
db_schema=db_schema,
)
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # 1 - Configure view(s) name(s) and filter(s)
# COMMAND ----------
# MAGIC %md
# MAGIC The filters defined in this step will be based on the dimensions defined in the previous step.
# MAGIC
# MAGIC So, if you have set the country as D1, the filter here should be D1 = "Germany".
# MAGIC The commands allowed for the filter step are the same as those used in the where clause in SQL language.
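# MAGIC
# MAGIC A minimal sketch of how the view widgets could be filled (hypothetical view name and dimension values):
# MAGIC ```
# MAGIC 1.View Name   = sales_germany
# MAGIC 1.View Filter = D1 = 'Germany' AND D2 IN ('Footwear', 'Apparel')
# MAGIC ```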
# COMMAND ----------
QUERY_BUILDER_UTILS.set_views(num_of_views)
# COMMAND ----------
dims_dict = QUERY_BUILDER_UTILS.get_view_information(num_of_views)
# COMMAND ----------
QUERY_BUILDER_UTILS.print_definitions(
usecase_name=usecase_name,
market=market,
from_date=from_date,
to_date=to_date,
dimensions=dimensions,
time_offset=time_offset,
week_start=week_start,
is_active=is_active,
num_of_views=num_of_views,
complexity=complexity,
sql_files=sql_files,
db_schema=db_schema,
dims_dict=dims_dict,
)
# COMMAND ----------
# MAGIC %md
# MAGIC # 2 - Configure Reconciliation
# COMMAND ----------
# MAGIC %md
# MAGIC The reconciliation configuration (recon) is mandatory.
# MAGIC In this section you will set the cadence, recon and snapshot behaviour of your use case.
# MAGIC
# MAGIC CADENCE - The cadence sets how often the data will be calculated. E.g: DAY, WEEK, MONTH, QUARTER, YEAR.
# MAGIC
# MAGIC RECON - The reconciliation for the cadence set.
# MAGIC
# MAGIC IS SNAPSHOT? - Set yes or no for the combination of cadence and reconciliation.
# MAGIC
# MAGIC Combination examples:
# MAGIC - DAILY CADENCE = DAY - This configuration means that only daily data will be refreshed.
# MAGIC - MONTHLY CADENCE - WEEKLY RECONCILIATION - WITHOUT SNAPSHOT = MONTH-WEEK-N -
# MAGIC This means that after every week, the whole month's data is refreshed without snapshots.
# MAGIC - WEEKLY CADENCE - DAY RECONCILIATION - WITH SNAPSHOT = WEEK-DAY-Y -
# MAGIC This means that every day, the entire week's data (week to date) is refreshed with snapshot.
# MAGIC It will generate a record for each day with the specific position of the value for the week.
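# MAGIC
# MAGIC As a rough sketch of the structure produced by this step (based on the `get_recon_config`
# MAGIC utility further below), selecting `DAY` and `WEEK-DAY-Y` would result in something like:
# MAGIC ```python
# MAGIC {
# MAGIC     "DAY": {},
# MAGIC     "WEEK": {"recon_window": {"DAY": {"snapshot": "Y"}}},
# MAGIC }
# MAGIC ```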
# COMMAND ----------
dbutils.widgets.removeAll()
dbutils.widgets.multiselect(
name="recon_cadence",
defaultValue="DAY",
label="Recon Cadence",
choices=QUERY_BUILDER_UTILS.get_recon_choices(),
)
# COMMAND ----------
recon_list = list(filter(None, dbutils.widgets.get(name="recon_cadence").split(",")))
print(f"List of chosen reconciliation values: {recon_list}")
# COMMAND ----------
recon_dict = QUERY_BUILDER_UTILS.get_recon_config(recon_list)
# COMMAND ----------
QUERY_BUILDER_UTILS.print_definitions(
usecase_name=usecase_name,
market=market,
from_date=from_date,
to_date=to_date,
dimensions=dimensions,
time_offset=time_offset,
week_start=week_start,
is_active=is_active,
num_of_views=num_of_views,
complexity=complexity,
sql_files=sql_files,
db_schema=db_schema,
dims_dict=dims_dict,
recon_dict=recon_dict,
)
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # 3 - Configure METRICS
# COMMAND ----------
# MAGIC %md
# MAGIC Define how many metrics your SQL files contain. For example, if you have a sum(amount) as total_amount
# MAGIC and a count(*) as total_records, you will need to set 2 here.
# MAGIC
# MAGIC The metric columns must be configured in the same order in which they appear in the SQL files.
# MAGIC
# MAGIC For example:
# MAGIC 1. Metric name = total_amount
# MAGIC 2. Metric name = total_records
# COMMAND ----------
dbutils.widgets.removeAll()
dbutils.widgets.text(name="num_of_metrics", defaultValue="1", label="How many metrics?")
# COMMAND ----------
num_of_metrics = dbutils.widgets.get("num_of_metrics")
QUERY_BUILDER_UTILS.set_metric(num_of_metrics)
# COMMAND ----------
# MAGIC %md
# MAGIC Based on the metric setup, it is possible to derive up to 4 new columns from each metric.
# MAGIC Those new columns are based on cadences, such as last_cadence, last_year_cadence, and window functions.
# MAGIC You can also create a derived column, which is a SQL expression you write yourself,
# MAGIC by selecting the "derived_metric" option.
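# MAGIC
# MAGIC As a sketch (hypothetical labels and formula), a metric `total_amount` with a `last_cadence`
# MAGIC column and a derived metric would end up represented roughly as:
# MAGIC ```python
# MAGIC {
# MAGIC     "m1": {
# MAGIC         "metric_name": "total_amount",
# MAGIC         "calculated_metric": {
# MAGIC             "last_cadence": [{"label": "total_amount_last_week", "window": "1"}]
# MAGIC         },
# MAGIC         "derived_metric": [{"label": "avg_ticket", "formula": "total_amount / total_records"}],
# MAGIC     }
# MAGIC }
# MAGIC ```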
# COMMAND ----------
metrics_dict = QUERY_BUILDER_UTILS.get_metric_configuration(num_of_metrics)
# COMMAND ----------
QUERY_BUILDER_UTILS.set_extra_metric_config(num_of_metrics, metrics_dict)
# COMMAND ----------
QUERY_BUILDER_UTILS.print_definitions(
usecase_name=usecase_name,
market=market,
from_date=from_date,
to_date=to_date,
dimensions=dimensions,
time_offset=time_offset,
week_start=week_start,
is_active=is_active,
num_of_views=num_of_views,
complexity=complexity,
sql_files=sql_files,
db_schema=db_schema,
dims_dict=dims_dict,
recon_dict=recon_dict,
metrics_dict=metrics_dict,
)
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # 4 - Configure STAGES
# COMMAND ----------
# MAGIC %md
# MAGIC The parameters available for this step are:
# MAGIC
# MAGIC - Filter Date Column - This column will be used to filter the data of your use case.
# MAGIC This information will be replaced in the placeholder of the GAB template.
# MAGIC - Project Date Column - This column will be used as the reference date for the given query.
# MAGIC This information will be replaced in the placeholder of the GAB template.
# MAGIC - Repartition Value - This parameter only has an effect when used together with the Repartition Type parameter.
# MAGIC It sets how the data is repartitioned while processing.
# MAGIC - Repartition Type - The type of repartitioning applied to the data of the query.
# MAGIC Available values are Key and Number. When Key is used, it expects column names separated by commas.
# MAGIC When Number is set, it expects an integer defining how many partitions the user wants.
# MAGIC - Storage Level - Defines the Spark persistence storage level you want to use
# MAGIC (e.g. Memory Only, Memory and Disk, etc.).
# MAGIC - Table Alias - The alias name of the SQL file that will run.
# MAGIC
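# MAGIC A rough sketch of the stage configuration this step produces for a single SQL file
# MAGIC (hypothetical use case, column and alias names), based on the `get_stages` utility below:
# MAGIC ```python
# MAGIC {
# MAGIC     "1": {
# MAGIC         "file_path": "my_usecase/1_product_category.sql",
# MAGIC         "table_alias": "product_category",
# MAGIC         "storage_level": "MEMORY_ONLY",
# MAGIC         "project_date_column": "order_date",
# MAGIC         "filter_date_column": "order_date",
# MAGIC         "repartition": {"keys": ["product_id"]},
# MAGIC     }
# MAGIC }
# MAGIC ```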
# COMMAND ----------
sql_files_list = QUERY_BUILDER_UTILS.set_stages(sql_files=sql_files)
# COMMAND ----------
# MAGIC %md
# MAGIC According to the number of SQL files provided in the use case, a set of widgets will appear to be configured.
# MAGIC Remember that the configuration index matches the given SQL file order.
# MAGIC
# MAGIC For example, with 1_categories.sql and 2_fact_kpi.sql, settings starting with index "1"
# MAGIC will be applied to the SQL file 1_categories.sql. The same happens with index "2".
# COMMAND ----------
stages_dict = QUERY_BUILDER_UTILS.get_stages(sql_files_list, usecase_name)
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC # INSERT CONFIGURATION DATA
# MAGIC
# MAGIC **Note:** This insert only takes effect on dev/uat. To execute it on prod,
# MAGIC you will need to use the Table/SQL Manager or another job.
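# COMMAND ----------
# Build the DELETE and INSERT statements from the configuration gathered above. This call is a
# sketch mirroring the signature of QueryBuilderUtils.create_sql_statement; it assumes all the
# variables below were populated in the previous steps.
delete_sttmt, insert_sttmt = QUERY_BUILDER_UTILS.create_sql_statement(
    usecase_name=usecase_name,
    market=market,
    stages_dict=stages_dict,
    recon_dict=recon_dict,
    time_offset=time_offset,
    week_start=week_start,
    is_active=is_active,
    complexity=complexity,
    db_schema=db_schema,
    dims_dict=dims_dict,
    dimensions=dimensions,
    from_date=from_date,
    to_date=to_date,
    metrics_dict=metrics_dict,
)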
# COMMAND ----------
QUERY_BUILDER_UTILS.insert_data_into_lkp_query_builder(delete_sttmt, insert_sttmt)
================================================
FILE: assets/gab/utils/databricks_job_utils.py
================================================
# Databricks notebook source
# imports
import enum
from typing import Tuple
from uuid import UUID
import requests
# COMMAND ----------
class BearerAuth:
"""Create authorisation object to be used in the requests header."""
def __init__(self, token):
"""Create auth object with personal access token."""
self.token = token
def __call__(self, r):
"""Add bearer token to header.
This function is internally called by get or post method of requests.
"""
r.headers["authorization"] = "Bearer " + self.token
return r
class ResultState(str, enum.Enum):
"""Possible values for result state of a job run."""
SUCCESS = "SUCCESS"
CANCELED = "CANCELED"
FAILED = "FAILED"
SKIPPED = "SKIPPED"
class DatabricksJobs:
"""Class with methods to execute databricks jobs API commands.
Refer documentation for details: https://docs.databricks.com/dev-tools/api/latest/jobs.html#.
"""
# api endpoints
RUN_NOW = "/2.1/jobs/run-now"
GET_OUTPUT = "/2.1/jobs/runs/get-output"
GET_JOB = "/2.1/jobs/runs/get"
GET_LIST_JOBS = "/2.1/jobs/list"
CANCEL_JOB = "/2.1/jobs/runs/cancel"
headers = {"Content-type": "application/json"}
def __init__(self, databricks_instance: str, auth: str):
"""
Construct a databricks jobs object using databricks instance and api token.
Parameters:
databricks_instance: domain name of databricks deployment. Use the form .cloud.databricks.com
auth: personal access token
"""
self.databricks_instance = databricks_instance
self.auth = BearerAuth(auth)
@staticmethod
def _check_response(response):
if response.status_code != 200:
raise Exception(f"Response Code: {response.status_code} \n {response.content}")
def list_jobs(self, name: str = None, limit: int = 20, offset: int = 0, expand_tasks: bool = False) -> dict:
"""
List the databricks jobs corresponding to given `name`.
for details refer API documentation:
https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsList
Parameters:
name: optional, to filter jobs as per name (case-insensitive)
limit: optional, The number of jobs to return, valid range 0 to 25.
offset: The offset of the first job to return, relative to the most recently created job
expand_tasks: Whether to include task and cluster details in the response.
Returns:
A dictionary of job ids matching the name (if provided) else returns in chunks
"""
params = {"limit": limit, "offset": offset, "expand_tasks": expand_tasks}
if name:
params.update({"name": name})
response = requests.get(
f"https://{self.databricks_instance}/api{self.GET_LIST_JOBS}",
params=params,
headers=self.headers,
auth=self.auth,
)
self._check_response(response) # Raises exception if not successful
return response.json()
def run_now(self, job_id: int, notebook_params: dict, idempotency_token: UUID = None) -> dict:
"""
Trigger the job specified by the job id.
Note: currently it expects notebook tasks in a job, but can be extended for other tasks
Parameters:
job_id: databricks job identifier
notebook_params: key value pairs of the parameter name and its value to be passed to the job
idempotency_token: An optional token to guarantee the idempotency of job run requests,
it should have at most 64 characters
Returns:
A dictionary consisting of run_id and number_in_job
"""
data = {"job_id": job_id, "notebook_params": notebook_params}
if idempotency_token:
data.update({"idempotency_token": str(idempotency_token)})
response = requests.post(
f"https://{self.databricks_instance}/api{self.RUN_NOW}",
json=data,
headers=self.headers,
auth=self.auth,
)
self._check_response(response) # Raises exception if not successful
return response.json()
def get_output(self, run_id: int) -> dict:
"""
Fetch the single job run output and metadata for a single task.
Reference: https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsGetOutput
Parameters:
run_id: identifier for the job run
Returns:
A dictionary containing the output and metadata from task
"""
params = {}
if run_id:
params.update({"run_id": run_id})
response = requests.get(
f"https://{self.databricks_instance}/api{self.GET_OUTPUT}",
params=params,
headers=self.headers,
auth=self.auth,
)
self._check_response(response) # Raises exception if not successful
return response.json()
def get_job(self, run_id: int) -> dict:
"""
Retrieve the metadata of a job run identified by run_id.
Parameters:
run_id: identifier for the job run
Returns:
A dictionary containing the metadata of a job
"""
params = {}
if run_id:
params.update({"run_id": run_id})
response = requests.get(
f"https://{self.databricks_instance}/api{self.GET_JOB}", params=params, headers=self.headers, auth=self.auth
)
self._check_response(response) # Raises exception if not successful
return response.json()
def cancel_job(self, run_id: int) -> dict:
"""
Cancel job specified by run_id.
Parameters:
run_id: job run identifier
Returns:
Response received from endpoint
"""
response = requests.post(
f"https://{self.databricks_instance}/api{self.CANCEL_JOB}",
json={"run_id": run_id},
headers=self.headers,
auth=self.auth,
)
self._check_response(response) # Raises exception if not successful
return response.json()
def trigger_job_by_name(self, job_name: str, notebook_params: dict, idempotency_token: UUID = None) -> dict:
"""
Triggers a job as specified by the job name, if found.
Parameters:
job_name: name of the job
notebook_params: key value pairs of the parameter name and its value to be passed to the job
idempotency_token: Optional token to guarantee the idempotency of job run requests, 64 characters max
Returns:
A dictionary consisting of run_id and number_in_job
"""
result = self.list_jobs(name=job_name)
if result.get("jobs") is None:
raise Exception(f"job with name {job_name} not found.")
return self.run_now(int(result.get("jobs")[0].get("job_id")), notebook_params, idempotency_token)
def get_job_status(self, run_id: int) -> Tuple[bool, dict]:
"""
Fetch the status of the job run id.
Parameters:
run_id: identifier for the job run
Returns:
Tuple bool and dict containing whether the job run has succeeded and its state
"""
state = self.get_job(run_id)["state"]
result_state = state.get("result_state") or state.get("life_cycle_state")
return result_state == ResultState.SUCCESS, state
def job_id_extraction(self, job_name: str) -> int:
"""Extract the job id from the job run.
Args:
job_name: Job name.
Returns:
Job ID number.
"""
jobs_list = self.list_jobs(name=job_name)
if jobs_list.get("jobs") is None:
raise Exception("No jobs found.")
return int(jobs_list.get("jobs")[0].get("job_id"))
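# COMMAND ----------
# Minimal usage sketch for the class above (the workspace URL, token and job name are
# hypothetical placeholders, not values defined in this repository):
#
#   jobs_client = DatabricksJobs("myworkspace.cloud.databricks.com", "<personal-access-token>")
#   run = jobs_client.trigger_job_by_name("gab_job", notebook_params={"usecase": "demo"})
#   succeeded, state = jobs_client.get_job_status(run["run_id"])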
================================================
FILE: assets/gab/utils/query_builder_utils.py
================================================
# Databricks notebook source
import json
import re
from databricks.sdk.runtime import *
class QueryBuilderUtils:
"""Class with methods to create GAB use case configuration."""
def __init__(self):
"""Instantiate objects of the class QueryBuilderUtils."""
self.regex_no_special_characters = "^[a-zA-Z0-9]+(_[a-zA-Z0-9]+)*$"
self.cadences = ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"]
def check_config_inputs(
self,
usecase_name: str,
from_date: str,
num_dimensions: str,
sql_files: str,
num_of_views: str,
to_date: str,
time_offset: str,
db_schema: str
) -> str:
"""
Check the parameters input.
Args:
usecase_name: The use case name.
from_date: The reference date of the use case.
num_dimensions: The number of dimensions chosen for analysis.
sql_files: Name of the SQL files that will be submitted for the framework
to process (e.g. file1.sql, file2.sql).
num_of_views: Number of views the use case has.
to_date: The end date of the snapshot configuration.
time_offset: Hours related to the timezone (e.g. 8, -8).
db_schema: Database name where lkp_query_builder is located.
Returns:
A message with the status of the validation.
"""
message = ""
if (
usecase_name.strip() == ""
or from_date.strip() == ""
or num_dimensions.strip() == ""
or sql_files.strip() == ""
or num_of_views.strip() == ""
or to_date.strip() == ""
or db_schema.strip() == ""
):
message = "WRONG CONFIGURATION:"
if usecase_name.strip() == "":
message += "\n\t - Please, add the Use Case Name."
if from_date.strip() == "":
message += "\n\t - Please, add the From Date."
if num_dimensions.strip() == "":
message += "\n\t - Please, add the Number of Dimensions."
if sql_files.strip() == "":
message += "\n\t - Please, add the SQL File Names."
if num_of_views.strip() == "":
message += "\n\t - Please, add the number of views."
if to_date.strip() == "":
message += "\n\t - Please, add the to date value. This information is mandatory. "
message += "Keep it as 'to_date' unless you change its name in your SQL files."
if db_schema.strip() == "":
message += "\n\t - Please, add the database schema where the lkp_query_builder table is located."
if time_offset.strip():
try:
int(re.findall(r"-?\d+\.?\d*", time_offset.strip())[0])
except Exception:
if message:
message += "\n\t The timezone offset must be a number (e.g. 0, 12 or -8)."
else:
message = "WRONG CONFIGURATION:"
message += "\n\t - The timezone offset must be a number (e.g. 0, 12 or -8)."
if num_dimensions.strip():
try:
int(num_dimensions)
if int(num_dimensions) == 0:
message = "WRONG CONFIGURATION:"
message += "\n\t - The number of dimensions must be greater than zero."
except Exception:
if message:
message += "\n\t - The number of dimensions must be an integer."
else:
message = "WRONG CONFIGURATION:"
message += "\n\t - The number of dimensions must be an integer."
if sql_files.strip():
files_list = self._sort_files(sql_files)
for file in files_list:
sql_files_err = f"""\n\t - Check the SQL file name '{file}'. """
sql_files_err += "It must follow the pattern x_file_name (X is an integer digit)." ""
try:
int(re.match("(.*?)_", file).group()[:-1])
except Exception:
if message:
message += sql_files_err
else:
message = "WRONG CONFIGURATION:"
message += sql_files_err
if not message:
message = "Validation status: OK"
return print(message)
def create_sql_statement(
self,
usecase_name: str,
market: str,
stages_dict: dict,
recon_dict: dict,
time_offset: str,
week_start: str,
is_active: str,
complexity: str,
db_schema: str,
dims_dict: dict,
dimensions: str,
from_date: str,
to_date: str,
metrics_dict: dict,
) -> tuple[str, str]:
"""
Create the SQL statement to insert data into lkp_query_builder_table.
Args:
usecase_name: The name of the use case.
market: The market used for the use case (APAC, GLOBAL, NAM, NIGHTLY).
stages_dict: A dictionary of stages and their configurations.
recon_dict: A dictionary of the reconciliation setup.
time_offset: Hours related to the timezone (e.g. 8, -8).
week_start: Day of the start of the week (e.g. Sunday, Monday).
is_active: Whether the use case is active or not (e.g. Y, N).
complexity: The categories are directly related to the number of workers in each cluster.
That is, High = 10 workers, Medium = 6 workers and Low = 4 workers.
db_schema: Database name where lkp_query_builder is located.
dims_dict: The dictionary of views and their setup.
dimensions: Store supporting information to the fact table.
from_date: Aggregating date column for the use case.
to_date: Contains the current date (default value is to_date).
Information used as template for the framework.
metrics_dict: The dictionary of metrics and their setup.
Returns:
A tuple with a text formatted with the delete and insert statement.
"""
dbutils.widgets.removeAll()
mapping_dict = self._get_mapping(dims_dict, dimensions, from_date, to_date, metrics_dict)
query_id = self._generate_query_id(usecase_name)
query_label = f"'{usecase_name}'"
query_type = f"'{market}'"
mapping_str = json.dumps(mapping_dict, indent=4)
mappings = '"""' + mapping_str.replace('"', "'").replace("#+#-#", '\\"') + '"""'
steps_str = json.dumps(stages_dict, indent=4)
intermediate_stages = '"""' + steps_str.replace('"', "'") + '"""'
recon_str = json.dumps(recon_dict)
recon_window = '"""' + recon_str.replace('"', "'") + '"""'
col_time_offset = f"'{time_offset}'"
start_of_week = f"'{week_start}'"
col_is_active = f"'{is_active}'"
queue = f"'{complexity}'"
delete_sttmt = f"""DELETE FROM {db_schema}.lkp_query_builder WHERE QUERY_LABEL = {query_label};"""
insert_sttmt = f"""INSERT INTO {db_schema}.lkp_query_builder VALUES (
{query_id},
{query_label},
{query_type},
{mappings},
{intermediate_stages},
{recon_window},
{col_time_offset},
{start_of_week},
{col_is_active},
{queue},
current_timestamp());"""
return delete_sttmt, insert_sttmt
def get_dimensions(self, num_dimensions: str) -> str:
"""
Get the dimensions set on the widgets and validate.
Args:
num_dimensions: The number of dimensions set.
Returns:
A string with comma-separated dimensions names.
"""
dimensions = ""
list_status = []
for i in range(int(num_dimensions)):
i = i + 1
if re.match(self.regex_no_special_characters, dbutils.widgets.get(f"D{i}").strip()):
dimensions += "," + dbutils.widgets.get(f"D{i}").strip()
list_status.append("success")
else:
print("WRONG CONFIGURATION:")
print(f"\t- {dbutils.widgets.get(f'D{i}')} is empty of malformed!")
print(
"\t Names can contain only alphanumeric characters and must begin with "
"an alphabetic character or an underscore (_)."
)
list_status.append("fail")
if "fail" not in list_status:
print("Dimensions validation status: OK")
return dimensions[1:]
@classmethod
def get_recon_choices(cls) -> list:
"""
Return all possible combinations for cadences, reconciliations and the snapshot flag value (Y,N).
Returns:
List used to generate a multiselect widget for the users to interact with.
"""
return [
"DAY",
"DAY-WEEK-N",
"DAY-MONTH-N",
"DAY-QUARTER-N",
"DAY-YEAR-N",
"WEEK",
"WEEK-DAY-N",
"WEEK-DAY-Y",
"WEEK-MONTH-N",
"WEEK-QUARTER-N",
"WEEK-YEAR-N",
"MONTH",
"MONTH-DAY-N",
"MONTH-DAY-Y",
"MONTH-WEEK-Y",
"MONTH-WEEK-N",
"MONTH-QUARTER-N",
"MONTH-YEAR-N",
"QUARTER",
"QUARTER-DAY-N",
"QUARTER-DAY-Y",
"QUARTER-WEEK-N",
"QUARTER-WEEK-Y",
"QUARTER-MONTH-N",
"QUARTER-MONTH-Y",
"QUARTER-YEAR-N",
"YEAR",
"YEAR-DAY-N",
"YEAR-DAY-Y",
"YEAR-WEEK-N",
"YEAR-WEEK-Y",
"YEAR-MONTH-N",
"YEAR-MONTH-Y",
"YEAR-QUARTER-N",
"YEAR-QUARTER-Y",
]
@classmethod
def get_metric_configuration(cls, num_of_metrics: str) -> dict:
"""
Get metrics information based on the widget setup.
Args:
num_of_metrics: Number of metrics selected.
Returns:
metrics_dict: The dictionary of metrics and their setup.
"""
metrics_dict = {}
for i in range(int(num_of_metrics)):
i = i + 1
if dbutils.widgets.get(f"metric_name{i}"):
metrics_dict[f"m{i}"] = {
"metric_name": dbutils.widgets.get(f"metric_name{i}"),
"calculated_metric": {},
"derived_metric": {},
}
calculated_metric_list = list(filter(None, dbutils.widgets.get(f"calculated_metric{i}").split(",")))
for calc_metric in calculated_metric_list:
if calc_metric == "last_cadence":
metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}})
# add label and window for last_cadence
dbutils.widgets.text(
name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label"
)
dbutils.widgets.text(
name=f"{i}_{calc_metric}_window", defaultValue="", label=f"{i}_{calc_metric}.Window"
)
if calc_metric == "last_year_cadence":
metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}})
# add label and window for last_cadence
dbutils.widgets.text(
name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label"
)
if calc_metric == "window_function":
metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}})
# add label and window for window_function
dbutils.widgets.text(
name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label"
)
dbutils.widgets.text(
name=f"{i}_{calc_metric}_window",
defaultValue="",
label=f"{i}_{calc_metric}.Window Interval",
)
dbutils.widgets.dropdown(
name=f"{i}_{calc_metric}_agg_func",
defaultValue="sum",
label=f"{i}_{calc_metric}.Agg Func",
choices=["sum", "avg", "max", "min", "count"],
)
# add label and window for derived_metric
if calc_metric == "derived_metric":
dbutils.widgets.text(
name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label"
)
dbutils.widgets.text(
name=f"{i}_{calc_metric}_formula", defaultValue="", label=f"{i}_{calc_metric}.Formula"
)
print("Metric configuration status: OK")
else:
print("WRONG CONFIGURATION:")
print("\t- The metric name is mandatory!")
return metrics_dict
def get_recon_config(self, recon_list: list) -> dict:
"""
Get reconciliation information based on the widget setup.
Args:
recon_list: List of cadences setup for the reconciliation.
Returns:
A dictionary of reconciliation setup.
"""
cadence_list = []
# create a list with the distinct cadences values.
for cadence in recon_list:
cadence_name = cadence.split("-")[0]
cadence_list.append(cadence_name)
cadence_list = list(dict.fromkeys(cadence_list))
# create a dict with the structure of each cadence.
recon_dict = {}
for cad in cadence_list:
recon_dict[f"{cad}"] = {}
recon_dict[f"{cad}"]["recon_window"] = {}
# updates the dict of each cadence with the recon configurations selected.
for cadence in recon_list:
if cadence in self.cadences:
recon_dict[f"{cad}"]["recon_window"] = {}
else:
cadence_name = cadence.split("-")[0]
recon = cadence.split("-")[1]
snapshot = cadence.split("-")[2]
for cad in cadence_list:
if cadence_name == cad:
recon_dict[cad]["recon_window"].update({recon: {"snapshot": snapshot}})
# remove empty recon_window when only the cadence was selected.
for cadence in recon_list:
if cadence in ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"]:
if recon_dict[f"{cadence}"]["recon_window"] == {}:
del recon_dict[f"{cadence}"]["recon_window"]
if recon_dict:
print("Reconciliation configuration status: OK")
else:
print("WRONG CONFIGURATION:")
print("\t- The recon information is mandatory!")
return recon_dict
def get_stages(self, sql_files_list: list, usecase_name: str) -> dict:
"""
Set stages based on the widget setup.
Args:
sql_files_list: A list of sql files and their setup.
usecase_name: The use case name.
Returns:
stages_dict: A dictionary of stages and their setup.
"""
stages_dict = {}
i = 0
list_status = []
for file in sql_files_list:
i = i + 1
if dbutils.widgets.get(name=f"{i}_script_table_alias"):
stages_dict[f"{i}"] = {
"file_path": usecase_name + "/" + file.strip() + ".sql",
"table_alias": dbutils.widgets.get(name=f"{i}_script_table_alias"),
"storage_level": dbutils.widgets.get(name=f"{i}_script_storage_level"),
"project_date_column": dbutils.widgets.get(name=f"{i}_script_project_dt_col"),
"filter_date_column": dbutils.widgets.get(name=f"{i}_script_filter_dt_col"),
}
repartition_value = self._format_keys_list(dbutils.widgets.get(name=f"{i}_script_repartition_value"))
stages_dict[f"{i}"]["repartition"] = {}
if dbutils.widgets.get(name=f"{i}_script_repartition_type") == "NUMBER":
try:
int(dbutils.widgets.get(name=f"{i}_script_repartition_value").split(",")[0])
stages_dict[f"{i}"]["repartition"] = {
"numPartitions": dbutils.widgets.get(name=f"{i}_script_repartition_value")
.split(",")[0]
.replace("'", "")
}
except Exception:
print("The repartition value must be INTEGER when the type is defined as NUMBER.")
list_status.append("fail")
elif dbutils.widgets.get(name=f"{i}_script_repartition_type") == "KEY":
stages_dict[f"{i}"]["repartition"] = {"keys": repartition_value}
else:
print(f"The field script alias is missing for {i}.Script Table Alias. This field is mandatory!")
stages_dict = {}
list_status.append("fail")
if "fail" not in list_status:
print("Stages configuration status: OK")
return stages_dict
def get_view_information(self, num_of_views: str) -> dict:
"""
Get the views information based on the widget setup.
Args:
num_of_views: Number of views selected.
Returns:
The dictionary of views and their setup.
"""
dims_dict = {}
for i in range(int(num_of_views)):
i = i + 1
if re.match(self.regex_no_special_characters, dbutils.widgets.get(f"view_name{i}")):
dims_dict[f"view_name{i}"] = {
"name": dbutils.widgets.get(f"view_name{i}"),
"filter": dbutils.widgets.get(f"view_filter{i}").replace("'", "#+#-#").replace('"', "#+#-#"),
}
print("Views validation status: OK")
else:
print("WRONG CONFIGURATION:")
print("\t- View name is empty of malformed!")
print(
"\t Names can contain only alphanumeric characters and must begin with "
"an alphabetic character or an underscore (_)."
)
return dims_dict
@classmethod
def insert_data_into_lkp_query_builder(cls, delete_sttmt: str, insert_sttmt: str):
"""
Insert data into the lkp query builder table.
Args:
delete_sttmt: The delete statement.
insert_sttmt: The insert statement.
"""
try:
spark.sql(f"{delete_sttmt}")
spark.sql(f"{insert_sttmt}")
print("CONFIGURATION INSERTED SUCCESSFULLY!")
except Exception as e:
print(e)
def print_definitions(
self,
usecase_name,
market,
from_date,
to_date,
dimensions,
time_offset,
week_start,
is_active,
num_of_views,
complexity,
sql_files,
db_schema,
dims_dict: dict = None,
recon_dict: dict = None,
metrics_dict: dict = None,
stages_dict: dict = None,
):
"""
Print the definitions set on widgets.
Args:
usecase_name: The name of the use case.
market: The market used for the use case (APAC, GLOBAL, NAM, NIGHTLY).
from_date: Aggregating date column for the use case.
to_date: Contains the current date (default value is to_date).
Information used as template for the framework.
dimensions: Store supporting information to the fact table.
time_offset: Hours related to the timezone (e.g. 8, -8).
week_start: Day of the start of the week (e.g. Sunday, Monday).
is_active: Whether the use case is active or not (e.g. Y, N).
num_of_views: Number of views desired for the use case (e.g. 1, 2, 3).
complexity: The categories are directly related to the number of workers in each cluster.
That is, High = 10 workers, Medium = 6 workers and Low = 4 workers.
sql_files: Name of the SQL files that will be submitted for the framework
to process (e.g. file1.sql, file2.sql).
db_schema: Database name where lkp_query_builder is located.
dims_dict: A dictionary of dimensions.
recon_dict: A dictionary of reconciliation setup.
metrics_dict: The dictionary of metrics and their setup.
stages_dict: A dictionary of stages and their setup.
"""
print("USE CASE DEFINITIONS:")
print("Use Case Name:", usecase_name)
print("Market:", market)
print("From Date:", from_date)
print("To Date:", to_date)
print("Dimensions:", dimensions)
print("Time Offset:", time_offset)
print("Week Start:", week_start)
print("Is Active:", is_active)
print("How many views?", num_of_views)
print("Complexity:", complexity)
print("SQL Files:", sql_files)
print("Database Schema Name:", db_schema)
self._print_dims_dict(dims_dict)
self._print_recon_dict(recon_dict)
if metrics_dict:
print("METRICS CONFIGURED:")
for key_metrics in metrics_dict:
self._print_metrics_dict(key_metrics, metrics_dict)
self._print_stages_dict(stages_dict)
@classmethod
def set_dimensions(cls, num_dimensions: str):
"""
Set the dimension mappings based on the widget setup.
Args:
num_dimensions: Number of dimensions selected.
"""
dbutils.widgets.removeAll()
for i in range(int(num_dimensions)):
i = i + 1
dbutils.widgets.text(name=f"D{i}", defaultValue="", label=f"D{i}.Dimension Name")
print("Please, configure the dimensions using the widgets and proceed to the next cmd.")
def set_extra_metric_config(self, num_of_metrics: str, metrics_dict: dict):
"""
Set extra metrics information based on the widget setup.
Args:
num_of_metrics: Number of metrics selected.
metrics_dict: The dictionary of metrics and their setup.
"""
for i in range(int(num_of_metrics)):
i = i + 1
calculated_metric_list = list(filter(None, dbutils.widgets.get(f"calculated_metric{i}").split(",")))
if calculated_metric_list:
for calc_metric in calculated_metric_list:
self._validate_metrics_config(calc_metric, metrics_dict, i)
else:
print("Extra metrics configuration status: OK")
@classmethod
def set_metric(cls, num_of_metrics: str):
"""
Set metrics information based on the widget setup.
Args:
num_of_metrics: Number of metrics selected.
"""
dbutils.widgets.removeAll()
for i in range(1, int(num_of_metrics) + 1):
dbutils.widgets.text(name=f"metric_name{i}", defaultValue="", label=f"{i}.Metric Name")
dbutils.widgets.multiselect(
name=f"calculated_metric{i}",
defaultValue="",
label=f"{i}.Calculated Metric",
choices=["", "last_cadence", "last_year_cadence", "window_function", "derived_metric"],
)
print("Please, configure the metrics using the widgets and proceed to the next cmd.")
def set_stages(self, sql_files: list) -> list:
"""
Set stages based on the widget setup.
Args:
sql_files: The SQL file names that will be used in the use case.
Returns:
sql_files_list: A list of sql files and their setup.
"""
dbutils.widgets.removeAll()
sql_files_list = self._sort_files(sql_files)
for i in range(1, len(sql_files_list) + 1):
dbutils.widgets.dropdown(
name=f"{i}_script_storage_level",
defaultValue="MEMORY_ONLY",
label=f"{i}.Storage Level",
choices=[
"DISK_ONLY",
"DISK_ONLY_2",
"DISK_ONLY_3",
"MEMORY_AND_DISK",
"MEMORY_AND_DISK_2",
"MEMORY_AND_DISK_DESER",
"MEMORY_ONLY",
"MEMORY_ONLY_2",
"OFF_HEAP",
],
)
dbutils.widgets.text(name=f"{i}_script_table_alias", defaultValue="", label=f"{i}.Table Alias")
dbutils.widgets.text(name=f"{i}_script_project_dt_col", defaultValue="", label=f"{i}.Project Date Column")
dbutils.widgets.text(name=f"{i}_script_filter_dt_col", defaultValue="", label=f"{i}.Filter Date Column")
dbutils.widgets.dropdown(
name=f"{i}_script_repartition_type",
defaultValue="",
label=f"{i}.Repartition Type",
choices=["", "KEY", "NUMBER"],
)
dbutils.widgets.text(name=f"{i}_script_repartition_value", defaultValue="", label=f"{i}.Repartition Value")
print("Please, configure the stages using the widgets and proceed to the next cmd.")
return sql_files_list
@classmethod
def set_views(cls, num_of_views: str):
"""
Set views that will be used in the use case.
Args:
num_of_views: Number of views selected.
"""
dbutils.widgets.removeAll()
for i in range(1, int(num_of_views) + 1):
dbutils.widgets.text(name=f"view_name{i}", defaultValue="", label=f"{i}.View Name")
dbutils.widgets.text(name=f"view_filter{i}", defaultValue="", label=f"{i}.View Filter")
print("Please, configure the views using the widgets and proceed to the next cmd.")
@classmethod
def _format_keys_list(cls, key_str: str) -> list:
"""
Format the list of keys based on the widget keys data provided.
Args:
key_str: Input text with key column names.
Returns:
A formatted list with the keys selected for repartitioning.
"""
key_list = key_str.strip().split(",")
output_list = []
for key in key_list:
output_list.append(key.replace("'", "").replace('"', "").strip())
return output_list
@classmethod
def _generate_query_id(cls, usecase_name: str) -> int:
"""
Generate the query id for the lookup query builder table.
The logic to create the ID is a hash of the use case name converted to an integer.
Args:
usecase_name: The name of use case.
Returns:
The use case name hashed.
"""
hash_val = int(str(hash(usecase_name))[0:9])
return hash_val if hash_val > 0 else hash_val * -1
@classmethod
def _get_mapping(cls, dims_dict: dict, dimensions: str, from_date: str, to_date: str, metrics_dict: dict) -> dict:
"""
Get mappings based on the dimensions defined on the widget setup.
Args:
dims_dict: A dictionary of dimensions.
dimensions: Store supporting information to the fact table.
from_date: Aggregating date column for the use case.
to_date: Contains the current date (default value is to_date).
Information used as template for the framework.
metrics_dict: The dictionary of metrics and their setup.
Returns:
mapping_dict: A dictionary of mappings configuration.
"""
mapping_dict = {}
for key in dims_dict:
mapping_dict.update({dims_dict[key]["name"]: {"dimensions": {}, "metric": {}, "filter": {}}})
i = 0
for d in dimensions.split(","):
i = i + 1
mapping_dict[dims_dict[key]["name"]]["dimensions"].update(
{"from_date": from_date, "to_date": to_date, f"d{i}": d.strip()}
)
mapping_dict[dims_dict[key]["name"]]["metric"].update(metrics_dict)
if dims_dict[key]["filter"]:
mapping_dict[dims_dict[key]["name"]]["filter"] = dims_dict[key]["filter"]
return mapping_dict
@classmethod
def _print_dims_dict(cls, dims_dict: dict):
"""
Print the dictionary of dimensions and views formatted.
Args:
dims_dict: The dictionary of views and their setup.
"""
if dims_dict:
print("VIEWS CONFIGURED:")
for key in dims_dict:
print(f"{key}:")
keys = [k for k, v in dims_dict[key].items()]
for k in keys:
print(f"\t{k}:", dims_dict[key][k].replace("#+#-#", '"'))
@classmethod
def _print_derived_metrics(cls, key_metrics: str, derived_metric: str, metrics_dict: dict):
"""
Print the derived dict formatted.
Args:
key_metrics: The key name of each metric configured (e.g. m1, m2, m3).
derived_metric: The name of the derived metric configuration (e.g. last_cadence, last_year_cadence,
derived_metric, window_function).
metrics_dict: The dictionary of metrics and their setup.
"""
if derived_metric == "derived_metric":
if metrics_dict[key_metrics][derived_metric]:
print(f"\t- {derived_metric}:")
derived_metric_val_list = [k for k, v in metrics_dict[key_metrics][derived_metric][0].items()]
for derived_metric_val in derived_metric_val_list:
print(
f"\t - {derived_metric_val} = "
f"{metrics_dict[key_metrics][derived_metric][0][derived_metric_val]}"
)
def _print_metrics_dict(self, key_metrics: str, metrics_dict: dict):
"""
Print the metrics configured formatted.
Args:
key_metrics: The key name of each metric configured (e.g. m1, m2, m3).
metrics_dict: The dictionary of metrics and their setup.
"""
print(f"{key_metrics}:")
list_key_metrics = [k for k, v in metrics_dict[key_metrics].items()]
if list_key_metrics:
for metric in list_key_metrics:
if metric == "metric_name":
print(f" {metric} = {metrics_dict[key_metrics][metric]}")
else:
for derived_metric in metrics_dict[key_metrics][metric]:
if derived_metric in ["last_cadence", "last_year_cadence", "window_function"]:
print(f"\t- {derived_metric}:")
derived_metric_val_list = [
k for k, v in metrics_dict[key_metrics][metric][derived_metric][0].items()
]
for derived_metric_val in derived_metric_val_list:
print(
f"\t - {derived_metric_val} = "
f"{metrics_dict[key_metrics][metric][derived_metric][0][derived_metric_val]}"
)
else:
self._print_derived_metrics(key_metrics, metric, metrics_dict)
@classmethod
def _print_recon_dict(cls, recon_dict: dict):
"""
Print the recon dict formatted.
Args:
recon_dict: A dictionary of reconciliation setup.
"""
if recon_dict:
print("RECON CONFIGURED:")
for key_cadence in recon_dict:
if recon_dict[f"{key_cadence}"] == {}:
print(f"{key_cadence}")
else:
print(f"{key_cadence}:")
keys_recon = [k for k, v in recon_dict[key_cadence].items()]
if keys_recon:
for k_recon in keys_recon:
print(f" {k_recon}:")
keys_recon = [k for k, v in recon_dict[key_cadence][k_recon].items()]
for recon_val in keys_recon:
print(
f"\t- {recon_val}:snapshot = {recon_dict[key_cadence][k_recon][recon_val]['snapshot']}"
)
@classmethod
def _print_stages_dict(cls, stages_dict: dict):
"""
Print the dictionary of stages formatted.
Args:
stages_dict: A dictionary of stages and their setup.
"""
if stages_dict:
print("STEPS CONFIGURED:")
for key_stages in stages_dict:
print(f"step {key_stages}:")
keys_stages = [k for k, v in stages_dict[key_stages].items()]
for k_stages in keys_stages:
if k_stages != "repartition":
print(f" - {k_stages} = {stages_dict[key_stages][k_stages]}")
else:
repartition_stages = [k for k, v in stages_dict[key_stages][k_stages].items()]
for stg in repartition_stages:
print(" - repartition_type:")
print(f"\t {stg} = {stages_dict[key_stages][k_stages][stg]}")
@classmethod
def _sort_files(cls, sql_files: str) -> list:
"""
Create a list sorted alphabetically based on the sql files provided.
Args:
sql_files: Name of the SQL files that will be sent to the framework
to process (e.g. file1.sql, file2.sql).
Returns:
A list of sql files sorted alphabetically.
"""
fileslist = sql_files.split(",")
# remove extra spaces from items in the list
fileslist = [x.strip() for x in fileslist]
for file in range(len(fileslist)):
fileslist[file] = fileslist[file].lower().strip()
# apply bubble sort to sort the words
for n in range(len(fileslist) - 1, 0, -1):
for i in range(n):
if fileslist[i] > fileslist[i + 1]:
# swap data if the element is less than the next element in the array
fileslist[i], fileslist[i + 1] = fileslist[i + 1], fileslist[i]
return fileslist
@classmethod
def _validate_metrics_config(cls, calc_metric: str, metrics_dict: dict, widget_index: int):
"""
Validate the metrics widgets setup.
Args:
calc_metric: Name of the metric calculation set (e.g. last_cadence, last_year_cadence).
metrics_dict: The dictionary of metrics and their setup.
widget_index: Index of the widget selected to be validated.
"""
if calc_metric == "last_cadence":
if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "":
try:
int(dbutils.widgets.get(f"{widget_index}_{calc_metric}_window"))
metrics_dict[f"m{widget_index}"]["calculated_metric"].update(
{
f"{calc_metric}": [
{
"label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"),
"window": dbutils.widgets.get(f"{widget_index}_{calc_metric}_window"),
}
]
}
)
print(f"{calc_metric} configuration status: OK")
except Exception:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print(f"\t- The {calc_metric} window value must be INTEGER.")
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print(f"\t- The {calc_metric} label is mandatory.")
elif calc_metric == "last_year_cadence":
if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "":
metrics_dict[f"m{widget_index}"]["calculated_metric"].update(
{
f"{calc_metric}": [
{
"label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"),
"window": 1,
}
]
}
)
print(f"{calc_metric} configuration status: OK")
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print(f"\t- The {calc_metric} label is mandatory.")
elif calc_metric == "window_function":
if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "":
window_list = dbutils.widgets.get(f"{widget_index}_{calc_metric}_window").split(",")
if len(window_list) > 1:
metrics_dict[f"m{widget_index}"]["calculated_metric"].update(
{
f"{calc_metric}": [
{
"label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"),
"window": [int(x.strip()) for x in window_list],
"agg_func": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_agg_func"),
}
]
}
)
print(f"{calc_metric} configuration status: OK")
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print(
"\t- The window function must follow the pattern of "
"two integer digits separated with comma (e.g. 3,1)."
)
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print("\t- The window_function label is mandatory.")
elif calc_metric == "derived_metric":
if (
dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_label").strip() != ""
and dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_formula").strip() != ""
):
metrics_dict[f"m{widget_index}"].update(
{
f"{calc_metric}": [
{
"label": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_label"),
"formula": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_formula"),
}
]
}
)
print(f"{calc_metric} configuration status: OK")
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print("\t- The derived_metric label and formula are mandatory.")
================================================
FILE: cicd/.bumpversion.cfg
================================================
[bumpversion]
current_version = 2.0.0
commit = False
tag = False
[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"
================================================
FILE: cicd/Dockerfile
================================================
ARG PYTHON_IMAGE=python:3.12-slim-bullseye
FROM $PYTHON_IMAGE
ARG USER_ID=1000
ARG GROUP_ID=1000
ARG CPU_ARCHITECTURE
# Install Prerequisites
RUN mkdir -p /usr/share/man/man1 && \
apt-get -y update && \
apt-get install -y wget=1.21* gnupg2=2.2* git=1:2* g++=4:10.2.1* rsync=3.2* && \
apt-get -y clean
# Install jdk
RUN mkdir -p /etc/apt/keyrings && \
wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | gpg --dearmor | tee /etc/apt/trusted.gpg.d/adoptium.gpg > /dev/null && \
echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list && \
apt-get -y update && \
apt-get -y install temurin-17-jdk && \
apt-get -y clean
ENV JAVA_HOME=/usr/lib/jvm/temurin-17-jdk-${CPU_ARCHITECTURE}
# useradd -l is necessary to avoid docker build hanging in export image phase when using large uids
RUN groupadd -g ${GROUP_ID} appuser && \
useradd -rm -l -u ${USER_ID} -d /home/appuser -s /bin/bash -g appuser appuser
COPY cicd/requirements_full.lock /tmp/requirements.txt
USER appuser
ENV PATH="/home/appuser/.local/bin:$PATH"
RUN python -m pip install --upgrade pip==25.2 setuptools==74.* --user
RUN python -m pip install --user -r /tmp/requirements.txt
RUN mkdir /home/appuser/.ssh/ && touch /home/appuser/.ssh/known_hosts
RUN echo Image built for $CPU_ARCHITECTURE with python image $PYTHON_IMAGE.
================================================
FILE: cicd/Jenkinsfile
================================================
@Library(['GlobalJenkinsLibrary']) _
pipeline {
options {
buildDiscarder(logRotator(numToKeepStr: '30', artifactNumToKeepStr: '30'))
timeout(time: 2, unit: 'HOURS')
disableConcurrentBuilds()
skipDefaultCheckout(true)
ansiColor('xterm')
timestamps()
}
agent {
node {
label 'lakehouse_base'
}
}
environment {
VERSION = env.BRANCH_NAME.replaceAll("[/-]", "_").toLowerCase()
GIT_CREDENTIALS_ID = "git-lakehouse-cicd"
}
stages {
stage('cleanup workspace') {
steps {
cleanWs(disableDeferredWipeout: true, deleteDirs: true)
}
}
stage('Clone') {
steps {
retry(3) {
script {
checkout([
$class : 'GitSCM',
branches : scm.branches,
userRemoteConfigs: [[url: 'https://bitbucket.tools.3stripes.net/scm/lak/lakehouse-engine.git', credentialsId: GIT_CREDENTIALS_ID]]
])
}
}
}
}
stage('Build Image') {
steps {
sh 'make build-image version=$VERSION'
}
}
stage('Create Docs') {
steps {
sh 'make docs version=$VERSION'
}
}
stage('Parallel') {
parallel {
stage('Lint') {
steps {
sh 'make lint version=$VERSION'
}
}
stage('Test Security') {
steps {
sh 'make test-security version=$VERSION'
}
}
stage('Audit Dependency Safety'){
steps{
catchError(message: "${STAGE_NAME} is unstable", buildResult: 'SUCCESS', stageResult: 'UNSTABLE') {
sh 'make audit-dep-safety version=$VERSION'
}
}
}
stage('Test dependencies') {
steps {
sh 'make test-deps version=$VERSION'
}
}
stage('Test') {
steps {
sh 'make test version=$VERSION'
}
}
}
}
stage('Sonar') {
steps {
script {
tools.sonar.run(env: 'COMMUNITY-PRD', version: '1.0', branch: env.BRANCH_NAME)
}
}
}
}
post {
always {
archiveArtifacts artifacts: 'artefacts/docs/**/*'
archiveArtifacts artifacts: 'artefacts/*.json'
junit 'artefacts/tests.xml'
step([$class: 'CoberturaPublisher', coberturaReportFile: 'artefacts/coverage.xml'])
}
}
}
================================================
FILE: cicd/Jenkinsfile_deploy
================================================
pipeline {
parameters {
string(name: 'BRANCH', defaultValue: 'master', description: 'Branch to use for the deployment process.')
string(name: 'VERSION', defaultValue: null, description: 'Version to deploy (git tag in master branch without the "v"). E.g., 0.2.0. If you are deploying to dev, from your branch, ignore this.')
booleanParam(name: 'SKIP_VALIDATIONS', defaultValue: false, description: 'Whether to skip the validations. Only applicable for feature releases to make them faster.')
booleanParam(name: 'SKIP_OS_DEPLOYMENT', defaultValue: false, description: 'Whether to skip the OS Deployment related stages or not.')
booleanParam(name: 'NOTIFY', defaultValue: true, description: 'Whether to notify the release or not.')
}
options {
buildDiscarder(logRotator(numToKeepStr: '100', artifactNumToKeepStr: '30'))
timeout(time: 2, unit: 'HOURS')
disableConcurrentBuilds()
skipDefaultCheckout(true)
ansiColor('xterm')
timestamps()
}
agent {
node {
label 'lakehouse_base'
}
}
environment {
PYPI_CREDENTIALS = credentials('pypi-credentials')
ARTIFACTORY_CREDENTIALS = credentials('artifactory-credentials')
GIT_CREDENTIALS_ID = "git-lakehouse-cicd"
GIT_CREDENTIALS_LAK = credentials('push-to-github-lak')
GIT_CREDENTIALS_LAK_DOCS = credentials('push-to-github-lak-docs')
DEPLOY_VERSION = getDeploymentVersion()
DEPLOY_GIT_OBJECT = getDeploymentGitObject()
}
stages {
stage('cleanup workspace') {
steps {
cleanWs(disableDeferredWipeout: true, deleteDirs: true)
}
}
stage('Clone') {
steps {
retry(3) {
script {
checkout([
$class : 'GitSCM',
branches : [['name': env.DEPLOY_GIT_OBJECT]],
userRemoteConfigs: [[url: 'https://bitbucket.tools.3stripes.net/scm/lak/lakehouse-engine.git', credentialsId: GIT_CREDENTIALS_ID]]
])
}
}
}
}
stage('Build Image') {
steps {
sh 'make build-image version=' + "${env.DEPLOY_VERSION}"
}
}
stage('Parallel') {
when {
expression {
(!params.SKIP_VALIDATIONS && params.BRANCH != 'master')
}
}
parallel {
stage('Lint') {
steps {
sh 'make lint version=' + "${env.DEPLOY_VERSION}"
}
}
stage('Test Security') {
steps {
sh 'make test-security version=' + "${env.DEPLOY_VERSION}"
}
}
stage('Audit Dependency Safety'){
steps{
catchError(message: "${STAGE_NAME} is unstable", buildResult: 'SUCCESS', stageResult: 'UNSTABLE') {
sh 'make audit-dep-safety version=$VERSION'
}
}
}
stage('Test dependencies') {
steps {
sh 'make test-deps version=' + "${env.DEPLOY_VERSION}"
}
}
stage('Test') {
steps {
sh 'make test version=' + "${env.DEPLOY_VERSION}"
}
}
}
}
stage('Deploy') {
steps {
script {
sh 'make deploy version=' + "${env.DEPLOY_VERSION}" + ' artifactory_credentials_file=$ARTIFACTORY_CREDENTIALS'
}
}
}
stage('Open Source Deployment') {
when {
expression {
(params.BRANCH == 'master' && !params.SKIP_OS_DEPLOYMENT)
}
}
stages {
stage('Sync Code with GitHub') {
steps {
script {
sh 'make sync-to-github version=' + "${env.DEPLOY_VERSION}" + ' git_credentials_file=$GIT_CREDENTIALS_LAK repository=lakehouse-engine'
}
}
}
stage('Deploy Docs to Github') {
steps {
script {
sh 'make deploy-docs-to-github version=' + "${env.DEPLOY_VERSION}" + ' git_credentials_file=$GIT_CREDENTIALS_LAK_DOCS repository=lakehouse-engine-docs os_deployment=True'
}
}
}
stage('Deploy to Pypi') {
steps {
script {
// we are forcing make build as it was not happening sometimes, for no reason.
sh 'make build os_deployment=True'
sh 'make deploy-to-pypi-and-clean os_deployment=True version=' + "${env.DEPLOY_VERSION}" + ' pypi_credentials_file=$PYPI_CREDENTIALS'
}
}
}
}
}
stage('Notify') {
when {
expression {
params.BRANCH == 'master' && params.NOTIFY
}
}
steps {
script {
params = readYaml file: 'cicd/meta.yaml'
release_notes = sh(script:'cat CHANGELOG.md | cut -d ")" -f 2 | head -n 10', returnStdout: true).trim()
recipients = params["mail_recipients"].join(";")
emailext(
attachLog: false,
compressLog: true,
body: """
A new version $env.DEPLOY_VERSION of the Lakehouse Engine was deployed into Artifactory.
You can install it just like any other python library, either notebook scoped with pip install or cluster scoped
by specifying the library in the cluster configuration.
You can check the lakehouse-engine documentation here: ${params["engine_docs"]}.
Check the latest updates here:
${release_notes}
For more details, please check the complete changelog and/or the additional resources listed below:
${params["changelog_url"]}
${params["code_url"]}
${params["confluence_url"]}
""",
mimeType: 'text/html',
replyTo: "${params['reply_to']}",
from: "${params['from']}",
to: recipients,
subject: "Lakehouse Engine Updates - $env.DEPLOY_VERSION"
)
}
}
}
}
}
/**
* Get deployment git object (branch name or tag reference) given certain Jenkins parameters and the team's deployment guidelines.
* @return git object (branch or tag)
*/
def String getDeploymentGitObject() {
gitObject = params.BRANCH
if (params.BRANCH == 'master') {
if (params.VERSION ==~ '[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}') {
// force the git object to checkout to be a version tag
gitObject = "refs/tags/v${params.VERSION}"
return gitObject
}
else {
            throw new Exception("Version ${params.VERSION} does not match a valid git version tag. It should be in the form of ...")
}
} else {
return gitObject
}
}
/**
* Get deployment version given certain Jenkins parameters and the team's deployment guidelines.
* @return deployment version
*/
def String getDeploymentVersion() {
version = params.VERSION
if (params.BRANCH == 'master') {
if (version ==~ '[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}') {
return version
}
else {
            throw new Exception("Version ${version} does not match a valid git version tag. It should be in the form of ...")
}
} else {
// force branch as the version to be deployed when we are dealing with feature branches.
return params.BRANCH.replaceAll("[/-]", "_").toLowerCase()
}
}
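The two helper functions above encode the team's deployment convention: on master, the VERSION parameter must be a plain semantic version and is checked out through its corresponding version tag, while feature branches are deployed under a sanitized, lowercased branch name. Below is a minimal sketch of the same decision logic, re-expressed in Python purely for illustration (the function names are made up; the pipeline itself only uses the Groovy functions above):

import re

# Same semantic-version pattern used by the Jenkinsfile above (up to 3 digits per segment).
VERSION_PATTERN = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}$")

def deployment_version(branch: str, version: str) -> str:
    # On master the VERSION parameter must be a plain semantic version.
    if branch == "master":
        if VERSION_PATTERN.match(version):
            return version
        raise ValueError(f"Version {version} does not match a valid git version tag.")
    # Feature branches are deployed under a sanitized, lowercased branch name.
    return re.sub(r"[/-]", "_", branch).lower()

def deployment_git_object(branch: str, version: str) -> str:
    # On master we check out the corresponding version tag; otherwise the branch itself.
    if branch == "master":
        if VERSION_PATTERN.match(version):
            return f"refs/tags/v{version}"
        raise ValueError(f"Version {version} does not match a valid git version tag.")
    return branch

assert deployment_version("feature/MY-Fix", "") == "feature_my_fix"
assert deployment_git_object("master", "1.2.3") == "refs/tags/v1.2.3"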
================================================
FILE: cicd/bandit.yaml
================================================
assert_used:
skips: ['*test*']
================================================
FILE: cicd/code_doc/content.css
================================================
/*
This CSS file contains all style definitions for documentation content.
All selectors are scoped with ".pdoc".
This makes sure that the pdoc styling doesn't leak to the rest of the page when pdoc is embedded.
*/
.pdoc {
color: var(--text);
/* enforce some styling even if bootstrap reboot is not included */
box-sizing: border-box;
line-height: 1.5;
/* override background from pygments */
/*unnecessary since pdoc 10, only left here to keep old custom templates working. */
background: none;
}
.pdoc .pdoc-button {
cursor: pointer;
display: inline-block;
border: solid black 1px;
border-radius: 2px;
font-size: .75rem;
padding: calc(0.5em - 1px) 1em;
transition: 100ms all;
}
/* Admonitions */
.pdoc .pdoc-alert {
padding: 1rem 1rem 1rem calc(1.5rem + 24px);
border: 1px solid transparent;
border-radius: .25rem;
background-repeat: no-repeat;
background-position: 1rem center;
margin-bottom: 1rem;
}
.pdoc .pdoc-alert > *:last-child {
margin-bottom: 0;
}
/* Admonitions are currently not stylable via theme.css */
.pdoc .pdoc-alert-note {
color: #000000;
background-color: #f1efef;
border-color: #f1f1f1;
background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/info-circle-fill.svg' %}{% endfilter %}");
}
.pdoc .pdoc-alert-warning {
color: #664d03;
background-color: #fff3cd;
border-color: #ffecb5;
background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/exclamation-triangle-fill.svg' %}{% endfilter %}");
}
.pdoc .pdoc-alert-danger {
color: #842029;
background-color: #f8d7da;
border-color: #f5c2c7;
background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/lightning-fill.svg' %}{% endfilter %}");
}
.pdoc .visually-hidden {
position: absolute !important;
width: 1px !important;
height: 1px !important;
padding: 0 !important;
margin: -1px !important;
overflow: hidden !important;
clip: rect(0, 0, 0, 0) !important;
white-space: nowrap !important;
border: 0 !important;
}
.pdoc h1, .pdoc h2, .pdoc h3 {
font-weight: 300;
margin: .3em 0;
padding: .2em 0;
}
.pdoc > section:not(.module-info) h1 {
font-size: 1.5rem;
font-weight: 500;
}
.pdoc > section:not(.module-info) h2 {
font-size: 1.4rem;
font-weight: 500;
}
.pdoc > section:not(.module-info) h3 {
font-size: 1.3rem;
font-weight: 500;
}
.pdoc > section:not(.module-info) h4 {
font-size: 1.2rem;
}
.pdoc > section:not(.module-info) h5 {
font-size: 1.1rem;
}
.pdoc a {
text-decoration: none;
color: var(--link);
}
.pdoc a:hover {
color: var(--link-hover);
}
.pdoc blockquote {
margin-left: 2rem;
}
.pdoc pre {
border-top: 1px solid var(--accent2);
border-bottom: 1px solid var(--accent2);
margin-top: 0;
margin-bottom: 1em;
padding: .5rem 0 .5rem .5rem;
overflow-x: auto;
/*unnecessary since pdoc 10, only left here to keep old custom templates working. */
background-color: var(--code);
}
.pdoc code {
color: var(--text);
padding: .2em .4em;
margin: 0;
font-size: 85%;
background-color: var(--accent);
border-radius: 6px;
}
.pdoc a > code {
color: inherit;
}
.pdoc pre > code {
display: inline-block;
font-size: inherit;
background: none;
border: none;
padding: 0;
}
.pdoc > section:not(.module-info) {
/* this margin should collapse with docstring margin,
but not for the module docstr which is followed by view_source. */
margin-bottom: 1.5rem;
}
/* Page Heading */
.pdoc .modulename {
margin-top: 0;
font-weight: bold;
}
.pdoc .modulename a {
color: var(--link);
transition: 100ms all;
}
/* GitHub Button */
.pdoc .git-button {
float: right;
border: solid var(--link) 1px;
}
.pdoc .git-button:hover {
background-color: var(--link);
color: var(--pdoc-background);
}
.view-source-toggle-state,
.view-source-toggle-state ~ .pdoc-code {
display: none;
}
.view-source-toggle-state:checked ~ .pdoc-code {
display: block;
}
.view-source-button {
display: inline-block;
float: right;
font-size: .75rem;
line-height: 1.5rem;
color: var(--muted);
padding: 0 .4rem 0 1.3rem;
cursor: pointer;
/* odd hack to reduce space between "bullet" and text */
text-indent: -2px;
}
.view-source-button > span {
visibility: hidden;
}
.module-info .view-source-button {
float: none;
display: flex;
justify-content: flex-end;
margin: -1.2rem .4rem -.2rem 0;
}
.view-source-button::before {
/* somewhat awkward recreation of a element. ideally we'd just use `display: inline list-item`, but
that does not work in Chrome (yet), see https://crbug.com/995106. */
position: absolute;
content: "View Source";
display: list-item;
list-style-type: disclosure-closed;
}
.view-source-toggle-state:checked ~ .attr .view-source-button::before,
.view-source-toggle-state:checked ~ .view-source-button::before {
list-style-type: disclosure-open;
}
/* Docstrings */
.pdoc .docstring {
margin-bottom: 1.5rem;
}
.pdoc section:not(.module-info) .docstring {
margin-left: clamp(0rem, 5vw - 2rem, 1rem);
}
.pdoc .docstring .pdoc-code {
margin-left: 1em;
margin-right: 1em;
}
/* Highlight focused element */
.pdoc h1:target,
.pdoc h2:target,
.pdoc h3:target,
.pdoc h4:target,
.pdoc h5:target,
.pdoc h6:target,
.pdoc .pdoc-code > pre > span:target {
background-color: var(--active);
box-shadow: -1rem 0 0 0 var(--active);
}
.pdoc .pdoc-code > pre > span:target {
/* make the highlighted line full width so that the background extends */
display: block;
}
.pdoc div:target > .attr,
.pdoc section:target > .attr,
.pdoc dd:target > a {
background-color: var(--active);
}
.pdoc * {
scroll-margin: 2rem;
}
.pdoc .pdoc-code .linenos {
user-select: none;
}
.pdoc .attr:hover {
filter: contrast(0.95);
}
/* Header link */
.pdoc section, .pdoc .classattr {
position: relative;
}
.pdoc .headerlink {
--width: clamp(1rem, 3vw, 2rem);
position: absolute;
top: 0;
left: calc(0rem - var(--width));
transition: all 100ms ease-in-out;
opacity: 0;
}
.pdoc .headerlink::before {
content: "#";
display: block;
text-align: center;
width: var(--width);
height: 2.3rem;
line-height: 2.3rem;
font-size: 1.5rem;
}
.pdoc .attr:hover ~ .headerlink,
.pdoc *:target > .headerlink,
.pdoc .headerlink:hover {
opacity: 1;
}
/* Attributes */
.pdoc .attr {
display: block;
margin: .5rem 0 .5rem;
padding: .4rem .4rem .4rem 1rem;
background-color: var(--accent);
overflow-x: auto;
}
.pdoc .classattr {
margin-left: 2rem;
}
.pdoc .name {
color: var(--name);
font-weight: bold;
}
.pdoc .def {
color: var(--def);
font-weight: bold;
}
.pdoc .signature {
/* override pygments background color */
background-color: transparent;
}
.pdoc .param, .pdoc .return-annotation {
white-space: pre;
}
.pdoc .signature.multiline .param {
display: block;
}
.pdoc .signature.condensed .param {
display:inline-block;
}
.pdoc .annotation {
color: var(--annotation);
}
/* Show/Hide buttons for long default values */
.pdoc .view-value-toggle-state,
.pdoc .view-value-toggle-state ~ .default_value {
display: none;
}
.pdoc .view-value-toggle-state:checked ~ .default_value {
display: inherit;
}
.pdoc .view-value-button {
font-size: .5rem;
vertical-align: middle;
border-style: dashed;
margin-top: -0.1rem;
}
.pdoc .view-value-button:hover {
background: white;
}
.pdoc .view-value-button::before {
content: "show";
text-align: center;
width: 2.2em;
display: inline-block;
}
.pdoc .view-value-toggle-state:checked ~ .view-value-button::before {
content: "hide";
}
/* Inherited Members */
.pdoc .inherited {
margin-left: 2rem;
}
.pdoc .inherited dt {
font-weight: 700;
}
.pdoc .inherited dt, .pdoc .inherited dd {
display: inline;
margin-left: 0;
margin-bottom: .5rem;
}
.pdoc .inherited dd:not(:last-child):after {
content: ", ";
}
.pdoc .inherited .class:before {
content: "class ";
}
.pdoc .inherited .function a:after {
content: "()";
}
/* Search results */
.pdoc .search-result .docstring {
overflow: auto;
max-height: 25vh;
}
.pdoc .search-result.focused > .attr {
background-color: var(--active);
}
/* "built with pdoc" attribution */
.pdoc .attribution {
margin-top: 2rem;
display: block;
opacity: 0.5;
transition: all 200ms;
filter: grayscale(100%);
}
.pdoc .attribution:hover {
opacity: 1;
filter: grayscale(0%);
}
.pdoc .attribution img {
margin-left: 5px;
height: 35px;
vertical-align: middle;
width: 70px;
transition: all 200ms;
}
.pdoc table {
display: block;
width: max-content;
max-width: 150%;
overflow: auto;
margin-bottom: 1rem;
}
.pdoc table th, .pdoc table td {
padding: 12px 13px;
border: 1px solid var(--accent2);
}
.pdoc table th {
font-weight: 600;
}
================================================
FILE: cicd/code_doc/custom_example_macros.py
================================================
"""Macro methods to be used on Lakehouse Engine Docs."""
import warnings
import json
import pygments.formatters.html
from markupsafe import Markup
STACK_LEVEL = 2
def _search_files(file: str, search_string: str) -> int:
"""Searches for a string and outputs the line.
Search for a given string in a file and output the line where it is first
found.
Args:
file: path of the file to be searched.
search_string: string that will be searched for.
Returns:
The number of the first line where a given search_string appears.
"""
range_lines = []
with open(file) as f:
for num, line in enumerate(f, 1):
if search_string in line:
range_lines.append(num - 1)
return range_lines[0]
def _link_example(method_name: str) -> str or None:
"""Searches for a link in a dict.
Searches for the link of a given method_name, in a specific config file and
outputs it.
Args:
method_name: name of the method to be searched for.
Returns:
None or the example link for the given method_name.
"""
if method_name in list(lakehouse_engine_examples.keys()):
file_link = lakehouse_engine_examples[str(method_name)]
return lakehouse_engine_examples["base_link"] + file_link if file_link != "" else None
else:
warnings.warn(
"No entry provided for the following transformer: "
+ method_name,
RuntimeWarning,
STACK_LEVEL,
)
return None
def _get_dict_transformer(dict_to_search: dict, transformer: str) -> str:
    """Searches for a transformer and returns the first dictionary occurrence.
    Search for a given transformer in a dictionary and return the first occurrence,
    serialized as an indented JSON string.
    Args:
        dict_to_search: dictionary (e.g. a loaded ACON) to be searched.
        transformer: string that will be searched for.
    Returns:
        A JSON string with the first dictionary where a given transformer is found.
    """
dict_transformer = []
for spec in dict_to_search["transform_specs"]:
for transformer_dict in spec["transformers"]:
if transformer_dict["function"] == transformer:
dict_transformer.append(transformer_dict)
return json.dumps(dict_transformer[0], indent=4)
def _highlight_examples(method_name: str) -> str or None:
"""Creates a code snippet.
Constructs and exposes the code snippet of a given method_name.
Args:
method_name: name of the module to be searched for.
Returns:
None or the code snippet wrapped in html tags.
"""
for key, item in lakehouse_engine_examples.items():
if method_name == key:
file_path = f"../../{item}"
if file_path == "../../":
warnings.warn(
"No unit testing for the following transformer: " + method_name,
RuntimeWarning,
STACK_LEVEL,
)
return None
first_line = _search_files(file_path, f'"function": "{method_name}"')
with open(file_path) as json_file:
acon_file = json.load(json_file)
code_snippet = _get_dict_transformer(acon_file, method_name)
# Defining the lexer which will parse through the snippet of code we want
# to highlight
lexer = pygments.lexers.JsonLexer()
# Defining the format that will be outputted by the pygments library
            # (in our case it will output the code within html tags)
formatter = pygments.formatters.html.HtmlFormatter(
linenos="inline",
anchorlinenos=True,
)
formatter.linenostart = first_line
return Markup(pygments.highlight(code_snippet, lexer, formatter))
def get_example(method_name: str) -> str:
"""Get example based on given argument.
Args:
method_name: name of the module to be searched for.
Returns:
        An example (link plus highlighted snippet) for the given method, or an empty string.
"""
example_link = _link_example(method_name=method_name)
json_example = _highlight_examples(method_name=method_name)
if example_link:
return (
"""\n"""
f"""View Example of {method_name} (See full example here)"""
f"""
{json_example}
\n"""
""""""
)
else:
return ""
with open("./examples.json") as json_file:
lakehouse_engine_examples = json.load(json_file)
def define_env(env):
"Declare environment for jinja2 templates for markdown"
for fn in [get_example]:
env.macro(fn)
# get mkdocstrings' Python handler
python_handler = env.conf["plugins"]["mkdocstrings"].get_handler("python")
# get the `update_env` method of the Python handler
update_env = python_handler.update_env
# override the `update_env` method of the Python handler
def patched_update_env(md, config):
update_env(md, config)
# get the `convert_markdown` filter of the env
convert_markdown = python_handler.env.filters["convert_markdown"]
# build a chimera made of macros+mkdocstrings
def render_convert(markdown: str, *args, **kwargs):
return convert_markdown(env.render(markdown), *args, **kwargs)
# patch the filter
python_handler.env.filters["convert_markdown"] = render_convert
# patch the method
python_handler.update_env = patched_update_env
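As a concrete illustration of the helpers above, the following self-contained sketch mimics what _search_files and _get_dict_transformer compute, using an in-memory ACON instead of a test resource file (the transformer entry and its arguments are invented for this example):

import json

# Illustrative ACON with a single transformer entry (made up for this example).
acon = {
    "transform_specs": [
        {"transformers": [{"function": "with_literals", "args": {"literals": {"dummy": 1}}}]}
    ]
}
acon_text = json.dumps(acon, indent=4)

# What _search_files computes: the 0-based line where the function name first appears.
first_line = next(
    num for num, line in enumerate(acon_text.splitlines()) if '"function": "with_literals"' in line
)

# What _get_dict_transformer computes: the first matching transformer dict, re-serialized.
snippet = json.dumps(
    next(
        t
        for spec in acon["transform_specs"]
        for t in spec["transformers"]
        if t["function"] == "with_literals"
    ),
    indent=4,
)
print(first_line)
print(snippet)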
================================================
FILE: cicd/code_doc/examples.json
================================================
{
"base_link":"https://github.com/adidas/lakehouse-engine/blob/master/",
"get_max_value": "tests/resources/feature/delta_load/merge_options/update_column_set/batch_delta.json",
"with_row_id": "tests/resources/feature/transformations/chain_transformations/acons/streaming_batch.json",
"with_auto_increment_id": "tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json",
"with_literals": "tests/resources/feature/transformations/column_creators/batch.json",
"cast": "tests/resources/feature/schema_evolution/delta_load/batch_delta_disabled.json",
"column_selector": "",
"flatten_schema": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json",
"explode_columns": "tests/resources/feature/transformations/column_reshapers/explode_arrays/batch.json",
"with_expressions": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json",
"rename": "tests/resources/feature/schema_evolution/append_load/batch_append_disabled.json",
"from_avro": "",
"from_avro_with_registry": "",
"from_json": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json",
"to_json": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json",
"condense_record_mode_cdc": "tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_init.json",
"group_and_rank": "tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json",
"hash_masker": "tests/resources/feature/transformations/data_maskers/hash_masking.json",
"column_dropper": "tests/resources/feature/transformations/data_maskers/drop_columns.json",
"add_current_date": "tests/resources/feature/transformations/date_transformers/streaming.json",
"convert_to_date": "tests/resources/feature/transformations/date_transformers/streaming.json",
"convert_to_timestamp": "tests/resources/feature/transformations/date_transformers/streaming.json",
"format_date": "tests/resources/feature/transformations/date_transformers/streaming.json",
"get_date_hierarchy": "tests/resources/feature/transformations/date_transformers/streaming.json",
"incremental_filter": "tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_delta.json",
"expression_filter": "tests/resources/feature/full_load/with_filter/batch.json",
"column_filter_exp": "tests/resources/feature/transformations/multiple_transform/batch.json",
"join": "tests/resources/feature/transformations/joiners/batch.json",
"replace_nulls": "tests/resources/feature/transformations/null_handlers/replace_nulls_col_subset.json",
"with_regex_value": "tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json",
"coalesce": "tests/resources/feature/writers/acons/write_batch_console.json",
"repartition": "tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming_delta.json",
"get_transformer": "",
"with_watermark": "tests/resources/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/streaming_drop_duplicates_overall_watermark.json"
}
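Each key in this file maps a transformer or function name to the test resource that exemplifies it, and an empty string means no example is available. A small hedged sketch of how such an entry resolves into a full URL, mirroring _link_example above (reading the file from the repository root is an assumption of this snippet; the macro itself opens ./examples.json relative to its working directory):

import json

# Resolve an example entry to a full GitHub URL, as the docs macros do.
with open("cicd/code_doc/examples.json") as f:
    examples = json.load(f)

def link_for(name: str):
    file_link = examples.get(name, "")
    # Empty entries (e.g. "column_selector") mean there is no example to link.
    return examples["base_link"] + file_link if file_link else None

print(link_for("with_literals"))
print(link_for("column_selector"))  # None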
================================================
FILE: cicd/code_doc/gen_ref_nav.py
================================================
"""Module to generate code reference docs."""
# Import necessary libraries
from pathlib import Path
import mkdocs_gen_files
# Create a new navigation structure
nav = mkdocs_gen_files.Nav()
# Define the root directory and the source directory
root = Path(__file__).parent
src = root / "mkdocs/lakehouse_engine"
print(f"Looking for files in {src}")
# Loop over all Python files in the source directory
for path in sorted(src.rglob("*.py")):
# Get the module path and the documentation path for each file
module_path = path.relative_to(src).with_suffix("")
doc_path = path.relative_to(src / "").with_suffix(".md")
full_doc_path = Path("reference", doc_path)
# Split the module path into parts
parts = tuple(module_path.parts)
# Skip files that start with an underscore or have no parts
if not parts:
continue
# If the file is an __init__.py file, remove the last part and rename the doc file to index.md
if parts[-1] == "__init__" and str(parts[:-1]) != "()":
parts = parts[:-1]
doc_path = doc_path.with_name("index.md")
full_doc_path = full_doc_path.with_name("index.md")
elif parts[-1].startswith("_"):
continue
# Skip the loop iteration if there is no doc path
if not doc_path:
continue
# If the doc path has at least one part, add it to the navigation
if len(doc_path.parts) >= 1:
nav_parts = [f"{part}" for part in parts]
nav[tuple(nav_parts)] = doc_path.as_posix()
# Open the full doc path and write the module identifier to it
with mkdocs_gen_files.open(full_doc_path, "w") as fd:
ident = ".".join(parts)
fd.write(f"::: {ident}")
# Set the edit path for the file
mkdocs_gen_files.set_edit_path(
full_doc_path, ".." / path.relative_to(root))
# Open the index.md file and write the built navigation to it
with mkdocs_gen_files.open("reference/index.md", "w") as nav_file:
nav_file.writelines(nav.build_literate_nav())
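In short, every module found under the source directory gets a generated Markdown stub whose body is a single mkdocstrings identifier, and package __init__ files become index.md pages. A small sketch of that mapping on two hypothetical paths (no files are actually written here):

from pathlib import Path

# Hypothetical module paths, just to show the mapping performed by the loop above.
src = Path("mkdocs/lakehouse_engine")
for path in [src / "core" / "definitions.py", src / "core" / "__init__.py"]:
    parts = path.relative_to(src).with_suffix("").parts
    doc_path = path.relative_to(src).with_suffix(".md")
    if parts[-1] == "__init__":
        parts = parts[:-1]
        doc_path = doc_path.with_name("index.md")
    print(Path("reference", doc_path), "->", "::: " + ".".join(parts))
# reference/core/definitions.md -> ::: core.definitions
# reference/core/index.md -> ::: core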
================================================
FILE: cicd/code_doc/index.html.jinja2
================================================
{% set root_module_name = "" %}
{% extends "default/index.html.jinja2" %}
{% block title %}Lakehouse Engine Documentation{% endblock %}
{% block nav %}
Available Modules
{% for submodule in all_modules if "." not in submodule and not submodule.startswith("_") %}
def format_operations_table(operations_dict: dict) -> str:
"""Format operations dictionary into a markdown table.
Args:
operations_dict: Dictionary containing operations and their parameters.
Returns:
A markdown formatted table with operation details.
"""
if not operations_dict:
return ""
markdown_output = "\n\n**Available Operations:**\n\n"
markdown_output += "| Operation | Parameters | Type | Mandatory |\n"
markdown_output += "|-----------|------------|------|----------|\n"
for operation, params in sorted(operations_dict.items()):
if not params:
markdown_output += f"| `{operation}` | - | - | - |\n"
else:
first_param = True
for param_name, param_info in params.items():
if first_param:
markdown_output += f"| `{operation}` | `{param_name}` | {param_info.get('type', 'N/A')} | {param_info.get('mandatory', False)} |\n"
first_param = False
else:
markdown_output += f"| | `{param_name}` | {param_info.get('type', 'N/A')} | {param_info.get('mandatory', False)} |\n"
return markdown_output
def get_table_manager_operations() -> str:
"""Get formatted table of TableManager operations.
Returns:
A markdown formatted table with TableManager operations.
"""
from lakehouse_engine.core.definitions import TABLE_MANAGER_OPERATIONS
return format_operations_table(TABLE_MANAGER_OPERATIONS)
def get_file_manager_operations() -> str:
"""Get formatted table of FileManager operations.
Returns:
A markdown formatted table with FileManager operations.
"""
from lakehouse_engine.core.definitions import FILE_MANAGER_OPERATIONS
return format_operations_table(FILE_MANAGER_OPERATIONS)
def define_env(env):
"Declare environment for jinja2 templates for markdown"
for fn in [get_example, get_table_manager_operations, get_file_manager_operations]:
env.macro(fn)
# get mkdocstrings' Python handler
python_handler = env.conf["plugins"]["mkdocstrings"].get_handler("python")
# get the `update_env` method of the Python handler
update_env = python_handler.update_env
# override the `update_env` method of the Python handler
def patched_update_env(config):
update_env(config)
# get the `convert_markdown` filter of the env
convert_markdown = python_handler.env.filters["convert_markdown"]
# build a chimera made of macros+mkdocstrings
def render_convert(markdown: str, *args, **kwargs):
return convert_markdown(env.render(markdown), *args, **kwargs)
# patch the filter
python_handler.env.filters["convert_markdown"] = render_convert
# patch the method
python_handler.update_env = patched_update_env
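For reference, here is a hedged usage sketch of format_operations_table; it assumes the function above is in scope, and the operation and parameter names are invented rather than taken from the real TABLE_MANAGER_OPERATIONS or FILE_MANAGER_OPERATIONS constants:

# Invented operations dictionary, shaped like the constants the two getters above import.
operations = {
    "delete_objects": {
        "bucket": {"type": "str", "mandatory": True},
        "dry_run": {"type": "bool", "mandatory": False},
    },
    "vacuum": {},
}
print(format_operations_table(operations))
# Produces (after the "**Available Operations:**" heading):
# | Operation | Parameters | Type | Mandatory |
# |-----------|------------|------|----------|
# | `delete_objects` | `bucket` | str | True |
# | | `dry_run` | bool | False |
# | `vacuum` | - | - | - |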
================================================
FILE: cicd/code_doc/module.html.jinja2
================================================
{#
In this Jinja template we extend a pre-existing template, copying the blocks
we want to change and adding both the "View Example" summary tag and the
"View Full Acon" button.
#}
{% extends "default/module.html.jinja2" %}
{% block title %}{{ module.modulename }}{% endblock %}
{% block nav_submodules %}
{% if module.submodules %}
Submodules
{% for submodule in module.submodules if is_public(submodule) | trim %}
{% endif %}
{% endblock %}
{% block module_contents %}
{% for m in module.flattened_own_members if is_public(m) | trim %}
{{ member(m) }}
{% if m.type == "class" %}
{% for m in m.own_members if m.type != "class" and is_public(m) | trim %}
{% endfor %}
{% set inherited_members = inherited(m) | trim %}
{% if inherited_members %}
Inherited Members
{{ inherited_members }}
{% endif %}
{% endif %}
{% endfor %}
{% endblock %}
{% block attribution %}
{% endblock %}
{% block module_info %}
{% block edit_button %}
{% if edit_url %}
{% if "github.com" in edit_url %}
{% set edit_text = "Edit on GitHub" %}
{% elif "gitlab" in edit_url %}
{% set edit_text = "Edit on GitLab" %}
{% else %}
{% set edit_text = "Edit Source" %}
{% endif %}
{{ edit_text }}
{% endif %}
{% endblock %}
{% if "lakehouse_engine" == module.modulename.split(".")[0] %}
{{ module_name() }}
{% endif %}
{{ docstring(module) }}
{% if "lakehouse_engine" == module.modulename.split(".")[0] %}
{{ view_source_state(module) }}
{{ view_source_button(module) }}
{{ view_source_code(module) }}
{% endif %}
{% endblock %}
{#
In this macro we create the "View Example" structure.
#}
{% defaultmacro view_example(doc) %}
View Example
{{ doc | highlight_examples }}
{% enddefaultmacro %}
{#
In this macro we create the "View Full Acon" structure.
#}
{% defaultmacro view_full_acon(doc) %}
{% set edit_text = "View Full Acon" %}
{{ edit_text }}
{% enddefaultmacro %}
================================================
FILE: cicd/code_doc/render_doc.py
================================================
"""Module for customizing pdoc documentation."""
import json
import os
import shutil
import warnings
from pathlib import Path
import pygments.formatters.html
from markupsafe import Markup
from pdoc import pdoc, render
STACK_LEVEL = 2
logo_path = (
"https://github.com/adidas/lakehouse-engine/blob/master/assets/img/"
"lakehouse_engine_logo_no_bg_160.png?raw=true"
)
def _get_project_version() -> str:
version = (
os.popen(
"cat cicd/.bumpversion.cfg | grep 'current_version =' | cut -f 3 -d ' '"
)
.read()
.replace("\n", "")
)
return version
def _search_files(file: str, search_string: str) -> int:
"""Searches for a string and outputs the line.
Search for a given string in a file and output the line where it is first
found.
:param file: path of the file to be searched.
:param search_string: string that will be searched for.
:returns: the number of the first line where a given search_string appears.
"""
range_lines = []
with open(file) as f:
for num, line in enumerate(f, 1):
if search_string in line:
range_lines.append(num - 1)
return range_lines[0]
def _get_dict_transformer(dict_to_search: dict, transformer: str) -> str:
    """Searches for a transformer and returns the first dictionary occurrence.
    Search for a given transformer in a dictionary and return the first occurrence,
    serialized as an indented JSON string.
    :param dict_to_search: dictionary (e.g. a loaded ACON) to be searched.
    :param transformer: string that will be searched for.
    :returns: a JSON string with the first dictionary where a given transformer is found.
    """
dict_transformer = []
for spec in dict_to_search["transform_specs"]:
for transformer_dict in spec["transformers"]:
if transformer_dict["function"] == transformer:
dict_transformer.append(transformer_dict)
return json.dumps(dict_transformer[0], indent=4)
def _link_example(module_name: str) -> str or None:
"""Searches for a link in a dict.
Searches for the link of a given module_name, in a specific config file and
outputs it.
:param module_name: name of the module to be searched for.
:returns: None or the example link for the given module_name.
"""
if module_name in list(link_dict.keys()):
file_link = link_dict[str(module_name)]
return link_dict["base_link"] + file_link if file_link != "" else None
else:
return None
def _highlight_examples(module_name: str) -> str or None:
"""Creates a code snippet.
Constructs and exposes the code snippet of a given module_name.
:param module_name: name of the module to be searched for.
:returns: None or the code snippet wrapped in html tags.
"""
transformers_to_ignore = [
"UNSUPPORTED_STREAMING_TRANSFORMERS",
"AVAILABLE_TRANSFORMERS",
"__init__",
]
if module_name.split(".")[1] == "transformers":
if module_name not in list(link_dict.keys()):
if module_name.split(".")[-1] not in list(transformers_to_ignore):
warnings.warn(
"No entry provided for the following transformer: "
+ module_name.split(".")[-1],
RuntimeWarning,
STACK_LEVEL,
)
return None
for key, item in link_dict.items():
if module_name == key:
file_path = f"./{item}"
transformer = key.split(".")[-1].lower()
if file_path == "./":
warnings.warn(
"No unit testing for the following transformer: " + transformer,
RuntimeWarning,
STACK_LEVEL,
)
return None
first_line = _search_files(file_path, f'"function": "{transformer}"')
with open(file_path) as json_file:
acon_file = json.load(json_file)
code_snippet = _get_dict_transformer(acon_file, transformer)
# Defining the lexer which will parse through the snippet of code we want
# to highlight
lexer = pygments.lexers.JsonLexer()
# Defining the format that will be outputted by the pygments library
            # (in our case it will output the code within html tags)
formatter = pygments.formatters.html.HtmlFormatter(
cssclass="pdoc-code codehilite",
linenos="inline",
anchorlinenos=True,
)
formatter.linenostart = first_line
return Markup(pygments.highlight(code_snippet, lexer, formatter))
with open("./cicd/code_doc/examples.json") as json_file:
link_dict = json.load(json_file)
# Adding our custom filters to jinja environment
env_jinja = render.env
env_jinja.filters["link_example"] = _link_example
env_jinja.filters["highlight_examples"] = _highlight_examples
root_path = Path(__file__).parents[2]
documentation_path = root_path / "artefacts" / "docs"
# Tell pdoc's render to use our jinja template
render.configure(
template_directory=root_path / "cicd" / "code_doc" / ".",
docformat="google",
logo=logo_path,
favicon=logo_path,
footer_text=f"Lakehouse Engine v{_get_project_version()}",
mermaid=True,
)
# Temporarily copy README file to be used in index.html page
shutil.copyfile("README.md", root_path / "cicd" / "code_doc" / "README.md")
# Render pdoc's documentation into artefacts/docs
pdoc(
"./lakehouse_engine/",
"./lakehouse_engine_usage/",
output_directory=documentation_path,
)
# Copy the images used on the documentation, to the path where we have the rendered
# html pages.
shutil.copytree("./assets", documentation_path / "assets", dirs_exist_ok=True)
# Remove the temporary copy README file
os.remove(root_path / "cicd" / "code_doc" / "README.md")
================================================
FILE: cicd/code_doc/render_docs.py
================================================
"""Module for customizing mkdocs documentation."""
# Import necessary libraries
import os
import shutil
from pathlib import Path
# Define the root directory and the necessary directories
root_path = Path(__file__).parents[2]
code_doc_path = root_path / "cicd" / "code_doc"
mkdocs_base_path = code_doc_path / "mkdocs"
mkdocs_build_path = mkdocs_base_path / "docs"
documentation_path = root_path / "artefacts" / "docs"
# Files and directories to be copied to build the mkdocs documentation
documentation_to_copy = {
"directories_to_copy": [
{
"source": root_path / "lakehouse_engine_usage",
"target": mkdocs_build_path / "lakehouse_engine_usage",
},
{
"source": root_path / "lakehouse_engine",
"target": mkdocs_base_path / "lakehouse_engine" / "packages",
},
{
"source": "./assets",
"target": mkdocs_build_path / "assets",
},
],
"files_to_copy": [
{
"source": "README.md",
"target": mkdocs_build_path / "index.md",
},
{
"source": "pyproject.toml",
"target": mkdocs_build_path / "pyproject.toml",
},
],
}
def _copy_documentation(directories: list = None, files: list = None):
"""Copy files to other directory based on given parameters.
Args:
directories (list): list of directories to copy.
files (list): list of files to copy.
"""
if directories:
for directory in directories:
shutil.copytree(
directory.get("source"), directory.get("target"), dirs_exist_ok=True
)
if files:
for file in files:
shutil.copyfile(file.get("source"), file.get("target"))
_copy_documentation(
directories=documentation_to_copy.get("directories_to_copy"),
files=documentation_to_copy.get("files_to_copy"),
)
# Use mkdocs build command to build the documentation into the "site" folder
os.system(f"cd {code_doc_path} && mkdocs build --site-dir {documentation_path}/site")
# Remove the temporary docs directory mkdocs_base_path
shutil.rmtree(mkdocs_base_path)
================================================
FILE: cicd/flake8.conf
================================================
[flake8]
max-line-length = 88
extend-ignore = E203
inline-quotes=double
docstring-quotes="""
max-expression-complexity=11
max-cognitive-complexity=15
# there is a python module with same name as io engine module, so
# we need to ignore this error
per-file-ignores =
lakehouse_engine/io/__init__.py:A005
================================================
FILE: cicd/meta.yaml
================================================
dev_deploy_bucket: s3://sample-dev-bucket
prod_deploy_bucket: s3://sample-prod-bucket
arm_python_image: arm64v8/python:3.12-slim-bullseye
amd_python_image: python:3.12-slim-bullseye
engine_docs: https://adidas.github.io/lakehouse-engine-docs/lakehouse_engine.html
code_url: https://github.com/adidas/lakehouse-engine
================================================
FILE: cicd/requirements.txt
================================================
# The main dependencies without which the core functionalities of the project will not work.
# These dependencies are not optional and are always installed when people install the lakehouse-engine library.
#
# ! Do not forget running `make build-lock-files` after updating dependency list !
#
boto3==1.40.23
Jinja2==3.1.6
pyyaml==6.0.2
pendulum==3.1.0
importlib-resources==6.5.2
================================================
FILE: cicd/requirements_azure.txt
================================================
# Dependencies necessary for azure related features to work (ex: mail notifications using o365).
#
# ! Do not forget running `make build-lock-files` after updating dependency list !
#
msgraph-sdk==1.40.0
aiohttp==3.13.3 # msgraph-sdk uses a version with known vulnerabilities
h2==4.3.0 # msgraph-sdk uses a version with known vulnerabilities
azure-core==1.38.0
nest-asyncio==1.6.0
msal==1.32.3
urllib3==2.6.3 # msal uses a version with known vulnerabilities
# Fixing the version to solve known vulnerabilities
requests==2.32.4 # when updating also update in all files
================================================
FILE: cicd/requirements_cicd.txt
================================================
# Dependencies necessary for the Lakehouse Engine CICD (tests, linting, deployment,...).
#
# ! Do not forget running `make build-lock-files` after updating dependency list !
#
# cicd
pytest==8.4.1
pytest-cov==6.2.1
isort==6.0.1
flake8==7.3.0
flake8-black==0.3.6
black==24.4.0 # fixed because flake8-black points always to the latest black
flake8-builtins==3.0.0
flake8-bugbear==24.12.12
flake8-isort==6.1.2
flake8-comprehensions==3.16.0
flake8-docstrings==1.7.0
flake8-eradicate==1.5.0
flake8-quotes==3.4.0
flake8-mutable==1.2.0
flake8-cognitive-complexity==0.1.0
flake8-expression-complexity==0.0.11
mypy==1.17.1
bandit==1.8.6
bump2version==1.0.1
lxml==6.0.0
pytest-sftpserver==1.3.0
pip-tools==7.5.0
pip-audit==2.10.0
cachecontrol==0.14.4
filelock==3.20.3
build==1.3.0
aiosmtpd==1.4.6
# docs
distlib==0.3.6
ghp-import==2.1.0
griffe==1.15.0
Markdown==3.10
markdown-callouts==0.4.0
markdown-exec==1.12.1
markdown-include==0.8.1
mergedeep==1.3.4
mike==2.0.0
mkdocs==1.6.1
mkdocs-autorefs==1.4.3
mkdocs-material==9.7.1
mkdocs-material-extensions==1.3.1
mkdocstrings-crystal==0.3.9
mkdocs-macros-plugin==1.5.0
mkdocstrings-python==2.0.1
mkdocstrings[python]==1.0.0
mkdocs-gen-files==0.6.0
mkdocs-section-index==0.3.10
mkdocs-literate-nav==0.6.2
pymdown-extensions==10.20
pyyaml_env_tag==0.1
regex==2023.6.3
watchdog==3.0.0
# Fixing the version to solve known vulnerabilities
requests==2.32.4 # when updating also update in all files
# types
types-boto3==1.40.23
types-paramiko==2.12.0
types-requests<2.31.0.7
# test
moto==4.2.14
Werkzeug==3.1.6
# deploy to pypi
twine==5.1.1
================================================
FILE: cicd/requirements_dq.txt
================================================
# Dependencies necessary for the Data Quality features to work.
#
# ! Do not forget running `make build-lock-files` after updating dependency list !
#
great-expectations==1.11.0
marshmallow==3.26.2
# Note: Numpy is not a direct dependency.
# It is included temporarily to prevent version conflicts.
#numpy==1.26.4 #dbr17 uses 2.1.3
# Fixing the version to solve known vulnerabilities
requests==2.32.4 # when updating also update in all files dbr17 uses 2.32.3
================================================
FILE: cicd/requirements_os.txt
================================================
# Special requirements on which the project depends, but for which some use cases might use environments with
# these dependencies pre-installed by the vendors. Thus, they are delivered as optional OS dependencies.
#
# ! Do not forget running `make build-lock-files` after updating dependency list !
#
pyspark==4.0.0
delta-spark==4.0.0
================================================
FILE: cicd/requirements_sftp.txt
================================================
#
# ! Do not forget running `make build-lock-files` after updating dependency list !
#
paramiko==4.0.0
pynacl==1.6.2
================================================
FILE: cicd/requirements_sharepoint.txt
================================================
#
# ! Do not forget running `make build-lock-files` after updating dependency list !
#
tenacity==9.0.0
msal==1.32.3
azure-core==1.38.0
================================================
FILE: lakehouse_engine/__init__.py
================================================
"""Lakehouse engine package containing all the system subpackages."""
================================================
FILE: lakehouse_engine/algorithms/__init__.py
================================================
"""Package containing all the lakehouse engine algorithms."""
================================================
FILE: lakehouse_engine/algorithms/algorithm.py
================================================
"""Module containing the Algorithm class."""
from typing import List, Tuple
from lakehouse_engine.core.definitions import (
DQDefaults,
DQFunctionSpec,
DQSpec,
OutputFormat,
)
from lakehouse_engine.core.executable import Executable
class Algorithm(Executable):
"""Class to define the behavior of every algorithm based on ACONs."""
def __init__(self, acon: dict):
"""Construct Algorithm instances.
Args:
acon: algorithm configuration.
"""
self.acon = acon
@classmethod
def get_dq_spec(
cls, spec: dict
) -> Tuple[DQSpec, List[DQFunctionSpec], List[DQFunctionSpec]]:
"""Get data quality specification object from acon.
Args:
spec: data quality specifications.
Returns:
            The DQSpec, the list of DQ function specs and the list of critical DQ function specs.
"""
dq_spec = DQSpec(
spec_id=spec["spec_id"],
input_id=spec["input_id"],
dq_type=spec["dq_type"],
dq_functions=[],
dq_db_table=spec.get("dq_db_table"),
dq_table_table_filter=spec.get("dq_table_table_filter"),
dq_table_extra_filters=spec.get(
"dq_table_extra_filters", DQSpec.dq_table_extra_filters
),
execution_point=spec.get("execution_point"),
unexpected_rows_pk=spec.get(
"unexpected_rows_pk", DQSpec.unexpected_rows_pk
),
gx_result_format=spec.get("gx_result_format", DQSpec.gx_result_format),
tbl_to_derive_pk=spec.get("tbl_to_derive_pk", DQSpec.tbl_to_derive_pk),
tag_source_data=spec.get("tag_source_data", DQSpec.tag_source_data),
data_asset_name=spec.get("data_asset_name", DQSpec.data_asset_name),
expectation_suite_name=spec.get(
"expectation_suite_name", DQSpec.expectation_suite_name
),
store_backend=spec.get("store_backend", DQDefaults.STORE_BACKEND.value),
local_fs_root_dir=spec.get("local_fs_root_dir", DQSpec.local_fs_root_dir),
bucket=spec.get("bucket", DQSpec.bucket),
checkpoint_store_prefix=spec.get(
"checkpoint_store_prefix", DQDefaults.CHECKPOINT_STORE_PREFIX.value
),
expectations_store_prefix=spec.get(
"expectations_store_prefix",
DQDefaults.EXPECTATIONS_STORE_PREFIX.value,
),
validations_store_prefix=spec.get(
"validations_store_prefix",
DQDefaults.VALIDATIONS_STORE_PREFIX.value,
),
result_sink_db_table=spec.get(
"result_sink_db_table", DQSpec.result_sink_db_table
),
result_sink_location=spec.get(
"result_sink_location", DQSpec.result_sink_location
),
processed_keys_location=spec.get(
"processed_keys_location", DQSpec.processed_keys_location
),
result_sink_partitions=spec.get(
"result_sink_partitions", DQSpec.result_sink_partitions
),
result_sink_chunk_size=spec.get(
"result_sink_chunk_size", DQSpec.result_sink_chunk_size
),
result_sink_format=spec.get(
"result_sink_format", OutputFormat.DELTAFILES.value
),
result_sink_options=spec.get(
"result_sink_options", DQSpec.result_sink_options
),
result_sink_explode=spec.get(
"result_sink_explode", DQSpec.result_sink_explode
),
result_sink_extra_columns=spec.get("result_sink_extra_columns", []),
source=spec.get("source", spec["input_id"]),
fail_on_error=spec.get("fail_on_error", DQSpec.fail_on_error),
cache_df=spec.get("cache_df", DQSpec.cache_df),
critical_functions=spec.get(
"critical_functions", DQSpec.critical_functions
),
max_percentage_failure=spec.get(
"max_percentage_failure", DQSpec.max_percentage_failure
),
enable_row_condition=spec.get(
"enable_row_condition", DQSpec.enable_row_condition
),
)
dq_functions = cls._get_dq_functions(spec, "dq_functions")
critical_functions = cls._get_dq_functions(spec, "critical_functions")
cls._validate_dq_tag_strategy(dq_spec)
return dq_spec, dq_functions, critical_functions
@staticmethod
def _get_dq_functions(spec: dict, function_key: str) -> List[DQFunctionSpec]:
"""Get DQ Functions from a DQ Spec, based on a function_key.
Args:
spec: data quality specifications.
function_key: dq function key ("dq_functions" or
"critical_functions").
Returns:
a list of DQ Function Specs.
"""
functions = []
if spec.get(function_key, []):
for f in spec.get(function_key, []):
dq_fn_spec = DQFunctionSpec(
function=f["function"],
args=f.get("args", {}),
)
functions.append(dq_fn_spec)
return functions
@staticmethod
def _validate_dq_tag_strategy(spec: DQSpec) -> None:
"""Validate DQ Spec arguments related with the data tagging strategy.
Args:
spec: data quality specifications.
"""
if spec.tag_source_data:
spec.gx_result_format = DQSpec.gx_result_format
spec.fail_on_error = False
spec.result_sink_explode = DQSpec.result_sink_explode
elif spec.gx_result_format != DQSpec.gx_result_format:
spec.tag_source_data = False
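To make the expected input of get_dq_spec concrete, below is a minimal, hedged example of a dq_specs entry as it could appear inside an ACON; the spec ids, the dq_type value and the expectation arguments are illustrative only and may need adjusting for a real use case:

from lakehouse_engine.algorithms.algorithm import Algorithm

# Illustrative dq_specs entry (ids, columns and expectation arguments are made up).
dq_spec_dict = {
    "spec_id": "sales_orders_dq",
    "input_id": "sales_orders_transformed",
    "dq_type": "validator",
    "dq_functions": [
        {"function": "expect_column_values_to_not_be_null", "args": {"column": "order_id"}},
    ],
    "critical_functions": [
        {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 1}},
    ],
}

# get_dq_spec returns the DQSpec plus the regular and critical DQFunctionSpec lists
# separately; the caller decides where to attach them (see DataLoader._get_dq_specs).
dq_spec, dq_functions, critical_functions = Algorithm.get_dq_spec(dq_spec_dict)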
================================================
FILE: lakehouse_engine/algorithms/data_loader.py
================================================
"""Module to define DataLoader class."""
from collections import OrderedDict
from copy import deepcopy
from logging import Logger
from typing import List, Optional
from lakehouse_engine.algorithms.algorithm import Algorithm
from lakehouse_engine.core.definitions import (
DQFunctionSpec,
DQSpec,
DQType,
InputSpec,
MergeOptions,
OutputFormat,
OutputSpec,
ReadType,
SharepointOptions,
TerminatorSpec,
TransformerSpec,
TransformSpec,
)
from lakehouse_engine.dq_processors.exceptions import DQDuplicateRuleIdException
from lakehouse_engine.io.reader_factory import ReaderFactory
from lakehouse_engine.io.writer_factory import WriterFactory
from lakehouse_engine.terminators.notifier_factory import NotifierFactory
from lakehouse_engine.terminators.terminator_factory import TerminatorFactory
from lakehouse_engine.transformers.transformer_factory import TransformerFactory
from lakehouse_engine.utils.dq_utils import PrismaUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
class DataLoader(Algorithm):
"""Load data using an algorithm configuration (ACON represented as dict).
This algorithm focuses on the cases where users will be specifying all the algorithm
steps and configurations through a dict based configuration, which we name ACON
in our framework.
Since an ACON is a dict you can pass a custom transformer through a python function
and, therefore, the DataLoader can also be used to load data with custom
transformations not provided in our transformers package.
As the algorithm base class of the lakehouse-engine framework is based on the
concept of ACON, this DataLoader algorithm simply inherits from Algorithm,
without overriding anything. We designed the codebase like this to avoid
instantiating the Algorithm class directly, which was always meant to be an
abstraction for any specific algorithm included in the lakehouse-engine framework.
"""
def __init__(self, acon: dict):
"""Construct DataLoader algorithm instances.
A data loader needs several specifications to work properly,
but some of them might be optional. The available specifications are:
- input specifications (mandatory): specify how to read data.
- transform specifications (optional): specify how to transform data.
- data quality specifications (optional): specify how to execute the data
quality process.
- output specifications (mandatory): specify how to write data to the
target.
- terminate specifications (optional): specify what to do after writing into
the target (e.g., optimizing target table, vacuum, compute stats, etc).
Args:
acon: algorithm configuration.
"""
self._logger: Logger = LoggingHandler(self.__class__.__name__).get_logger()
super().__init__(acon)
self.input_specs: List[InputSpec] = self._get_input_specs()
        # the streaming transformers plan is needed to later change the execution
        # specification, to accommodate streaming mode limitations in invoking certain
        # functions (e.g., sort, window, generate row ids/auto increments, ...).
self._streaming_micro_batch_transformers_plan: dict = {}
self.transform_specs: List[TransformSpec] = self._get_transform_specs()
# our data quality process is not compatible with streaming mode, hence we
# have to run it in micro batches, similar to what happens to certain
# transformation functions not supported in streaming mode.
self._streaming_micro_batch_dq_plan: dict = {}
self.dq_specs: List[DQSpec] = self._get_dq_specs()
self.output_specs: List[OutputSpec] = self._get_output_specs()
self.terminate_specs: List[TerminatorSpec] = self._get_terminate_specs()
def read(self) -> OrderedDict:
"""Read data from an input location into a distributed dataframe.
Returns:
An ordered dict with all the dataframes that were read.
"""
read_dfs: OrderedDict = OrderedDict({})
for spec in self.input_specs:
self._logger.info(f"Found input specification: {spec}")
read_dfs[spec.spec_id] = ReaderFactory.get_data(spec)
return read_dfs
def transform(self, data: OrderedDict) -> OrderedDict:
"""Transform (optionally) the data that was read.
        If there isn't a transformation specification, this step will be skipped and
        the original dataframes that were read will be returned.
        A transformation can depend on the result of another transformation. However,
        keep in mind that if we are using a streaming source and need to enable micro
        batch processing for some reason, that result cannot be used as input to
        another transformation. Micro batch processing in PySpark streaming is only
        available in .write(), which means a transformation running in micro batch
        mode needs to be the end of the process.
Args:
data: input dataframes in an ordered dict.
Returns:
Another ordered dict with the transformed dataframes, according to the
transformation specification.
"""
if not self.transform_specs:
return data
else:
transformed_dfs = OrderedDict(data)
for spec in self.transform_specs:
self._logger.info(f"Found transform specification: {spec}")
transformed_df = transformed_dfs[spec.input_id]
for transformer in spec.transformers:
transformed_df = transformed_df.transform(
TransformerFactory.get_transformer(transformer, transformed_dfs)
)
transformed_dfs[spec.spec_id] = transformed_df
return transformed_dfs
def process_dq(
self, data: OrderedDict
) -> tuple[OrderedDict, Optional[dict[str, str]]]:
"""Process the data quality tasks for the data that was read and/or transformed.
        It supports multiple input dataframes, although just one is advisable.
It is possible to use data quality validators/expectations that will validate
your data and fail the process in case the expectations are not met. The DQ
process also generates and keeps updating a site containing the results of the
expectations that were done on your data. The location of the site is
configurable and can either be on file system or S3. If you define it to be
stored on S3, you can even configure your S3 bucket to serve the site so that
people can easily check the quality of your data. Moreover, it is also
possible to store the result of the DQ process into a defined result sink.
Args:
            data: dataframes from previous steps of the algorithm that we wish to
                run the DQ process on.
Returns:
Another ordered dict with the validated dataframes and
a dictionary with the errors if they exist, or None.
"""
if not self.dq_specs:
return data, None
dq_processed_dfs, error = self._verify_dq_rule_id_uniqueness(
data, self.dq_specs
)
if error:
return dq_processed_dfs, error
else:
from lakehouse_engine.dq_processors.dq_factory import DQFactory
dq_processed_dfs = OrderedDict(data)
for spec in self.dq_specs:
df_processed_df = dq_processed_dfs[spec.input_id]
self._logger.info(f"Found data quality specification: {spec}")
if (
spec.dq_type == DQType.PRISMA.value or spec.dq_functions
) and spec.spec_id not in self._streaming_micro_batch_dq_plan:
if spec.cache_df:
df_processed_df.cache()
dq_processed_dfs[spec.spec_id] = DQFactory.run_dq_process(
spec, df_processed_df
)
else:
dq_processed_dfs[spec.spec_id] = df_processed_df
return dq_processed_dfs, None
def write(self, data: OrderedDict) -> OrderedDict:
"""Write the data that was read and transformed (if applicable).
        It supports writing multiple datasets. However, we recommend writing only one
        dataframe. This recommendation is based on easy debugging and reproducibility:
        if we start mixing several datasets fueled by the same algorithm, it would
        unleash a sea of reproducibility issues plus tight coupling and dependencies
        between datasets. Having said that, there may be cases where writing multiple
        datasets is desirable according to the use case requirements. Use it
        accordingly.
Args:
data: dataframes that were read and transformed (if applicable).
Returns:
Dataframes that were written.
"""
written_dfs: OrderedDict = OrderedDict({})
for spec in self.output_specs:
self._logger.info(f"Found output specification: {spec}")
written_output = WriterFactory.get_writer(
spec, data[spec.input_id], data
).write()
if written_output:
written_dfs.update(written_output)
else:
written_dfs[spec.spec_id] = data[spec.input_id]
return written_dfs
def terminate(self, data: OrderedDict) -> None:
"""Terminate the algorithm.
Args:
data: dataframes that were written.
"""
if self.terminate_specs:
for spec in self.terminate_specs:
self._logger.info(f"Found terminate specification: {spec}")
TerminatorFactory.execute_terminator(
spec, data[spec.input_id] if spec.input_id else None
)
def execute(self) -> Optional[OrderedDict]:
"""Define the algorithm execution behaviour."""
try:
self._logger.info("Starting read stage...")
read_dfs = self.read()
self._logger.info("Starting transform stage...")
transformed_dfs = self.transform(read_dfs)
self._logger.info("Starting data quality stage...")
validated_dfs, errors = self.process_dq(transformed_dfs)
self._logger.info("Starting write stage...")
written_dfs = self.write(validated_dfs)
self._logger.info("Starting terminate stage...")
self.terminate(written_dfs)
self._logger.info("Execution of the algorithm has finished!")
except Exception as e:
NotifierFactory.generate_failure_notification(self.terminate_specs, e)
raise e
if errors:
raise DQDuplicateRuleIdException(
"Data Written Successfully, but DQ Process Encountered an Issue.\n"
"We detected a duplicate dq_rule_id in the dq_spec definition. "
"As a result, none of the Data Quality (DQ) processes (dq_spec) "
"were executed.\n"
"Please review and verify the following dq_rules:\n"
f"{errors}"
)
return written_dfs
def _get_input_specs(self) -> List[InputSpec]:
"""Get the input specifications from an acon.
Returns:
List of input specifications.
"""
return [InputSpec(**spec) for spec in self.acon["input_specs"]]
def _get_transform_specs(self) -> List[TransformSpec]:
"""Get the transformation specifications from an acon.
        If we are executing the algorithm in streaming mode and the
        transformer function is not supported in streaming mode, it is
        important to note that ONLY those unsupported operations will
        go into the streaming_micro_batch_transformers (see the if statement
        in the function code), in the same order that they appear in the list
        of transformations. This means that other supported transformations
        that appear after an unsupported one continue to stay on the normal
        execution plan, i.e., outside the foreachBatch function. Therefore,
        this may make your algorithm execute a different logic than the one
        you originally intended. For this reason:
        1) ALWAYS PLACE UNSUPPORTED STREAMING TRANSFORMATIONS LAST;
        2) USE the force_streaming_foreach_batch_processing option in the
        transform_spec section;
        3) USE THE CUSTOM_TRANSFORMATION AND WRITE ALL YOUR TRANSFORMATION LOGIC
        THERE.
Check list of unsupported spark streaming operations here:
https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#unsupported-operations
Returns:
List of transformation specifications.
"""
input_read_types = self._get_input_read_types(self.acon["input_specs"])
transform_input_ids = self._get_transform_input_ids(
self.acon.get("transform_specs", [])
)
prev_spec_read_types = self._get_previous_spec_read_types(
input_read_types, transform_input_ids
)
transform_specs = []
for spec in self.acon.get("transform_specs", []):
transform_spec = TransformSpec(
spec_id=spec["spec_id"],
input_id=spec["input_id"],
transformers=[],
force_streaming_foreach_batch_processing=spec.get(
"force_streaming_foreach_batch_processing", False
),
)
for s in spec["transformers"]:
transformer_spec = TransformerSpec(
function=s["function"], args=s.get("args", {})
)
if (
prev_spec_read_types[transform_spec.input_id]
== ReadType.STREAMING.value
and s["function"]
in TransformerFactory.UNSUPPORTED_STREAMING_TRANSFORMERS
) or (
prev_spec_read_types[transform_spec.input_id]
== ReadType.STREAMING.value
and transform_spec.force_streaming_foreach_batch_processing
):
self._move_to_streaming_micro_batch_transformers(
transform_spec, transformer_spec
)
else:
transform_spec.transformers.append(transformer_spec)
transform_specs.append(transform_spec)
return transform_specs
def _get_dq_specs(self) -> List[DQSpec]:
"""Get list of data quality specification objects from acon.
In streaming mode, we automatically convert the data quality specification in
the streaming_micro_batch_dq_processors list for the respective output spec.
This is needed because our dq process cannot be executed using native streaming
functions.
Returns:
List of data quality spec objects.
"""
input_read_types = self._get_input_read_types(self.acon["input_specs"])
transform_input_ids = self._get_transform_input_ids(
self.acon.get("transform_specs", [])
)
prev_spec_read_types = self._get_previous_spec_read_types(
input_read_types, transform_input_ids
)
dq_specs = []
for spec in self.acon.get("dq_specs", []):
dq_spec, dq_functions, critical_functions = Algorithm.get_dq_spec(spec)
if prev_spec_read_types[dq_spec.input_id] == ReadType.STREAMING.value:
                # we need to use deepcopy to explicitly create a copy of the dict;
                # otherwise python only creates a binding for dicts, and we would be
                # modifying the original dict, which we don't want to.
self._move_to_streaming_micro_batch_dq_processors(
deepcopy(dq_spec), dq_functions, critical_functions
)
else:
dq_spec.dq_functions = dq_functions
dq_spec.critical_functions = critical_functions
self._logger.info(
f"Streaming Micro Batch DQ Plan: "
f"{str(self._streaming_micro_batch_dq_plan)}"
)
dq_specs.append(dq_spec)
return dq_specs
def _get_output_specs(self) -> List[OutputSpec]:
"""Get the output specifications from an acon.
Returns:
List of output specifications.
"""
return [
OutputSpec(
spec_id=spec["spec_id"],
input_id=spec["input_id"],
write_type=spec.get("write_type", None),
data_format=spec.get("data_format", OutputFormat.DELTAFILES.value),
db_table=spec.get("db_table", None),
location=spec.get("location", None),
merge_opts=(
MergeOptions(**spec["merge_opts"])
if spec.get("merge_opts")
else None
),
sharepoint_opts=(
SharepointOptions(**spec["sharepoint_opts"])
if spec.get("sharepoint_opts")
else None
),
partitions=spec.get("partitions", []),
streaming_micro_batch_transformers=self._get_streaming_transformer_plan(
spec["input_id"], self.dq_specs
),
streaming_once=spec.get("streaming_once", None),
streaming_processing_time=spec.get("streaming_processing_time", None),
streaming_available_now=spec.get(
"streaming_available_now",
(
False
if (
spec.get("streaming_once", None)
or spec.get("streaming_processing_time", None)
or spec.get("streaming_continuous", None)
)
else True
),
),
streaming_continuous=spec.get("streaming_continuous", None),
streaming_await_termination=spec.get(
"streaming_await_termination", True
),
streaming_await_termination_timeout=spec.get(
"streaming_await_termination_timeout", None
),
with_batch_id=spec.get("with_batch_id", False),
options=spec.get("options", None),
streaming_micro_batch_dq_processors=(
self._streaming_micro_batch_dq_plan.get(spec["input_id"], [])
),
)
for spec in self.acon["output_specs"]
]
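    # Note (illustrative example, not from the original file): streaming_available_now
    # defaults to True only when none of streaming_once, streaming_processing_time or
    # streaming_continuous is provided in the output spec; e.g. an output spec such as
    # {"spec_id": "sink", "input_id": "src"} gets streaming_available_now=True, while
    # adding a hypothetical "streaming_processing_time": "30 seconds" makes the
    # default fall back to False.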
def _get_streaming_transformer_plan(
self, input_id: str, dq_specs: Optional[List[DQSpec]]
) -> List[TransformerSpec]:
"""Gets the plan for transformations to be applied on streaming micro batches.
When running both DQ processes and transformations in streaming micro batches,
the _streaming_micro_batch_transformers_plan to consider is the one associated
        with the transformer spec_id and not with the dq spec_id. Thus, in those cases,
this method maps the input id of the output_spec (which is the spec_id of a
dq_spec) with the dependent transformer spec_id.
Args:
input_id: id of the corresponding input specification.
dq_specs: data quality specifications.
Returns:
a list of TransformerSpec, representing the transformations plan.
"""
transformer_id = (
[dq_spec.input_id for dq_spec in dq_specs if dq_spec.spec_id == input_id][0]
if self._streaming_micro_batch_dq_plan.get(input_id)
and self._streaming_micro_batch_transformers_plan
else input_id
)
streaming_micro_batch_transformers_plan: list[TransformerSpec] = (
self._streaming_micro_batch_transformers_plan.get(transformer_id, [])
)
return streaming_micro_batch_transformers_plan
def _get_terminate_specs(self) -> List[TerminatorSpec]:
"""Get the terminate specifications from an acon.
Returns:
List of terminate specifications.
"""
return [TerminatorSpec(**spec) for spec in self.acon.get("terminate_specs", [])]
def _move_to_streaming_micro_batch_transformers(
self, transform_spec: TransformSpec, transformer_spec: TransformerSpec
) -> None:
"""Move the transformer to the list of streaming micro batch transformations.
If the transform specs contain functions that cannot be executed in streaming
mode, this function sends those functions to the output specs
streaming_micro_batch_transformers, where they will be executed inside the
stream foreachBatch function.
To accomplish that we use an instance variable that associates the
streaming_micro_batch_transformers to each output spec, in order to do reverse
lookup when creating the OutputSpec.
Args:
transform_spec: transform specification (overall
transformation specification - a transformation may contain multiple
transformers).
transformer_spec: the specific transformer function and arguments.
"""
if transform_spec.spec_id not in self._streaming_micro_batch_transformers_plan:
self._streaming_micro_batch_transformers_plan[transform_spec.spec_id] = []
self._streaming_micro_batch_transformers_plan[transform_spec.spec_id].append(
transformer_spec
)
def _move_to_streaming_micro_batch_dq_processors(
self,
dq_spec: DQSpec,
dq_functions: List[DQFunctionSpec],
critical_functions: List[DQFunctionSpec],
) -> None:
"""Move the dq function to the list of streaming micro batch transformations.
If the dq specs contain functions that cannot be executed in streaming mode,
this function sends those functions to the output specs
streaming_micro_batch_dq_processors, where they will be executed inside the
stream foreachBatch function.
To accomplish that we use an instance variable that associates the
streaming_micro_batch_dq_processors to each output spec, in order to do reverse
lookup when creating the OutputSpec.
Args:
dq_spec: dq specification (overall dq process specification).
dq_functions: the list of dq functions to be considered.
critical_functions: list of critical functions to be considered.
"""
if dq_spec.spec_id not in self._streaming_micro_batch_dq_plan:
self._streaming_micro_batch_dq_plan[dq_spec.spec_id] = []
dq_spec.dq_functions = dq_functions
dq_spec.critical_functions = critical_functions
self._streaming_micro_batch_dq_plan[dq_spec.spec_id].append(dq_spec)
@staticmethod
def _get_input_read_types(list_of_specs: List) -> dict:
"""Get a dict of spec ids and read types from a list of input specs.
Args:
list_of_specs: list of input specs ([{k:v}]).
Returns:
Dict of {input_spec_id: read_type}.
"""
return {item["spec_id"]: item["read_type"] for item in list_of_specs}
@staticmethod
def _get_transform_input_ids(list_of_specs: List) -> dict:
"""Get a dict of transform spec ids and input ids from list of transform specs.
Args:
list_of_specs: list of transform specs ([{k:v}]).
Returns:
Dict of {transform_spec_id: input_id}.
"""
return {item["spec_id"]: item["input_id"] for item in list_of_specs}
@staticmethod
def _get_previous_spec_read_types(
input_read_types: dict, transform_input_ids: dict
) -> dict:
"""Get the read types of the previous specification: input and/or transform.
        For chained transformations and for the DQ process to work seamlessly in batch
        and streaming mode, we have to figure out if the spec previous to the transform
        or dq spec (e.g., input spec or transform spec) refers to a batch read type or
a streaming read type.
Args:
input_read_types: dict of {input_spec_id: read_type}.
transform_input_ids: dict of {transform_spec_id: input_id}.
Returns:
Dict of {input_spec_id or transform_spec_id: read_type}
"""
combined_read_types = input_read_types
for spec_id, input_id in transform_input_ids.items():
combined_read_types[spec_id] = combined_read_types[input_id]
return combined_read_types
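    # Example (illustrative): with input_read_types == {"src": "streaming"} and
    # transform_input_ids == {"transformed": "src"}, the returned dict is
    # {"src": "streaming", "transformed": "streaming"}, i.e. a transform spec
    # inherits the read type of the spec it reads from.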
@staticmethod
def _verify_dq_rule_id_uniqueness(
data: OrderedDict, dq_specs: list[DQSpec]
) -> tuple[OrderedDict, dict[str, str]]:
"""Verify the uniqueness of dq_rule_id.
Verify the existence of duplicate dq_rule_id values
and prepare the DataFrame for the next stage.
Args:
            data: ordered dict of dataframes, keyed by spec id.
            dq_specs: a list of DQSpec to be validated.
        Returns:
            The processed dataframes and a dict with the duplicated dq_rule_id
            errors, if any exist.
"""
error_dict = PrismaUtils.validate_rule_id_duplication(dq_specs)
dq_processed_dfs = OrderedDict(data)
for spec in dq_specs:
df_processed_df = dq_processed_dfs[spec.input_id]
dq_processed_dfs[spec.spec_id] = df_processed_df
return dq_processed_dfs, error_dict
================================================
FILE: lakehouse_engine/algorithms/dq_validator.py
================================================
"""Module to define Data Validator class."""
from delta.tables import DeltaTable
from pyspark.sql import DataFrame
from pyspark.sql.utils import StreamingQueryException
from lakehouse_engine.algorithms.algorithm import Algorithm
from lakehouse_engine.core.definitions import DQSpec, DQValidatorSpec, InputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.dq_processors.dq_factory import DQFactory
from lakehouse_engine.dq_processors.exceptions import (
DQDuplicateRuleIdException,
DQValidationsFailedException,
)
from lakehouse_engine.io.reader_factory import ReaderFactory
from lakehouse_engine.utils.dq_utils import PrismaUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
class DQValidator(Algorithm):
"""Validate data using an algorithm configuration (ACON represented as dict).
    This algorithm focuses on isolating Data Quality Validations from loading,
applying a set of data quality functions to a specific input dataset,
without the need to define any output specification.
You can use any input specification compatible with the lakehouse engine
(dataframe, table, files, etc).
"""
_LOGGER = LoggingHandler(__name__).get_logger()
def __init__(self, acon: dict):
"""Construct DQValidator algorithm instances.
A data quality validator needs the following specifications to work properly:
- input specification (mandatory): specify how and what data to
read.
- data quality specification (mandatory): specify how to execute
the data quality process.
- restore_prev_version (optional): specify if, having
delta table/files as input, they should be restored to the
previous version if the data quality process fails. Note: this
is only considered if fail_on_error is kept as True.
Args:
acon: algorithm configuration.
"""
self.spec: DQValidatorSpec = DQValidatorSpec(
input_spec=InputSpec(**acon["input_spec"]),
dq_spec=self._get_dq_spec(acon["dq_spec"]),
restore_prev_version=acon.get("restore_prev_version", None),
)
def read(self) -> DataFrame:
"""Read data from an input location into a distributed dataframe.
Returns:
Dataframe with data that was read.
"""
current_df = ReaderFactory.get_data(self.spec.input_spec)
return current_df
def process_dq(self, data: DataFrame) -> DataFrame:
"""Process the data quality tasks for the data that was read.
It supports a single input dataframe.
It is possible to use data quality validators/expectations that will validate
your data and fail the process in case the expectations are not met. The DQ
process also generates and keeps updating a site containing the results of the
expectations that were done on your data. The location of the site is
configurable and can either be on file system or S3. If you define it to be
stored on S3, you can even configure your S3 bucket to serve the site so that
people can easily check the quality of your data. Moreover, it is also
possible to store the result of the DQ process into a defined result sink.
Args:
data: input dataframe on which to run the DQ process.
Returns:
Validated dataframe.
"""
return DQFactory.run_dq_process(self.spec.dq_spec, data)
def execute(self) -> None:
"""Define the algorithm execution behaviour."""
self._LOGGER.info("Starting read stage...")
read_df = self.read()
self._LOGGER.info("Starting data quality validator...")
self._LOGGER.info("Validating DQ definitions")
error_dict = PrismaUtils.validate_rule_id_duplication(specs=[self.spec.dq_spec])
if error_dict:
raise DQDuplicateRuleIdException(
"Duplicate dq_rule_id detected in dq_spec definition.\n"
"We have identified one or more duplicate dq_rule_id "
"entries in the dq_spec definition. "
"Please review and verify the following dq_rules:\n"
f"{error_dict}"
)
try:
if read_df.isStreaming:
# To handle streaming, and although we are not interested in
# writing any data, we still need to start the streaming and
# execute the data quality process in micro batches of data.
def write_dq_validator_micro_batch(
batch_df: DataFrame, batch_id: int
) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
self.process_dq(batch_df)
read_df.writeStream.trigger(once=True).foreachBatch(
write_dq_validator_micro_batch
).start().awaitTermination()
else:
self.process_dq(read_df)
except (DQValidationsFailedException, StreamingQueryException):
if not self.spec.input_spec.df_name and self.spec.restore_prev_version:
self._LOGGER.info("Restoring delta table/files to previous version...")
self._restore_prev_version()
raise DQValidationsFailedException(
"Data Quality Validations Failed! The delta "
"table/files were restored to the previous version!"
)
elif self.spec.dq_spec.fail_on_error:
raise DQValidationsFailedException("Data Quality Validations Failed!")
else:
self._LOGGER.info("Execution of the algorithm has finished!")
@staticmethod
def _get_dq_spec(input_dq_spec: dict) -> DQSpec:
"""Get data quality specification from acon.
Args:
input_dq_spec: data quality specification.
Returns:
Data quality spec.
"""
dq_spec, dq_functions, critical_functions = Algorithm.get_dq_spec(input_dq_spec)
dq_spec.dq_functions = dq_functions
dq_spec.critical_functions = critical_functions
return dq_spec
def _restore_prev_version(self) -> None:
"""Restore delta table or delta files to previous version."""
if self.spec.input_spec.db_table:
delta_table = DeltaTable.forName(
ExecEnv.SESSION, self.spec.input_spec.db_table
)
else:
delta_table = DeltaTable.forPath(
ExecEnv.SESSION, self.spec.input_spec.location
)
previous_version = (
delta_table.history().agg({"version": "max"}).collect()[0][0] - 1
)
delta_table.restoreToVersion(previous_version)
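# Illustrative usage sketch (not part of the original module): a minimal ACON for the
# DQValidator algorithm, assuming a delta table input and a single expectation. The
# exact set of fields accepted by InputSpec/DQSpec may require additional parameters
# (e.g. result sink configuration) depending on your setup, so treat the table name,
# column and field values below as assumptions for the example only.
_EXAMPLE_DQ_VALIDATOR_ACON = {
    "input_spec": {
        "spec_id": "sales_source",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.my_sales_table",
    },
    "dq_spec": {
        "spec_id": "dq_sales",
        "input_id": "sales_source",
        "dq_type": "validator",
        "fail_on_error": True,
        "dq_functions": [
            {
                "function": "expect_column_values_to_not_be_null",
                "args": {"column": "order_id"},
            }
        ],
    },
    "restore_prev_version": True,
}
# DQValidator(acon=_EXAMPLE_DQ_VALIDATOR_ACON).execute()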
================================================
FILE: lakehouse_engine/algorithms/exceptions.py
================================================
"""Package defining all the algorithm custom exceptions."""
class ReconciliationFailedException(Exception):
"""Exception for when the reconciliation process fails."""
pass
class NoNewDataException(Exception):
"""Exception for when no new data is available."""
pass
class SensorAlreadyExistsException(Exception):
"""Exception for when a sensor with same sensor id already exists."""
pass
class RestoreTypeNotFoundException(Exception):
"""Exception for when the restore type is not found."""
pass
================================================
FILE: lakehouse_engine/algorithms/gab.py
================================================
"""Module to define Gold Asset Builder algorithm behavior."""
import copy
from datetime import datetime, timedelta
import pendulum
from jinja2 import Template
from pyspark import Row
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from lakehouse_engine.algorithms.algorithm import Algorithm
from lakehouse_engine.core.definitions import (
GABCadence,
GABCombinedConfiguration,
GABDefaults,
GABKeys,
GABReplaceableKeys,
GABSpec,
GABStartOfWeek,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.gab_manager import GABCadenceManager, GABViewManager
from lakehouse_engine.core.gab_sql_generator import (
GABDeleteGenerator,
GABInsertGenerator,
)
from lakehouse_engine.utils.gab_utils import GABPartitionUtils, GABUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
class GAB(Algorithm):
"""Class representing the gold asset builder."""
_LOGGER = LoggingHandler(__name__).get_logger()
_SPARK_DEFAULT_PARALLELISM_CONFIG = (
"spark.sql.sources.parallelPartitionDiscovery.parallelism"
)
_SPARK_DEFAULT_PARALLELISM_VALUE = "10000"
def __init__(self, acon: dict):
"""Construct GAB instances.
Args:
acon: algorithm configuration.
"""
self.spec: GABSpec = GABSpec.create_from_acon(acon=acon)
def execute(self) -> None:
"""Execute the Gold Asset Builder."""
self._LOGGER.info(f"Reading {self.spec.lookup_table} as lkp_query_builder")
lookup_query_builder_df = ExecEnv.SESSION.read.table(self.spec.lookup_table)
ExecEnv.SESSION.read.table(self.spec.calendar_table).createOrReplaceTempView(
"df_cal"
)
self._LOGGER.info(f"Generating calendar from {self.spec.calendar_table}")
query_label = self.spec.query_label_filter
queue = self.spec.queue_filter
cadence = self.spec.cadence_filter
self._LOGGER.info(f"Query Label Filter {query_label}")
self._LOGGER.info(f"Queue Filter {queue}")
self._LOGGER.info(f"Cadence Filter {cadence}")
gab_path = self.spec.gab_base_path
self._LOGGER.info(f"Gab Base Path {gab_path}")
lookup_query_builder_df = lookup_query_builder_df.filter(
(
(lookup_query_builder_df.query_label.isin(query_label))
& (lookup_query_builder_df.queue.isin(queue))
& (lookup_query_builder_df.is_active != lit("N"))
)
)
cached = True
try:
lookup_query_builder_df.cache()
except Exception as e:
cached = False
self._LOGGER.warning(
"Could not cache lookup_query_builder_df dataframe. "
f"Continuing without caching. Exception: {e}"
)
for use_case in lookup_query_builder_df.collect():
self._process_use_case(
use_case=use_case,
lookup_query_builder=lookup_query_builder_df,
selected_cadences=cadence,
gab_path=gab_path,
)
if cached:
lookup_query_builder_df.unpersist()
def _process_use_case(
self,
use_case: Row,
lookup_query_builder: DataFrame,
selected_cadences: list[str],
gab_path: str,
) -> None:
"""Process each gab use case.
Args:
use_case: gab use case to process.
lookup_query_builder: gab configuration data.
selected_cadences: selected cadences to process.
gab_path: gab base path used to get the use case stages sql files.
"""
self._LOGGER.info(f"Executing use case: {use_case['query_label']}")
reconciliation = GABUtils.get_json_column_as_dict(
lookup_query_builder=lookup_query_builder,
query_id=use_case["query_id"],
query_column="recon_window",
)
self._LOGGER.info(f"reconcilation window - {reconciliation}")
configured_cadences = list(reconciliation.keys())
stages = GABUtils.get_json_column_as_dict(
lookup_query_builder=lookup_query_builder,
query_id=use_case["query_id"],
query_column="intermediate_stages",
)
self._LOGGER.info(f"intermediate stages - {stages}")
self._LOGGER.info(f"selected_cadences: {selected_cadences}")
self._LOGGER.info(f"configured_cadences: {configured_cadences}")
cadences = self._get_filtered_cadences(selected_cadences, configured_cadences)
self._LOGGER.info(f"filtered cadences - {cadences}")
latest_run_date, latest_config_date = self._get_latest_usecase_data(
use_case["query_id"]
)
self._LOGGER.info(f"latest_config_date: {latest_config_date}")
self._LOGGER.info(f"latest_run_date: - {latest_run_date}")
self._set_use_case_stage_template_file(stages, gab_path, use_case)
processed_cadences = []
for cadence in cadences:
is_cadence_processed = self._process_use_case_query_cadence(
cadence,
reconciliation,
use_case,
stages,
lookup_query_builder,
)
if is_cadence_processed:
processed_cadences.append(is_cadence_processed)
if processed_cadences:
self._generate_ddl(
latest_config_date=latest_config_date,
latest_run_date=latest_run_date,
query_id=use_case["query_id"],
lookup_query_builder=lookup_query_builder,
)
else:
self._LOGGER.info(
f"Skipping use case {use_case['query_label']}. No cadence processed "
"for the use case."
)
@classmethod
def _set_use_case_stage_template_file(
cls, stages: dict, gab_path: str, use_case: Row
) -> None:
"""Set templated file for each stage.
Args:
stages: use case stages with their configuration.
gab_path: gab base path used to get the use case stages SQL files.
use_case: gab use case to process.
"""
cls._LOGGER.info("Reading templated file for each stage...")
for i in range(1, len(stages) + 1):
stage = stages[str(i)]
stage_file_path = stage["file_path"]
full_path = gab_path + stage_file_path
cls._LOGGER.info(f"Stage file path is: {full_path}")
            with open(full_path, "r") as stage_file:
                file_read = stage_file.read()
            templated_file = file_read.replace(
                "replace_offset_value", str(use_case["timezone_offset"])
            )
stage["templated_file"] = templated_file
stage["full_file_path"] = full_path
def _process_use_case_query_cadence(
self,
cadence: str,
reconciliation: dict,
use_case: Row,
stages: dict,
lookup_query_builder: DataFrame,
) -> bool:
"""Identify use case reconciliation window and cadence.
Args:
cadence: cadence to process.
reconciliation: configured use case reconciliation window.
use_case: gab use case to process.
stages: use case stages with their configuration.
            lookup_query_builder: gab configuration data.
        Returns:
            True if the cadence was processed for the use case, False otherwise.
        """
selected_reconciliation_window = {}
selected_cadence = reconciliation.get(cadence)
self._LOGGER.info(f"Processing cadence: {cadence}")
self._LOGGER.info(f"Reconciliation Window - {selected_cadence}")
if selected_cadence:
selected_reconciliation_window = selected_cadence.get("recon_window")
self._LOGGER.info(f"{cadence}: {self.spec.start_date} - {self.spec.end_date}")
start_of_week = use_case["start_of_the_week"]
self._set_week_configuration_by_uc_start_of_week(start_of_week)
cadence_configuration_at_end_date = (
GABUtils.get_cadence_configuration_at_end_date(self.spec.end_date)
)
reconciliation_cadences = GABUtils().get_reconciliation_cadences(
cadence=cadence,
selected_reconciliation_window=selected_reconciliation_window,
cadence_configuration_at_end_date=cadence_configuration_at_end_date,
rerun_flag=self.spec.rerun_flag,
)
start_date_str = GABUtils.format_datetime_to_default(self.spec.start_date)
end_date_str = GABUtils.format_datetime_to_default(self.spec.end_date)
for reconciliation_cadence, snapshot_flag in reconciliation_cadences.items():
self._process_reconciliation_cadence(
reconciliation_cadence=reconciliation_cadence,
snapshot_flag=snapshot_flag,
cadence=cadence,
start_date_str=start_date_str,
end_date_str=end_date_str,
use_case=use_case,
lookup_query_builder=lookup_query_builder,
stages=stages,
)
return (cadence in reconciliation.keys()) or (
reconciliation_cadences is not None
)
def _process_reconciliation_cadence(
self,
reconciliation_cadence: str,
snapshot_flag: str,
cadence: str,
start_date_str: str,
end_date_str: str,
use_case: Row,
lookup_query_builder: DataFrame,
stages: dict,
) -> None:
"""Process use case reconciliation window.
Reconcile the pre-aggregated data to cover the late events.
Args:
reconciliation_cadence: reconciliation to process.
snapshot_flag: flag indicating if for this cadence the snapshot is enabled.
cadence: cadence to process.
start_date_str: start date of the period to process.
end_date_str: end date of the period to process.
use_case: gab use case to process.
lookup_query_builder: gab configuration data.
stages: use case stages with their configuration.
Example:
Cadence: week;
Reconciliation: monthly;
            This means that every weekend the previous week's aggregations are
            calculated, and at month end we reconcile the numbers calculated for
            the last 4 weeks to adjust them for late events.
"""
(
window_start_date,
window_end_date,
filter_start_date,
filter_end_date,
) = GABCadenceManager().extended_window_calculator(
cadence,
reconciliation_cadence,
self.spec.current_date,
start_date_str,
end_date_str,
use_case["query_type"],
self.spec.rerun_flag,
snapshot_flag,
)
if use_case["timezone_offset"]:
filter_start_date = filter_start_date + timedelta(
hours=use_case["timezone_offset"]
)
filter_end_date = filter_end_date + timedelta(
hours=use_case["timezone_offset"]
)
filter_start_date_str = GABUtils.format_datetime_to_default(filter_start_date)
filter_end_date_str = GABUtils.format_datetime_to_default(filter_end_date)
partition_end = GABUtils.format_datetime_to_default(
(window_end_date - timedelta(days=1))
)
window_start_date_str = GABUtils.format_datetime_to_default(window_start_date)
window_end_date_str = GABUtils.format_datetime_to_default(window_end_date)
partition_filter = GABPartitionUtils.get_partition_condition(
filter_start_date_str, partition_end
)
self._LOGGER.info(
"extended window for start and end dates are: "
f"{filter_start_date_str} - {filter_end_date_str}"
)
unpersist_list = []
for i in range(1, len(stages) + 1):
stage = stages[str(i)]
templated_file = stage["templated_file"]
stage_file_path = stage["full_file_path"]
templated = self._process_use_case_query_step(
stage=stages[str(i)],
templated_file=templated_file,
use_case=use_case,
reconciliation_cadence=reconciliation_cadence,
cadence=cadence,
snapshot_flag=snapshot_flag,
window_start_date=window_start_date_str,
partition_end=partition_end,
filter_start_date=filter_start_date_str,
filter_end_date=filter_end_date_str,
partition_filter=partition_filter,
)
temp_stage_view_name = self._create_stage_view(
templated,
stages[str(i)],
window_start_date_str,
window_end_date_str,
use_case["query_id"],
use_case["query_label"],
cadence,
stage_file_path,
)
unpersist_list.append(temp_stage_view_name)
insert_success = self._generate_view_statement(
query_id=use_case["query_id"],
cadence=cadence,
temp_stage_view_name=temp_stage_view_name,
lookup_query_builder=lookup_query_builder,
window_start_date=window_start_date_str,
window_end_date=window_end_date_str,
query_label=use_case["query_label"],
)
self._LOGGER.info(f"Inserted data to generate the view: {insert_success}")
self._unpersist_cached_views(unpersist_list)
def _process_use_case_query_step(
self,
stage: dict,
templated_file: str,
use_case: Row,
reconciliation_cadence: str,
cadence: str,
snapshot_flag: str,
window_start_date: str,
partition_end: str,
filter_start_date: str,
filter_end_date: str,
partition_filter: str,
) -> str:
"""Process each use case step.
        Process any intermediate view defined in the gab configuration table as a
        step of the use case.
Args:
stage: stage to process.
templated_file: sql file to process at this stage.
use_case: gab use case to process.
reconciliation_cadence: configured use case reconciliation window.
cadence: cadence to process.
snapshot_flag: flag indicating if for this cadence the snapshot is enabled.
window_start_date: start date for the configured stage.
partition_end: end date for the configured stage.
filter_start_date: filter start date to replace in the stage query.
filter_end_date: filter end date to replace in the stage query.
            partition_filter: partition condition.
        Returns:
            The rendered SQL for the stage.
        """
filter_col = stage["project_date_column"]
if stage["filter_date_column"]:
filter_col = stage["filter_date_column"]
        # dummy value to avoid an error if the column is empty in the configuration
project_col = stage.get("project_date_column", "X")
gab_base_configuration_copy = copy.deepcopy(
GABCombinedConfiguration.COMBINED_CONFIGURATION.value
)
for item in gab_base_configuration_copy.values():
self._update_rendered_item_cadence(
reconciliation_cadence, cadence, project_col, item # type: ignore
)
(
rendered_date,
rendered_to_date,
join_condition,
) = self._get_cadence_configuration(
gab_base_configuration_copy,
cadence,
reconciliation_cadence,
snapshot_flag,
use_case["start_of_the_week"],
project_col,
window_start_date,
partition_end,
)
rendered_file = self._render_template_query(
templated=templated_file,
cadence=cadence,
start_of_the_week=use_case["start_of_the_week"],
query_id=use_case["query_id"],
rendered_date=rendered_date,
filter_start_date=filter_start_date,
filter_end_date=filter_end_date,
filter_col=filter_col,
timezone_offset=use_case["timezone_offset"],
join_condition=join_condition,
partition_filter=partition_filter,
rendered_to_date=rendered_to_date,
)
return rendered_file
@classmethod
def _get_filtered_cadences(
cls, selected_cadences: list[str], configured_cadences: list[str]
) -> list[str]:
"""Get filtered cadences.
Get the intersection of user selected cadences and use case configured cadences.
Args:
selected_cadences: user selected cadences.
configured_cadences: use case configured cadences.
"""
return (
configured_cadences
if "All" in selected_cadences
else GABCadence.order_cadences(
list(set(selected_cadences).intersection(configured_cadences))
)
)
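    # Example (illustrative): selected_cadences == ["WEEK", "MONTH"] and
    # configured_cadences == ["DAY", "WEEK"] yields ["WEEK"] (the intersection,
    # ordered by GABCadence.order_cadences), while selected_cadences == ["All"]
    # simply returns every configured cadence.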
def _get_latest_usecase_data(self, query_id: str) -> tuple[datetime, datetime]:
"""Get latest use case data.
Args:
query_id: use case query id.
"""
return (
self._get_latest_run_date(query_id),
self._get_latest_use_case_date(query_id),
)
def _get_latest_run_date(self, query_id: str) -> datetime:
"""Get latest use case run date.
Args:
query_id: use case query id.
"""
last_success_run_sql = """
SELECT run_start_time
FROM {database}.gab_log_events
WHERE query_id = {query_id}
AND stage_name = 'Final Insert'
AND status = 'Success'
ORDER BY 1 DESC
LIMIT 1
""".format( # nosec: B608
database=self.spec.target_database, query_id=query_id
)
try:
latest_run_date: datetime = ExecEnv.SESSION.sql(
last_success_run_sql
).collect()[0][0]
except Exception:
latest_run_date = datetime.strptime(
"2020-01-01", GABDefaults.DATE_FORMAT.value
)
return latest_run_date
def _get_latest_use_case_date(self, query_id: str) -> datetime:
"""Get latest use case configured date.
Args:
query_id: use case query id.
"""
query_config_sql = """
SELECT lh_created_on
FROM {lkp_query_builder}
WHERE query_id = {query_id}
""".format( # nosec: B608
lkp_query_builder=self.spec.lookup_table,
query_id=query_id,
)
latest_config_date: datetime = ExecEnv.SESSION.sql(query_config_sql).collect()[
0
][0]
return latest_config_date
@classmethod
def _set_week_configuration_by_uc_start_of_week(cls, start_of_week: str) -> None:
"""Set week configuration by use case start of week.
Args:
start_of_week: use case start of week (MONDAY or SUNDAY).
"""
if start_of_week.upper() == "MONDAY":
pendulum.week_starts_at(pendulum.MONDAY)
pendulum.week_ends_at(pendulum.SUNDAY)
elif start_of_week.upper() == "SUNDAY":
pendulum.week_starts_at(pendulum.SUNDAY)
pendulum.week_ends_at(pendulum.SATURDAY)
else:
raise NotImplementedError(
f"The requested {start_of_week} is not implemented."
"Supported `start_of_week` values: [MONDAY, SUNDAY]"
)
@classmethod
def _update_rendered_item_cadence(
cls, reconciliation_cadence: str, cadence: str, project_col: str, item: dict
) -> None:
"""Override item properties based in the rendered item cadence.
Args:
reconciliation_cadence: configured use case reconciliation window.
cadence: cadence to process.
project_col: use case projection date column name.
item: predefined use case combination.
"""
rendered_item = cls._get_rendered_item_cadence(
reconciliation_cadence, cadence, project_col, item
)
item["join_select"] = rendered_item["join_select"]
item["project_start"] = rendered_item["project_start"]
item["project_end"] = rendered_item["project_end"]
@classmethod
def _get_rendered_item_cadence(
cls, reconciliation_cadence: str, cadence: str, project_col: str, item: dict
) -> dict:
"""Update pre-configured gab parameters with use case data.
Args:
reconciliation_cadence: configured use case reconciliation window.
cadence: cadence to process.
project_col: use case projection date column name.
item: predefined use case combination.
"""
return {
GABKeys.JOIN_SELECT: (
item[GABKeys.JOIN_SELECT]
.replace(GABReplaceableKeys.CONFIG_WEEK_START, "Monday")
.replace(
GABReplaceableKeys.RECONCILIATION_CADENCE,
reconciliation_cadence,
)
.replace(GABReplaceableKeys.CADENCE, cadence)
),
GABKeys.PROJECT_START: (
item[GABKeys.PROJECT_START]
.replace(GABReplaceableKeys.CADENCE, cadence)
.replace(GABReplaceableKeys.DATE_COLUMN, project_col)
),
GABKeys.PROJECT_END: (
item[GABKeys.PROJECT_END]
.replace(GABReplaceableKeys.CADENCE, cadence)
.replace(GABReplaceableKeys.DATE_COLUMN, project_col)
),
}
@classmethod
def _get_cadence_configuration(
cls,
use_case_configuration: dict,
cadence: str,
reconciliation_cadence: str,
snapshot_flag: str,
start_of_week: str,
project_col: str,
window_start_date: str,
partition_end: str,
) -> tuple[str, str, str]:
"""Get use case configuration fields to replace pre-configured parameters.
Args:
use_case_configuration: use case configuration.
cadence: cadence to process.
            reconciliation_cadence: cadence to be reconciled.
snapshot_flag: flag indicating if for this cadence the snapshot is enabled.
start_of_week: use case start of week (MONDAY or SUNDAY).
project_col: use case projection date column name.
window_start_date: start date for the configured stage.
partition_end: end date for the configured stage.
Returns:
rendered_from_date: projection start date.
rendered_to_date: projection end date.
join_condition: string containing the join condition to replace in the
templated query by jinja substitution.
"""
cadence_dict = next(
(
dict(configuration)
for configuration in use_case_configuration.values()
if (
(cadence in configuration["cadence"])
and (reconciliation_cadence in configuration["recon"])
and (snapshot_flag in configuration["snap_flag"])
and (
GABStartOfWeek.get_start_of_week()[start_of_week.upper()]
in configuration["week_start"]
)
)
),
None,
)
rendered_from_date = None
rendered_to_date = None
join_condition = None
if cadence_dict:
rendered_from_date = (
cadence_dict[GABKeys.PROJECT_START]
.replace(GABReplaceableKeys.CADENCE, cadence)
.replace(GABReplaceableKeys.DATE_COLUMN, project_col)
)
rendered_to_date = (
cadence_dict[GABKeys.PROJECT_END]
.replace(GABReplaceableKeys.CADENCE, cadence)
.replace(GABReplaceableKeys.DATE_COLUMN, project_col)
)
if cadence_dict[GABKeys.JOIN_SELECT]:
join_condition = """
inner join (
{join_select} from df_cal
where calendar_date
between '{bucket_start}' and '{bucket_end}'
)
df_cal on date({date_column})
between df_cal.cadence_start_date and df_cal.cadence_end_date
""".format(
join_select=cadence_dict[GABKeys.JOIN_SELECT],
bucket_start=window_start_date,
bucket_end=partition_end,
date_column=project_col,
)
return rendered_from_date, rendered_to_date, join_condition
def _render_template_query(
self,
templated: str,
cadence: str,
start_of_the_week: str,
query_id: str,
rendered_date: str,
filter_start_date: str,
filter_end_date: str,
filter_col: str,
timezone_offset: str,
join_condition: str,
partition_filter: str,
rendered_to_date: str,
) -> str:
"""Replace jinja templated parameters in the SQL with the actual data.
Args:
templated: templated sql file to process at this stage.
cadence: cadence to process.
start_of_the_week: use case start of week (MONDAY or SUNDAY).
query_id: gab configuration table use case identifier.
rendered_date: projection start date.
filter_start_date: filter start date to replace in the stage query.
filter_end_date: filter end date to replace in the stage query.
filter_col: use case projection date column name.
timezone_offset: timezone offset configured in the use case.
join_condition: string containing the join condition.
partition_filter: partition condition.
rendered_to_date: projection end date.
"""
return Template(templated).render(
cadence="'{cadence}' as cadence".format(cadence=cadence),
cadence_run=cadence,
week_start=start_of_the_week,
query_id="'{query_id}' as query_id".format(query_id=query_id),
project_date_column=rendered_date,
target_table=self.spec.target_table,
database=self.spec.source_database,
start_date=filter_start_date,
end_date=filter_end_date,
filter_date_column=filter_col,
offset_value=timezone_offset,
joins=join_condition if join_condition else "",
partition_filter=partition_filter,
to_date=rendered_to_date,
)
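    # Illustrative example (hypothetical SQL, not shipped with the engine): a stage
    # file such as
    #   SELECT {{ query_id }}, {{ cadence }}, {{ to_date }}
    #   FROM {{ database }}.orders
    #   WHERE {{ filter_date_column }} BETWEEN '{{ start_date }}' AND '{{ end_date }}'
    #   {{ joins }}
    # is rendered with the values passed above; e.g. "{{ cadence }}" becomes
    # "'WEEK' as cadence" and "{{ query_id }}" becomes "'<id>' as query_id".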
def _create_stage_view(
self,
rendered_template: str,
stage: dict,
window_start_date: str,
window_end_date: str,
query_id: str,
query_label: str,
cadence: str,
stage_file_path: str,
) -> str:
"""Create each use case stage view.
        Each stage has a specific order and refers to a specific SQL to be executed.
Args:
rendered_template: rendered stage SQL file.
stage: stage to process.
window_start_date: start date for the configured stage.
window_end_date: end date for the configured stage.
query_id: gab configuration table use case identifier.
query_label: gab configuration table use case name.
cadence: cadence to process.
stage_file_path: full stage file path (gab path + stage path).
"""
run_start_time = datetime.now()
creation_status: str
error_message: Exception | str
try:
tmp = ExecEnv.SESSION.sql(rendered_template)
num_partitions = ExecEnv.SESSION.conf.get(
self._SPARK_DEFAULT_PARALLELISM_CONFIG,
self._SPARK_DEFAULT_PARALLELISM_VALUE,
)
if stage["repartition"]:
if stage["repartition"].get("numPartitions"):
num_partitions = stage["repartition"]["numPartitions"]
if stage["repartition"].get("keys"):
tmp = tmp.repartition(
int(num_partitions), *stage["repartition"]["keys"]
)
self._LOGGER.info("Repartitioned on given Key(s)")
else:
tmp = tmp.repartition(int(num_partitions))
self._LOGGER.info("Repartitioned on given partition count")
temp_step_view_name: str = stage["table_alias"]
tmp.createOrReplaceTempView(temp_step_view_name)
if stage["storage_level"]:
ExecEnv.SESSION.sql(
"CACHE TABLE {tbl} "
"OPTIONS ('storageLevel' '{type}')".format(
tbl=temp_step_view_name,
type=stage["storage_level"],
)
)
ExecEnv.SESSION.sql(
"SELECT COUNT(*) FROM {tbl}".format( # nosec: B608
tbl=temp_step_view_name
)
)
self._LOGGER.info(f"Cached stage view - {temp_step_view_name} ")
creation_status = "Success"
error_message = "NA"
except Exception as err:
creation_status = "Failed"
error_message = err
raise err
finally:
run_end_time = datetime.now()
GABUtils().logger(
run_start_time,
run_end_time,
window_start_date,
window_end_date,
query_id,
query_label,
cadence,
stage_file_path,
rendered_template,
creation_status,
error_message,
self.spec.target_database,
)
return temp_step_view_name
def _generate_view_statement(
self,
query_id: str,
cadence: str,
temp_stage_view_name: str,
lookup_query_builder: DataFrame,
window_start_date: str,
window_end_date: str,
query_label: str,
) -> bool:
"""Feed use case data to the insights table (default: unified use case table).
Args:
query_id: gab configuration table use case identifier.
cadence: cadence to process.
temp_stage_view_name: name of the temp view generated by the stage.
lookup_query_builder: gab configuration data.
window_start_date: start date for the configured stage.
window_end_date: end date for the configured stage.
query_label: gab configuration table use case name.
"""
run_start_time = datetime.now()
creation_status: str
error_message: Exception | str
GABDeleteGenerator(
query_id=query_id,
cadence=cadence,
temp_stage_view_name=temp_stage_view_name,
lookup_query_builder=lookup_query_builder,
target_database=self.spec.target_database,
target_table=self.spec.target_table,
).generate_sql()
gen_ins = GABInsertGenerator(
query_id=query_id,
cadence=cadence,
final_stage_table=temp_stage_view_name,
lookup_query_builder=lookup_query_builder,
target_database=self.spec.target_database,
target_table=self.spec.target_table,
).generate_sql()
try:
ExecEnv.SESSION.sql(gen_ins)
creation_status = "Success"
error_message = "NA"
inserted = True
except Exception as err:
creation_status = "Failed"
error_message = err
raise
finally:
run_end_time = datetime.now()
GABUtils().logger(
run_start_time,
run_end_time,
window_start_date,
window_end_date,
query_id,
query_label,
cadence,
"Final Insert",
gen_ins,
creation_status,
error_message,
self.spec.target_database,
)
return inserted
@classmethod
def _unpersist_cached_views(cls, unpersist_list: list[str]) -> None:
"""Unpersist cached views.
Args:
unpersist_list: list containing the view names to unpersist.
"""
[
ExecEnv.SESSION.sql("UNCACHE TABLE {tbl}".format(tbl=i))
for i in unpersist_list
]
def _generate_ddl(
self,
latest_config_date: datetime,
latest_run_date: datetime,
query_id: str,
lookup_query_builder: DataFrame,
) -> None:
"""Generate the actual gold asset.
        It creates the view containing all specified dimensions, metrics and computed
        metrics for each cadence/reconciliation window.
Args:
latest_config_date: latest use case configuration date.
latest_run_date: latest use case run date.
query_id: gab configuration table use case identifier.
lookup_query_builder: gab configuration data.
"""
if str(latest_config_date) > str(latest_run_date):
GABViewManager(
query_id=query_id,
lookup_query_builder=lookup_query_builder,
target_database=self.spec.target_database,
target_table=self.spec.target_table,
).generate_use_case_views()
else:
self._LOGGER.info(
"View is not being re-created as there are no changes in the "
"configuration after the latest run"
)
================================================
FILE: lakehouse_engine/algorithms/reconciliator.py
================================================
"""Module containing the Reconciliator class."""
from enum import Enum
from typing import List
import pyspark.sql.functions as spark_fns
from pyspark.sql import DataFrame
from pyspark.sql.functions import ( # noqa: A004
abs,
coalesce,
col,
lit,
try_divide,
when,
)
from pyspark.sql.types import FloatType
from lakehouse_engine.algorithms.exceptions import ReconciliationFailedException
from lakehouse_engine.core.definitions import InputSpec, ReconciliatorSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.executable import Executable
from lakehouse_engine.io.reader_factory import ReaderFactory
from lakehouse_engine.transformers.optimizers import Optimizers
from lakehouse_engine.utils.logging_handler import LoggingHandler
class ReconciliationType(Enum):
"""Type of Reconciliation."""
PCT = "percentage"
ABS = "absolute"
class ReconciliationTransformers(Enum):
"""Transformers Available for the Reconciliation Algorithm."""
AVAILABLE_TRANSFORMERS = {
"cache": Optimizers.cache,
"persist": Optimizers.persist,
}
class Reconciliator(Executable):
"""Class to define the behavior of an algorithm that checks if data reconciles.
Checking if data reconciles, using this algorithm, is a matter of reading the
'truth' data and the 'current' data. You can use any input specification compatible
with the lakehouse engine to read 'truth' or 'current' data. On top of that, you
can pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can
preprocess the data before it goes into the actual reconciliation process.
Moreover, you can use the 'truth_preprocess_query_args' and
'current_preprocess_query_args' to pass additional arguments to be used to apply
additional operations on top of the dataframe, resulting from the previous steps.
With these arguments you can apply additional operations like caching or persisting
the Dataframe. The way to pass the additional arguments for the operations is
similar to the TransformSpec, but only a few operations are allowed. Those are
defined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.
The reconciliation process is focused on joining 'truth' with 'current' by all
provided columns except the ones passed as 'metrics'. After that it calculates the
differences in the metrics attributes (either percentage or absolute difference).
Finally, it aggregates the differences, using the supplied aggregation function
(e.g., sum, avg, min, max, etc).
All of these configurations are passed via the ACON to instantiate a
ReconciliatorSpec object.
!!! note
It is crucial that both the current and truth datasets have exactly the same
structure.
!!! note
        You should not use 0 as yellow or red threshold, as the algorithm will verify
        if the difference between the truth and current values is greater than or
        equal to those thresholds.
!!! note
The reconciliation does not produce any negative values or percentages, as we
use the absolute value of the differences. This means that the recon result
will not indicate if it was the current values that were bigger or smaller
than the truth values, or vice versa.
"""
_logger = LoggingHandler(__name__).get_logger()
def __init__(self, acon: dict):
"""Construct Algorithm instances.
Args:
acon: algorithm configuration.
"""
self.spec: ReconciliatorSpec = ReconciliatorSpec(
metrics=acon["metrics"],
truth_input_spec=InputSpec(**acon["truth_input_spec"]),
current_input_spec=InputSpec(**acon["current_input_spec"]),
truth_preprocess_query=acon.get("truth_preprocess_query", None),
truth_preprocess_query_args=acon.get("truth_preprocess_query_args", None),
current_preprocess_query=acon.get("current_preprocess_query", None),
current_preprocess_query_args=acon.get(
"current_preprocess_query_args", None
),
ignore_empty_df=acon.get("ignore_empty_df", False),
)
def get_source_of_truth(self) -> DataFrame:
"""Get the source of truth (expected result) for the reconciliation process.
Returns:
DataFrame containing the source of truth.
"""
truth_df = ReaderFactory.get_data(self.spec.truth_input_spec)
if self.spec.truth_preprocess_query:
truth_df.createOrReplaceTempView("truth")
truth_df = ExecEnv.SESSION.sql(self.spec.truth_preprocess_query)
return truth_df
def get_current_results(self) -> DataFrame:
"""Get the current results from the table that we are checking if it reconciles.
Returns:
DataFrame containing the current results.
"""
current_df = ReaderFactory.get_data(self.spec.current_input_spec)
if self.spec.current_preprocess_query:
current_df.createOrReplaceTempView("current")
current_df = ExecEnv.SESSION.sql(self.spec.current_preprocess_query)
return current_df
def execute(self) -> None:
"""Reconcile the current results against the truth dataset."""
truth_df = self.get_source_of_truth()
self._apply_preprocess_query_args(
truth_df, self.spec.truth_preprocess_query_args
)
self._logger.info("Source of truth:")
truth_df.show(1000, truncate=False)
current_results_df = self.get_current_results()
self._apply_preprocess_query_args(
current_results_df, self.spec.current_preprocess_query_args
)
self._logger.info("Current results:")
current_results_df.show(1000, truncate=False)
status = "green"
# if ignore_empty_df is true, run empty check on truth_df and current_results_df
# if both the dataframes are empty then exit with green
if (
self.spec.ignore_empty_df
and truth_df.isEmpty()
and current_results_df.isEmpty()
):
self._logger.info(
f"ignore_empty_df is {self.spec.ignore_empty_df}, "
f"truth_df and current_results_df are empty, "
f"hence ignoring reconciliation"
)
self._logger.info("The Reconciliation process has succeeded.")
return
recon_results = self._get_recon_results(
truth_df, current_results_df, self.spec.metrics
)
self._logger.info(f"Reconciliation result: {recon_results}")
for m in self.spec.metrics:
metric_name = f"{m['metric']}_{m['type']}_diff_{m['aggregation']}"
if m["yellow"] <= recon_results[metric_name] < m["red"]:
if status == "green":
# only switch to yellow if it was green before, otherwise we want
# to preserve 'red' as the final status.
status = "yellow"
elif m["red"] <= recon_results[metric_name]:
status = "red"
if status != "green":
raise ReconciliationFailedException(
f"The Reconciliation process has failed with status: {status}."
)
else:
self._logger.info("The Reconciliation process has succeeded.")
@staticmethod
def _apply_preprocess_query_args(
df: DataFrame, preprocess_query_args: List[dict]
) -> DataFrame:
"""Apply transformers on top of the preprocessed query.
Args:
df: dataframe being transformed.
            preprocess_query_args: list of dicts with the functions/transformations
                to apply and their respective arguments.
Returns: the transformed Dataframe.
"""
transformed_df = df
if preprocess_query_args is None:
try:
transformed_df = df.transform(Optimizers.cache())
except Exception as e:
Reconciliator._logger.warning(
f"Could not apply default caching to the dataframe."
f"Continuing without caching. Exception: {e}"
)
elif len(preprocess_query_args) > 0:
for transformation in preprocess_query_args:
rec_func = ReconciliationTransformers.AVAILABLE_TRANSFORMERS.value[
transformation["function"]
](
**transformation.get("args", {})
) # type: ignore
transformed_df = df.transform(rec_func)
else:
transformed_df = df
return transformed_df
def _get_recon_results(
self, truth_df: DataFrame, current_results_df: DataFrame, metrics: List[dict]
) -> dict:
"""Get the reconciliation results by comparing truth_df with current_results_df.
Args:
truth_df: dataframe with the truth data to reconcile against. It is
typically an aggregated dataset to use as baseline and then we match the
current_results_df (Aggregated at the same level) against this truth.
current_results_df: dataframe with the current results of the dataset we
are trying to reconcile.
metrics: list of dicts containing metric, aggregation, yellow threshold and
red threshold.
        Returns:
dictionary with the results (difference between truth and current results)
"""
if len(truth_df.head(1)) == 0 or len(current_results_df.head(1)) == 0:
raise ReconciliationFailedException(
"The reconciliation has failed because either the truth dataset or the "
"current results dataset was empty."
)
# truth and current are joined on all columns except the metrics
joined_df = truth_df.alias("truth").join(
current_results_df.alias("current"),
[
truth_df[c] == current_results_df[c]
for c in current_results_df.columns
if c not in [m["metric"] for m in metrics]
],
how="full",
)
for m in metrics:
if m["type"] == ReconciliationType.PCT.value:
joined_df = joined_df.withColumn(
f"{m['metric']}_{m['type']}_diff",
coalesce(
(
# we need to make sure we don't produce negative values
# because our thresholds only accept > or >= comparisons.
abs(
try_divide(
(
col(f"current.{m['metric']}")
- col(f"truth.{m['metric']}")
),
abs(col(f"truth.{m['metric']}")),
)
)
),
# if the formula above produces null, we need to consider where
# it came from: we check below if the values were the same,
# and if so the diff is 0, if not the diff is 1 (e.g., the null
# result might have come from a division by 0).
when(
col(f"current.{m['metric']}").eqNullSafe(
col(f"truth.{m['metric']}")
),
lit(0),
).otherwise(lit(1)),
),
)
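                # Worked example (illustrative): truth = 100 and current = 90 gives
                # abs((90 - 100) / abs(100)) = 0.1; truth = 0 and current = 5 makes
                # try_divide return null, so the coalesce fallback kicks in and the
                # diff becomes 1 (values differ), or 0 when both sides are null-safe
                # equal.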
elif m["type"] == ReconciliationType.ABS.value:
joined_df = joined_df.withColumn(
f"{m['metric']}_{m['type']}_diff",
abs(
coalesce(col(f"current.{m['metric']}"), lit(0))
- coalesce(col(f"truth.{m['metric']}"), lit(0))
),
)
else:
raise NotImplementedError(
"The requested reconciliation type is not yet implemented."
)
joined_df = joined_df.withColumn(
f"{m['metric']}_{m['type']}_diff",
col(f"{m['metric']}_{m['type']}_diff").cast(FloatType()),
)
results_df = joined_df.agg(
*[
getattr(spark_fns, m["aggregation"])(
f"{m['metric']}_{m['type']}_diff"
).alias(f"{m['metric']}_{m['type']}_diff_{m['aggregation']}")
for m in metrics
]
)
return results_df.collect()[0].asDict()
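# Illustrative usage sketch (not part of the original module): a minimal ACON for the
# Reconciliator, assuming both datasets are delta tables aggregated at the same level.
# Table names and threshold values below are assumptions for the example only.
_EXAMPLE_RECONCILIATOR_ACON = {
    "metrics": [
        {
            "metric": "net_sales",
            "type": "percentage",
            "aggregation": "max",
            "yellow": 0.05,
            "red": 0.1,
        }
    ],
    "truth_input_spec": {
        "spec_id": "truth",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.sales_truth",
    },
    "current_input_spec": {
        "spec_id": "current",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.sales_current",
    },
    "ignore_empty_df": False,
}
# With these thresholds, a max percentage difference of 0.07 raises
# ReconciliationFailedException with status "yellow", and 0.12 with status "red".
# Reconciliator(acon=_EXAMPLE_RECONCILIATOR_ACON).execute()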
================================================
FILE: lakehouse_engine/algorithms/sensor.py
================================================
"""Module to define Sensor algorithm behavior."""
from pyspark.sql import DataFrame
from lakehouse_engine.algorithms.algorithm import Algorithm
from lakehouse_engine.algorithms.exceptions import (
NoNewDataException,
SensorAlreadyExistsException,
)
from lakehouse_engine.core.definitions import (
SENSOR_ALLOWED_DATA_FORMATS,
InputFormat,
ReadType,
SensorSpec,
SensorStatus,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.sensor_manager import (
SensorControlTableManager,
SensorUpstreamManager,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Sensor(Algorithm):
"""Class representing a sensor to check if the upstream has new data."""
_LOGGER = LoggingHandler(__name__).get_logger()
def __init__(self, acon: dict):
"""Construct Sensor instances.
Args:
acon: algorithm configuration.
"""
self.spec: SensorSpec = SensorSpec.create_from_acon(acon=acon)
self._validate_sensor_spec()
if self._check_if_sensor_already_exists():
raise SensorAlreadyExistsException(
"There's already a sensor registered with same id or assets!"
)
def execute(self) -> bool:
"""Execute the sensor."""
self._LOGGER.info(f"Starting {self.spec.input_spec.data_format} sensor...")
new_data_df = SensorUpstreamManager.read_new_data(sensor_spec=self.spec)
if self.spec.input_spec.read_type == ReadType.STREAMING.value:
Sensor._run_streaming_sensor(sensor_spec=self.spec, new_data_df=new_data_df)
elif self.spec.input_spec.read_type == ReadType.BATCH.value:
Sensor._run_batch_sensor(
sensor_spec=self.spec,
new_data_df=new_data_df,
)
has_new_data = SensorControlTableManager.check_if_sensor_has_acquired_data(
self.spec.sensor_id,
self.spec.control_db_table_name,
)
self._LOGGER.info(
f"Sensor {self.spec.sensor_id} has previously "
f"acquired data? {has_new_data}"
)
if self.spec.fail_on_empty_result and not has_new_data:
raise NoNewDataException(
f"No data was acquired by {self.spec.sensor_id} sensor."
)
return has_new_data
def _check_if_sensor_already_exists(self) -> bool:
"""Check if sensor already exists in the table to avoid duplicates."""
row = SensorControlTableManager.read_sensor_table_data(
sensor_id=self.spec.sensor_id,
control_db_table_name=self.spec.control_db_table_name,
)
if row and row.assets != self.spec.assets:
return True
else:
row = SensorControlTableManager.read_sensor_table_data(
assets=self.spec.assets,
control_db_table_name=self.spec.control_db_table_name,
)
return row is not None and row.sensor_id != self.spec.sensor_id
@classmethod
def _run_streaming_sensor(
cls, sensor_spec: SensorSpec, new_data_df: DataFrame
) -> None:
"""Run sensor in streaming mode (internally runs in batch mode)."""
def foreach_batch_check_new_data(df: DataFrame, batch_id: int) -> None:
# forcing session to be available inside forEachBatch on
# Spark Connect
ExecEnv.get_or_create()
Sensor._run_batch_sensor(
sensor_spec=sensor_spec,
new_data_df=df,
)
new_data_df.writeStream.trigger(availableNow=True).option(
"checkpointLocation", sensor_spec.checkpoint_location
).foreachBatch(foreach_batch_check_new_data).start().awaitTermination()
@classmethod
def _run_batch_sensor(
cls,
sensor_spec: SensorSpec,
new_data_df: DataFrame,
) -> None:
"""Run sensor in batch mode.
Args:
sensor_spec: sensor spec containing all sensor information.
new_data_df: DataFrame possibly containing new data.
"""
new_data_first_row = SensorUpstreamManager.get_new_data(new_data_df)
cls._LOGGER.info(
f"Sensor {sensor_spec.sensor_id} has new data from upstream? "
f"{new_data_first_row is not None}"
)
if new_data_first_row:
SensorControlTableManager.update_sensor_status(
sensor_spec=sensor_spec,
status=SensorStatus.ACQUIRED_NEW_DATA.value,
upstream_key=(
new_data_first_row.UPSTREAM_KEY
if "UPSTREAM_KEY" in new_data_df.columns
else None
),
upstream_value=(
new_data_first_row.UPSTREAM_VALUE
if "UPSTREAM_VALUE" in new_data_df.columns
else None
),
)
cls._LOGGER.info(
f"Successfully updated sensor status for sensor "
f"{sensor_spec.sensor_id}..."
)
def _validate_sensor_spec(self) -> None:
"""Validate if sensor spec Read Type is allowed for the selected Data Format."""
if InputFormat.exists(self.spec.input_spec.data_format):
if (
self.spec.input_spec.data_format
not in SENSOR_ALLOWED_DATA_FORMATS[self.spec.input_spec.read_type]
):
raise NotImplementedError(
f"A sensor has not been implemented yet for this data format or, "
f"this data format is not available for the read_type"
f" {self.spec.input_spec.read_type}. "
f"Check the allowed combinations of read_type and data_formats:"
f" {SENSOR_ALLOWED_DATA_FORMATS}"
)
else:
raise NotImplementedError(
f"Data format {self.spec.input_spec.data_format} isn't implemented yet."
)
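# Illustrative usage sketch (not part of the original module): a minimal ACON for a
# streaming Sensor watching a delta table upstream. The field names mirror the ones
# used by the Heartbeat algorithm when it builds sensor ACONs; the concrete table
# names, paths and values are assumptions for the example only.
_EXAMPLE_SENSOR_ACON = {
    "sensor_id": "my_upstream_sensor",
    "assets": ["my_downstream_asset"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "delta",
        "db_table": "my_database.upstream_table",
    },
    "preprocess_query": None,
    "base_checkpoint_location": "s3://my-bucket/sensors/checkpoints",
    "fail_on_empty_result": False,
}
# Sensor(acon=_EXAMPLE_SENSOR_ACON).execute()  # returns True when new data was found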
================================================
FILE: lakehouse_engine/algorithms/sensors/__init__.py
================================================
"""Package containing all the lakehouse engine Sensor Heartbeat algorithms."""
================================================
FILE: lakehouse_engine/algorithms/sensors/heartbeat.py
================================================
"""Module to define Heartbeat Sensor algorithm behavior."""
import re
from typing import Optional
from delta import DeltaTable
from pyspark import Row
from pyspark.sql import DataFrame
from pyspark.sql.column import Column
from pyspark.sql.functions import (
col,
concat_ws,
count,
current_timestamp,
lit,
regexp_replace,
row_number,
trim,
upper,
)
from pyspark.sql.window import Window
from lakehouse_engine.algorithms.algorithm import Algorithm
from lakehouse_engine.algorithms.sensors.sensor import Sensor
from lakehouse_engine.core.definitions import (
HEARTBEAT_SENSOR_UPDATE_SET,
HeartbeatConfigSpec,
HeartbeatSensorSource,
HeartbeatStatus,
SensorStatus,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.sensor_manager import (
SensorJobRunManager,
SensorUpstreamManager,
)
from lakehouse_engine.terminators.sensor_terminator import SensorTerminator
from lakehouse_engine.utils.databricks_utils import DatabricksUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Heartbeat(Algorithm):
"""Class representing a Heartbeat to check if the upstream has new data."""
_LOGGER = LoggingHandler(__name__).get_logger()
def __init__(self, acon: dict):
"""Construct Heartbeat instances.
Args:
acon: algorithm configuration.
"""
self.spec: HeartbeatConfigSpec = HeartbeatConfigSpec.create_from_acon(acon=acon)
def execute(self) -> None:
"""Execute the heartbeat."""
latest_event_current_timestamp = current_timestamp()
heartbeat_sensor_delta_table = DeltaTable.forName(
ExecEnv.SESSION,
self.spec.heartbeat_sensor_db_table,
)
sensor_source = self.spec.sensor_source
active_jobs_from_heartbeat_control_table_df = self._get_active_heartbeat_jobs(
heartbeat_sensor_delta_table, sensor_source
)
for (
control_table_df_row
) in active_jobs_from_heartbeat_control_table_df.collect():
sensor_acon = self._get_sensor_acon_from_heartbeat(
self.spec, control_table_df_row
)
sensors_with_new_data = self._execute_batch_of_sensor(
sensor_acon, control_table_df_row
)
if sensors_with_new_data:
self._update_heartbeat_status_with_sensor_info(
active_jobs_from_heartbeat_control_table_df,
heartbeat_sensor_delta_table,
self._get_heartbeat_sensor_condition(sensors_with_new_data),
latest_event_current_timestamp,
sensor_source,
)
@classmethod
def _get_active_heartbeat_jobs(
cls, heartbeat_sensor_delta_table: DeltaTable, sensor_source: str
) -> DataFrame:
"""Get UNPAUSED and NULL or COMPLETED status record from control table.
:param heartbeat_sensor_delta_table: DeltaTable for heartbeat sensor.
:param sensor_source: source system from Spec(e.g. sap_b4, delta, kafka etc.).
Returns:
A control table DataFrame containing records for specified sensor_source
that are UNPAUSED and have a status of either NULL or COMPLETED.
"""
full_control_table = heartbeat_sensor_delta_table.toDF()
filtered_control_table = full_control_table.filter(
f"lower(sensor_source) == '{sensor_source}'"
).filter(
"job_state == 'UNPAUSED' and (status is null OR status == 'COMPLETED')"
)
return filtered_control_table
@classmethod
def generate_unique_column_values(cls, main_col: str, col_to_append: str) -> str:
"""Generate a unique value by appending columns and replacing specific chars.
Generate a unique value by appending another column and replacing spaces,
dots, and colons with underscores for consistency.
        Args:
            main_col: The primary column value.
            col_to_append: Column value to append for uniqueness.
Returns:
A unique, combined column value.
"""
return f"{re.sub(r'[ :.]', '_', main_col)}_{col_to_append}"
@classmethod
def _get_sensor_acon_from_heartbeat(
cls, heartbeat_spec: HeartbeatConfigSpec, control_table_df_row: Row
) -> dict:
"""Create sensor acon from heartbeat config and specifications.
        Args:
            heartbeat_spec: Heartbeat specifications.
            control_table_df_row: Control table active records Dataframe Row.
Returns:
The sensor acon dict.
"""
sensors_to_execute: dict = {
"sensor_id": (
cls.generate_unique_column_values(
control_table_df_row["sensor_id"],
control_table_df_row["trigger_job_id"],
)
            ),  # 1. sensor_id can be the same for two or more different trigger_job_id
            # 2. Replacing colon, space and dot (.) with underscore (_) is required to
            # keep the checkpoint_location consistent for delta_table and kafka sources
"assets": [
cls.generate_unique_column_values(
control_table_df_row["asset_description"],
control_table_df_row["trigger_job_id"],
)
],
"control_db_table_name": heartbeat_spec.lakehouse_engine_sensor_db_table,
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": control_table_df_row["sensor_read_type"],
"data_format": heartbeat_spec.data_format,
"db_table": (
control_table_df_row["sensor_id"]
if heartbeat_spec.data_format == "delta"
else None
),
"options": heartbeat_spec.options,
"location": (
(
heartbeat_spec.base_trigger_file_location
+ "/"
+ control_table_df_row["sensor_id"]
)
if heartbeat_spec.base_trigger_file_location is not None
else None
),
"schema": heartbeat_spec.schema_dict,
},
"preprocess_query": control_table_df_row["preprocess_query"],
"base_checkpoint_location": heartbeat_spec.base_checkpoint_location,
"fail_on_empty_result": False,
}
final_sensors_to_execute = cls._enhance_sensor_acon_extra_options(
heartbeat_spec, control_table_df_row, sensors_to_execute
)
return final_sensors_to_execute
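# Illustrative sketch (hypothetical values) of the acon built above for a
# control table row with sensor_source "delta", sensor_id "my_db.my_table",
# asset_description "my_asset" and trigger_job_id "123", when no
# base_trigger_file_location is configured:
#
# {
#     "sensor_id": "my_db_my_table_123",
#     "assets": ["my_asset_123"],
#     "control_db_table_name": heartbeat_spec.lakehouse_engine_sensor_db_table,
#     "input_spec": {
#         "spec_id": "sensor_upstream",
#         "read_type": control_table_df_row["sensor_read_type"],
#         "data_format": "delta",
#         "db_table": "my_db.my_table",
#         "options": heartbeat_spec.options,
#         "location": None,
#         "schema": heartbeat_spec.schema_dict,
#     },
#     "preprocess_query": control_table_df_row["preprocess_query"],
#     "base_checkpoint_location": heartbeat_spec.base_checkpoint_location,
#     "fail_on_empty_result": False,
# }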
@classmethod
def _enhance_sensor_acon_extra_options(
cls,
heartbeat_spec: HeartbeatConfigSpec,
control_table_df_row: Row,
sensors_to_execute: dict,
) -> dict:
"""Enhance sensor acon with extra options for specific source system.
:param heartbeat_spec: Heartbeat specifications.
:param control_table_df_row: Control table active records Dataframe Row.
:param sensors_to_execute: sensor acon dictionary from previous step.
Returns:
The sensor acon dict having enhanced options for specific sensor_source.
"""
LATEST_FETCH_EVENT_TIMESTAMP = (
control_table_df_row.latest_event_fetched_timestamp
)
upstream_key = control_table_df_row["upstream_key"]
upstream_value = (
LATEST_FETCH_EVENT_TIMESTAMP.strftime("%Y%m%d%H%M%S")
if LATEST_FETCH_EVENT_TIMESTAMP is not None
else "19000101000000"
)
if control_table_df_row.sensor_source.lower() in [
HeartbeatSensorSource.SAP_B4.value,
HeartbeatSensorSource.SAP_BW.value,
]:
sensors_to_execute["input_spec"]["options"]["prepareQuery"] = (
SensorUpstreamManager.generate_sensor_sap_logchain_query(
chain_id=control_table_df_row.sensor_id,
dbtable=heartbeat_spec.jdbc_db_table,
)
)
sensors_to_execute["input_spec"]["options"]["query"] = (
SensorUpstreamManager.generate_filter_exp_query(
sensor_id=control_table_df_row.sensor_id,
filter_exp="?upstream_key > '?upstream_value'",
control_db_table_name=(
heartbeat_spec.lakehouse_engine_sensor_db_table
),
upstream_key=upstream_key,
upstream_value=upstream_value,
)
)
elif (
control_table_df_row.sensor_source.lower()
== HeartbeatSensorSource.LMU_DELTA_TABLE.value
):
sensors_to_execute["preprocess_query"] = (
SensorUpstreamManager.generate_filter_exp_query(
sensor_id=control_table_df_row.sensor_id,
filter_exp="?upstream_key > '?upstream_value'",
control_db_table_name=(
heartbeat_spec.lakehouse_engine_sensor_db_table
),
upstream_key=upstream_key,
upstream_value=upstream_value,
)
)
elif (
control_table_df_row.sensor_source.lower()
== HeartbeatSensorSource.KAFKA.value
):
kafka_options = cls._get_all_kafka_options(
heartbeat_spec.kafka_configs,
control_table_df_row["sensor_id"],
heartbeat_spec.kafka_secret_scope,
)
sensors_to_execute["input_spec"]["options"] = kafka_options
return sensors_to_execute
@classmethod
def _get_all_kafka_options(
cls,
kafka_configs: dict,
kafka_sensor_id: str,
kafka_secret_scope: str,
) -> dict:
"""Get all Kafka extra options for sensor ACON.
Read the heartbeat sensor related kafka config dynamically, based on the
data product name (or any other prefix) that must match the sensor_id prefix.
:param kafka_configs: kafka config read from yaml file.
:param kafka_sensor_id: kafka topic for which new event to be fetched.
:param kafka_secret_scope: secret scope used for kafka processing.
Returns:
The sensor acon dict having enhanced options for kafka source.
"""
sensor_id_desc = kafka_sensor_id.split(":")
dp_name_filter = sensor_id_desc[0].strip()
KAFKA_TOPIC = sensor_id_desc[1].strip()
KAFKA_BOOTSTRAP_SERVERS = kafka_configs[dp_name_filter][
"kafka_bootstrap_servers_list"
]
KAFKA_TRUSTSTORE_LOCATION = kafka_configs[dp_name_filter][
"kafka_ssl_truststore_location"
]
KAFKA_KEYSTORE_LOCATION = kafka_configs[dp_name_filter][
"kafka_ssl_keystore_location"
]
KAFKA_TRUSTSTORE_PSWD_SECRET_KEY = kafka_configs[dp_name_filter][
"truststore_pwd_secret_key"
]
KAFKA_TRUSTSTORE_PSWD = (
DatabricksUtils.get_db_utils(ExecEnv.SESSION).secrets.get(
scope=kafka_secret_scope,
key=KAFKA_TRUSTSTORE_PSWD_SECRET_KEY,
)
if KAFKA_TRUSTSTORE_PSWD_SECRET_KEY
else None
)
KAFKA_KEYSTORE_PSWD_SECRET_KEY = kafka_configs[dp_name_filter][
"keystore_pwd_secret_key"
]
KAFKA_KEYSTORE_PSWD = (
DatabricksUtils.get_db_utils(ExecEnv.SESSION).secrets.get(
scope=kafka_secret_scope,
key=KAFKA_KEYSTORE_PSWD_SECRET_KEY,
)
if KAFKA_KEYSTORE_PSWD_SECRET_KEY
else None
)
kafka_options_dict = {
"kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
"subscribe": KAFKA_TOPIC,
"startingOffsets": "earliest",
"kafka.security.protocol": "SSL",
"kafka.ssl.truststore.location": KAFKA_TRUSTSTORE_LOCATION,
"kafka.ssl.truststore.password": KAFKA_TRUSTSTORE_PSWD,
"kafka.ssl.keystore.location": KAFKA_KEYSTORE_LOCATION,
"kafka.ssl.keystore.password": KAFKA_KEYSTORE_PSWD,
}
return kafka_options_dict
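# Illustrative sketch (hypothetical names and values) of the inputs expected
# above: a kafka_sensor_id of the form "<dp_name>: <topic>" and a kafka_configs
# dict keyed by the dp_name prefix, e.g.:
#
# kafka_sensor_id = "my_dp: my.topic.name"
# kafka_configs = {
#     "my_dp": {
#         "kafka_bootstrap_servers_list": "broker1:9092,broker2:9092",
#         "kafka_ssl_truststore_location": "/dbfs/path/truststore.jks",
#         "kafka_ssl_keystore_location": "/dbfs/path/keystore.jks",
#         "truststore_pwd_secret_key": "truststore-pwd",
#         "keystore_pwd_secret_key": "keystore-pwd",
#     }
# }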
@classmethod
def _execute_batch_of_sensor(
cls, sensor_acon: dict, control_table_df_row: Row
) -> dict:
"""Execute sensor acon to fetch NEW EVENT AVAILABLE for sensor source system.
:param sensor_acon: sensor acon created from heartbeat config and specs.
:param control_table_df_row: Control table active records Dataframe Row.
Returns:
Dict containing sensor_id and trigger_job_id for sensor with new data.
"""
sensors_with_new_data: dict = {}
cls._LOGGER.info(f"Executing sensor: {sensor_acon}")
has_new_data = Sensor(sensor_acon).execute()
if has_new_data:
sensors_with_new_data["sensor_id"] = control_table_df_row["sensor_id"]
sensors_with_new_data["trigger_job_id"] = control_table_df_row[
"trigger_job_id"
]
return sensors_with_new_data
@classmethod
def _get_heartbeat_sensor_condition(
cls,
sensors_with_new_data: dict,
) -> Optional[str]:
"""Get heartbeat sensor new event available condition.
:param sensors_with_new_data: dict having NEW_EVENT_AVAILABLE sensor_id record.
Returns:
String having condition for sensor having new data available.
"""
heartbeat_sensor_with_new_event_available = (
f"(sensor_id = '{sensors_with_new_data['sensor_id']}' AND "
f"trigger_job_id = '{sensors_with_new_data['trigger_job_id']}')"
)
return heartbeat_sensor_with_new_event_available
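# Illustrative example (hypothetical values): for
# sensors_with_new_data = {"sensor_id": "my_db.my_table", "trigger_job_id": "123"}
# the condition returned above is:
# "(sensor_id = 'my_db.my_table' AND trigger_job_id = '123')"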
@classmethod
def _update_heartbeat_status_with_sensor_info(
cls,
heartbeat_sensor_jobs: DataFrame,
heartbeat_sensor_delta_table: DeltaTable,
heartbeat_with_new_event_available_condition: str,
latest_event_current_timestamp: Column,
sensor_source: str,
) -> None:
"""Update heartbeat status with sensor info.
:param heartbeat_sensor_jobs: active UNPAUSED jobs from Control table dataframe.
:param heartbeat_sensor_delta_table: heartbeat sensor Delta table.
:param heartbeat_with_new_event_available_condition: new event available cond.
:param latest_event_current_timestamp: timestamp when the new event was captured.
:param sensor_source: source system of the heartbeat sensor records.
"""
if heartbeat_with_new_event_available_condition:
sensors_with_new_event_available = (
heartbeat_sensor_jobs.filter(
heartbeat_with_new_event_available_condition
)
.withColumn("status", lit(HeartbeatStatus.NEW_EVENT_AVAILABLE.value))
.withColumn("status_change_timestamp", current_timestamp())
.withColumn(
"latest_event_fetched_timestamp", latest_event_current_timestamp
)
)
new_event_merge_condition = f"""target.sensor_id = src.sensor_id AND
target.trigger_job_id = src.trigger_job_id AND
target.sensor_source = '{sensor_source}'"""
if sensors_with_new_event_available.count() > 0:
cls.update_heartbeat_control_table(
heartbeat_sensor_delta_table,
sensors_with_new_event_available,
new_event_merge_condition,
)
else:
cls._LOGGER.info("No sensors to execute!")
@classmethod
def update_heartbeat_control_table(
cls,
heartbeat_sensor_delta_table: DeltaTable,
updated_data: DataFrame,
heartbeat_control_table_merge_condition: str,
) -> None:
"""Update heartbeat control table with the new data.
:param heartbeat_sensor_delta_table: db_table heartbeat sensor control table.
:param updated_data: data to update the control table.
:param heartbeat_control_table_merge_condition: merge condition for table.
"""
cls._LOGGER.info(f"updated data: {updated_data}")
heartbeat_sensor_delta_table.alias("target").merge(
updated_data.alias("src"),
(heartbeat_control_table_merge_condition),
).whenMatchedUpdate(
set=HEARTBEAT_SENSOR_UPDATE_SET
).whenNotMatchedInsertAll().execute()
@classmethod
def get_heartbeat_jobs_to_trigger(
cls,
heartbeat_sensor_db_table: str,
heartbeat_sensor_control_table_df: DataFrame,
) -> list[Row]:
"""Get heartbeat jobs to trigger.
Check whether all dependencies are satisfied before triggering a job. The
dependency_flag column is evaluated for every sensor_id and trigger_job_id
combination, considering records whose status is NEW_EVENT_AVAILABLE.
Dependencies are checked per trigger_job_id: for every control table record
with status NEW_EVENT_AVAILABLE, the status and dependency_flag of all records
sharing the same trigger_job_id are fetched. The distinct (trigger_job_id,
status, dependency_flag) combinations are then aggregated with a count per
trigger_job_id and dependency_flag.
A count greater than 1 means that more than one record with
dependency_flag = "TRUE" exists with a different status for the same
trigger_job_id, i.e. at least one dependency is not yet satisfied, so the
trigger_job_id goes into jobs_to_not_trigger and the job is not triggered.
A count of 1 means all dependencies share the same status, so the
trigger_job_id is considered for triggering.
:param heartbeat_sensor_db_table: heartbeat sensor table name.
:param heartbeat_sensor_control_table_df: Dataframe for heartbeat control table.
:return: list of jobs to be triggered.
"""
# Get all distinct trigger_job_id where status is NEW_EVENT_AVAILABLE
trigger_jobs_new_events_df = (
heartbeat_sensor_control_table_df.filter(
f"status == '{HeartbeatStatus.NEW_EVENT_AVAILABLE.value}'"
)
.select(col("trigger_job_id"))
.distinct()
)
# Get distinct trigger_job_id, status, dependency_flag for control table records
full_data_df = (
ExecEnv.SESSION.table(heartbeat_sensor_db_table)
.select(
col("trigger_job_id"),
col("status"),
upper(col("dependency_flag")).alias("dependency_flag"),
)
.distinct()
)
# Join NEW_EVENT_AVAILABLE records with the full table to get all dependencies
# based on trigger_job_id. Only records with dependency_flag = "TRUE" are kept,
# as those are the ones whose dependencies need to be checked.
full_data_trigger_job_id = col("full_data.trigger_job_id")
dep_flag_comparison = trim(upper(col("dependency_flag"))) == "TRUE"
jobs_with_new_events_df = (
full_data_df.alias("full_data")
.join(
trigger_jobs_new_events_df.alias("jobs_with_new_events"),
col("jobs_with_new_events.trigger_job_id") == full_data_trigger_job_id,
"inner",
)
.select(
full_data_trigger_job_id,
col("full_data.status"),
col("full_data.dependency_flag"),
)
).filter(dep_flag_comparison)
# The count aggregation per trigger_job_id and dependency_flag picks the
# trigger_job_id values that do not satisfy their dependencies: a count above
# 1 denotes more than one record with dependency_flag = "TRUE" and a different
# status for the same trigger_job_id.
jobs_to_not_trigger_with_new_event_df = (
jobs_with_new_events_df.filter(dep_flag_comparison)
.groupBy("trigger_job_id", "dependency_flag")
.agg(count("trigger_job_id").alias("count"))
.where(col("count") > 1)
)
jobs_to_trigger_df = (
jobs_with_new_events_df.alias("full_data")
.join(
jobs_to_not_trigger_with_new_event_df.alias("jobs_to_not_trigger"),
(col("jobs_to_not_trigger.trigger_job_id") == full_data_trigger_job_id),
"left_anti",
)
.groupBy("trigger_job_id", "status")
.agg(count("trigger_job_id").alias("count"))
.where(col("count") == 1)
)
jobs_to_trigger_df = jobs_to_trigger_df.select("trigger_job_id").distinct()
jobs_to_trigger = jobs_to_trigger_df.collect()
return jobs_to_trigger
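# Illustrative worked example (hypothetical rows) of the dependency check above.
# Control table, reduced to the relevant columns:
#   trigger_job_id | status              | dependency_flag
#   123            | NEW_EVENT_AVAILABLE | TRUE
#   123            | NEW_EVENT_AVAILABLE | TRUE
#   456            | NEW_EVENT_AVAILABLE | TRUE
#   456            | COMPLETED           | TRUE
# After taking distinct (trigger_job_id, status, dependency_flag) rows, job 123
# keeps a single combination (count == 1) and is triggered; job 456 keeps two
# combinations (count > 1), meaning at least one of its dependencies has not
# produced a new event yet, so it is not triggered.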
@classmethod
def get_anchor_job_record(
cls, heartbeat_sensor_table_df: DataFrame, job_id: str, sensor_source: str
) -> DataFrame:
"""Identify anchor jobs from the control table.
Using trigger_job_id as the partition key, ordered by status_change_timestamp
in descending order and sensor_id in ascending order, filtered by the specific
sensor_source.
This method partitions records by trigger_job_id, orders them by
status_change_timestamp (descending) and sensor_id (ascending), and filters
by the specified sensor_source. Filtering on sensor_source makes sure if
current source is eligible for triggering the job and updates or not. This
process ensures that only the appropriate single record triggers the job and
the control table is updated accordingly. This approach eliminates redundant
triggers and unnecessary updates.
:param heartbeat_sensor_table_df: Heartbeat sensor control table Dataframe.
:param job_id: Trigger job_id from table for which dependency also satisfies.
:param sensor_source: source of the heartbeat sensor record.
Returns:
Control table DataFrame containing anchor job records valid for triggering.
"""
heartbeat_anchor_records_df = heartbeat_sensor_table_df.filter(
col("trigger_job_id") == job_id
).withColumn(
"row_no",
row_number().over(
Window.partitionBy("trigger_job_id").orderBy(
col("status_change_timestamp").desc(), col("sensor_id").asc()
)
),
)
heartbeat_anchor_records_df = heartbeat_anchor_records_df.filter(
f"row_no = 1 AND sensor_source = '{sensor_source}'"
).drop("row_no")
return heartbeat_anchor_records_df
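# Illustrative example (hypothetical rows): for trigger_job_id "123" with two
# matching records, the one with the most recent status_change_timestamp (and,
# on ties, the lowest sensor_id) gets row_no = 1 and is kept, provided its
# sensor_source matches the current run; all other rows are dropped.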
def heartbeat_sensor_trigger_jobs(self) -> None:
"""Get heartbeat jobs to trigger.
:param self.spec: HeartbeatConfigSpec having config and control table spec.
"""
heartbeat_sensor_db_table = self.spec.heartbeat_sensor_db_table
sensor_source = self.spec.sensor_source
heartbeat_sensor_delta_table = DeltaTable.forName(
ExecEnv.SESSION, heartbeat_sensor_db_table
)
heartbeat_sensor_control_table_df = ExecEnv.SESSION.table(
heartbeat_sensor_db_table
).filter(
f"lower(sensor_source) == '{sensor_source}' and (job_state == 'UNPAUSED')"
)
jobs_to_trigger = self.get_heartbeat_jobs_to_trigger(
heartbeat_sensor_db_table, heartbeat_sensor_control_table_df
)
heartbeat_sensor_table_df = ExecEnv.SESSION.table(heartbeat_sensor_db_table)
final_df: Optional[DataFrame] = None
for row in jobs_to_trigger:
run_id = None
exception = None
heartbeat_anchor_job_records_df = self.get_anchor_job_record(
heartbeat_sensor_table_df, row["trigger_job_id"], sensor_source
)
if heartbeat_anchor_job_records_df.take(1):
run_id, exception = SensorJobRunManager.run_job(
row["trigger_job_id"], self.spec.token, self.spec.domain
)
if exception is None and run_id is not None:
status_df = (
heartbeat_sensor_table_df.filter(
(col("trigger_job_id") == row["trigger_job_id"])
)
.withColumn("job_start_timestamp", current_timestamp())
.withColumn("status", lit(HeartbeatStatus.IN_PROGRESS.value))
.withColumn("status_change_timestamp", current_timestamp())
)
final_df = final_df.union(status_df) if final_df else status_df
if final_df is not None:
in_progress_merge_condition = """target.sensor_id = src.sensor_id AND
target.trigger_job_id = src.trigger_job_id AND
target.sensor_source = src.sensor_source"""
self.update_heartbeat_control_table(
heartbeat_sensor_delta_table, final_df, in_progress_merge_condition
)
@classmethod
def _read_heartbeat_sensor_data_feed_csv(
cls, heartbeat_sensor_data_feed_path: str
) -> DataFrame:
"""Get rows to insert or delete in heartbeat_sensor table.
It reads the CSV file stored from the `heartbeat_sensor_data_feed_path` and
perform UPSERT and DELETE in control table.
- **heartbeat_sensor_data_feed_path**: path where CSV file is stored.
"""
data_feed_csv_df = (
ExecEnv.SESSION.read.format("csv")
.option("header", True)
.load(heartbeat_sensor_data_feed_path)
)
data_feed_csv_df = data_feed_csv_df.withColumn(
"job_state", upper(col("job_state"))
)
return data_feed_csv_df
@classmethod
def merge_control_table_data_feed_records(
cls,
heartbeat_sensor_control_table: str,
heartbeat_sensor_data_feed_csv_df: DataFrame,
) -> None:
"""Perform merge operation based on the condition.
It reads the CSV file stored at `heartbeat_sensor_data_feed_path` folder
and perform UPSERT and DELETE in control table.
- **heartbeat_sensor_control_table**: Heartbeat sensor control table.
- **heartbeat_sensor_data_feed_csv_df**: Dataframe after reading CSV file.
"""
delta_table = DeltaTable.forName(
ExecEnv.SESSION, heartbeat_sensor_control_table
)
delta_table.alias("trgt").merge(
heartbeat_sensor_data_feed_csv_df.alias("source"),
(
"""source.sensor_id = trgt.sensor_id and
trgt.trigger_job_id = source.trigger_job_id"""
),
).whenNotMatchedInsert(
values={
"sensor_source": "source.sensor_source",
"sensor_id": "source.sensor_id",
"sensor_read_type": "source.sensor_read_type",
"asset_description": "source.asset_description",
"upstream_key": "source.upstream_key",
"preprocess_query": "source.preprocess_query",
"latest_event_fetched_timestamp": "null",
"trigger_job_id": "source.trigger_job_id",
"trigger_job_name": "source.trigger_job_name",
"status": "null",
"status_change_timestamp": "null",
"job_start_timestamp": "null",
"job_end_timestamp": "null",
"job_state": "source.job_state",
"dependency_flag": "source.dependency_flag",
}
).whenMatchedUpdate(
set={
"sensor_source": "source.sensor_source",
"sensor_id": "source.sensor_id",
"sensor_read_type": "source.sensor_read_type",
"asset_description": "source.asset_description",
"upstream_key": "source.upstream_key",
"preprocess_query": "source.preprocess_query",
"latest_event_fetched_timestamp": "trgt.latest_event_fetched_timestamp",
"trigger_job_id": "source.trigger_job_id",
"trigger_job_name": "source.trigger_job_name",
"status": "trgt.status",
"status_change_timestamp": "trgt.status_change_timestamp",
"job_start_timestamp": "trgt.job_start_timestamp",
"job_end_timestamp": "trgt.job_end_timestamp",
"job_state": "source.job_state",
"dependency_flag": "source.dependency_flag",
}
).whenNotMatchedBySourceDelete().execute()
@classmethod
def heartbeat_sensor_control_table_data_feed(
cls,
heartbeat_sensor_data_feed_path: str,
heartbeat_sensor_control_table: str,
) -> None:
"""Control table Data feeder.
It reads the CSV file stored at `heartbeat_sensor_data_feed_path` and
perform UPSERT and DELETE in control table.
- **heartbeat_sensor_data_feed_path**: path where CSV file is stored.
- **heartbeat_sensor_control_table**: CONTROL table of Heartbeat sensor.
"""
heartbeat_sensor_data_feed_csv_df = cls._read_heartbeat_sensor_data_feed_csv(
heartbeat_sensor_data_feed_path
)
cls.merge_control_table_data_feed_records(
heartbeat_sensor_control_table, heartbeat_sensor_data_feed_csv_df
)
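# Illustrative sketch (hypothetical file) of the data feed: the CSV is expected
# to provide the source columns referenced in the merge performed above, e.g. a
# header such as:
#
# sensor_source,sensor_id,sensor_read_type,asset_description,upstream_key,
# preprocess_query,trigger_job_id,trigger_job_name,job_state,dependency_flag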
@classmethod
def update_sensor_processed_status(
cls,
sensor_table: str,
job_id_filter_control_table_df: DataFrame,
) -> None:
"""UPDATE sensor PROCESSED_NEW_DATA status.
Update sensor control table with PROCESSED_NEW_DATA status and
status_change_timestamp for the triggered job.
Args:
sensor_table: lakehouse engine sensor table name.
job_id_filter_control_table_df: Job Id filtered Heartbeat sensor
control table dataframe.
"""
sensor_id_df = job_id_filter_control_table_df.withColumn(
"sensor_table_sensor_id",
concat_ws(
"_",
regexp_replace(col("sensor_id"), r"[ :\.]", "_"),
col("trigger_job_id"),
),
)
for row in sensor_id_df.select("sensor_table_sensor_id").collect():
SensorTerminator.update_sensor_status(
sensor_id=row["sensor_table_sensor_id"],
control_db_table_name=sensor_table,
status=SensorStatus.PROCESSED_NEW_DATA.value,
assets=None,
)
@classmethod
def update_heartbeat_sensor_completion_status(
cls,
heartbeat_sensor_control_table: str,
sensor_table: str,
job_id: str,
) -> None:
"""UPDATE heartbeat sensor status.
Update the heartbeat sensor control table with COMPLETED status and
job_end_timestamp for the triggered job.
Update sensor control table with PROCESSED_NEW_DATA status and
status_change_timestamp for the triggered job.
Args:
job_id: job_id of the running job. It will refer to
trigger_job_id in Control table.
sensor_table: lakehouse engine sensor table name.
heartbeat_sensor_control_table: Heartbeat sensor control table.
"""
job_id_filter_control_table_df = (
ExecEnv.SESSION.table(heartbeat_sensor_control_table)
.filter(col("trigger_job_id") == job_id)
.withColumn("status", lit(HeartbeatStatus.COMPLETED.value))
.withColumn("status_change_timestamp", current_timestamp())
.withColumn("job_end_timestamp", current_timestamp())
)
cls.update_sensor_processed_status(sensor_table, job_id_filter_control_table_df)
delta_table = DeltaTable.forName(
ExecEnv.SESSION, heartbeat_sensor_control_table
)
(
delta_table.alias("target")
.merge(
job_id_filter_control_table_df.alias("source"),
(
f"""target.sensor_source = source.sensor_source and
target.sensor_id = source.sensor_id and
target.trigger_job_id = '{job_id}'"""
),
)
.whenMatchedUpdate(
set={
"target.status": "source.status",
"target.status_change_timestamp": "source.status_change_timestamp",
"target.job_end_timestamp": "source.job_end_timestamp",
}
)
.execute()
)
================================================
FILE: lakehouse_engine/algorithms/sensors/sensor.py
================================================
"""Module to define Sensor algorithm behavior."""
from pyspark.sql import DataFrame
from lakehouse_engine.algorithms.algorithm import Algorithm
from lakehouse_engine.algorithms.exceptions import (
NoNewDataException,
SensorAlreadyExistsException,
)
from lakehouse_engine.core.definitions import (
SENSOR_ALLOWED_DATA_FORMATS,
InputFormat,
ReadType,
SensorSpec,
SensorStatus,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.sensor_manager import (
SensorControlTableManager,
SensorUpstreamManager,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Sensor(Algorithm):
"""Class representing a sensor to check if the upstream has new data."""
_LOGGER = LoggingHandler(__name__).get_logger()
def __init__(self, acon: dict):
"""Construct Sensor instances.
Args:
acon: algorithm configuration.
"""
self.spec: SensorSpec = SensorSpec.create_from_acon(acon=acon)
self._validate_sensor_spec()
if self._check_if_sensor_already_exists():
raise SensorAlreadyExistsException(
"There's already a sensor registered with same id or assets!"
)
def execute(self) -> bool:
"""Execute the sensor."""
self._LOGGER.info(f"Starting {self.spec.input_spec.data_format} sensor...")
new_data_df = SensorUpstreamManager.read_new_data(sensor_spec=self.spec)
if self.spec.input_spec.read_type == ReadType.STREAMING.value:
Sensor._run_streaming_sensor(sensor_spec=self.spec, new_data_df=new_data_df)
elif self.spec.input_spec.read_type == ReadType.BATCH.value:
Sensor._run_batch_sensor(
sensor_spec=self.spec,
new_data_df=new_data_df,
)
has_new_data = SensorControlTableManager.check_if_sensor_has_acquired_data(
self.spec.sensor_id,
self.spec.control_db_table_name,
)
self._LOGGER.info(
f"Sensor {self.spec.sensor_id} has previously "
f"acquired data? {has_new_data}"
)
if self.spec.fail_on_empty_result and not has_new_data:
raise NoNewDataException(
f"No data was acquired by {self.spec.sensor_id} sensor."
)
return has_new_data
def _check_if_sensor_already_exists(self) -> bool:
"""Check if sensor already exists in the table to avoid duplicates."""
row = SensorControlTableManager.read_sensor_table_data(
sensor_id=self.spec.sensor_id,
control_db_table_name=self.spec.control_db_table_name,
)
if row and row.assets != self.spec.assets:
return True
else:
row = SensorControlTableManager.read_sensor_table_data(
assets=self.spec.assets,
control_db_table_name=self.spec.control_db_table_name,
)
return row is not None and row.sensor_id != self.spec.sensor_id
@classmethod
def _run_streaming_sensor(
cls, sensor_spec: SensorSpec, new_data_df: DataFrame
) -> None:
"""Run sensor in streaming mode (internally runs in batch mode)."""
def foreach_batch_check_new_data(df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(df)
Sensor._run_batch_sensor(
sensor_spec=sensor_spec,
new_data_df=df,
)
new_data_df.writeStream.trigger(availableNow=True).option(
"checkpointLocation", sensor_spec.checkpoint_location
).foreachBatch(foreach_batch_check_new_data).start().awaitTermination()
@classmethod
def _run_batch_sensor(
cls,
sensor_spec: SensorSpec,
new_data_df: DataFrame,
) -> None:
"""Run sensor in batch mode.
Args:
sensor_spec: sensor spec containing all sensor information.
new_data_df: DataFrame possibly containing new data.
"""
new_data_first_row = SensorUpstreamManager.get_new_data(new_data_df)
cls._LOGGER.info(
f"Sensor {sensor_spec.sensor_id} has new data from upstream? "
f"{new_data_first_row is not None}"
)
if new_data_first_row:
SensorControlTableManager.update_sensor_status(
sensor_spec=sensor_spec,
status=SensorStatus.ACQUIRED_NEW_DATA.value,
upstream_key=(
new_data_first_row.UPSTREAM_KEY
if "UPSTREAM_KEY" in new_data_df.columns
else None
),
upstream_value=(
new_data_first_row.UPSTREAM_VALUE
if "UPSTREAM_VALUE" in new_data_df.columns
else None
),
)
cls._LOGGER.info(
f"Successfully updated sensor status for sensor "
f"{sensor_spec.sensor_id}..."
)
def _validate_sensor_spec(self) -> None:
"""Validate if sensor spec Read Type is allowed for the selected Data Format."""
if InputFormat.exists(self.spec.input_spec.data_format):
if (
self.spec.input_spec.data_format
not in SENSOR_ALLOWED_DATA_FORMATS[self.spec.input_spec.read_type]
):
raise NotImplementedError(
f"A sensor has not been implemented yet for this data format or, "
f"this data format is not available for the read_type"
f" {self.spec.input_spec.read_type}. "
f"Check the allowed combinations of read_type and data_formats:"
f" {SENSOR_ALLOWED_DATA_FORMATS}"
)
else:
raise NotImplementedError(
f"Data format {self.spec.input_spec.data_format} isn't implemented yet."
)
================================================
FILE: lakehouse_engine/configs/__init__.py
================================================
"""This module receives a config file which is included in the wheel."""
================================================
FILE: lakehouse_engine/configs/engine.yaml
================================================
dq_bucket: s3://sample-dq-bucket
dq_dev_bucket: s3://sample-dq-dev-bucket
dq_functions_column_list:
- dq_rule_id
- execution_point
- filters
- schema
- table
- column
- dimension
dq_result_sink_columns_to_delete:
- partial_unexpected_list
- partial_unexpected_counts
- partial_unexpected_index_list
- unexpected_list
sharepoint_authority: https://login.microsoftonline.com
sharepoint_api_domain: https://graph.microsoft.com
sharepoint_company_domain: your_company_name.sharepoint.com
notif_disallowed_email_servers:
- sample.blocked.email_server
engine_usage_path: s3://sample-log-bucket
engine_dev_usage_path: s3://sample-log-dev-bucket
raise_on_config_not_available: False
prod_catalog: sample_catalog
environment: prod
================================================
FILE: lakehouse_engine/core/__init__.py
================================================
"""Package with the core behaviour of the lakehouse engine."""
================================================
FILE: lakehouse_engine/core/dbfs_file_manager.py
================================================
"""File manager module using dbfs."""
from lakehouse_engine.core.file_manager import FileManager
from lakehouse_engine.utils.databricks_utils import DatabricksUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
def _dry_run(bucket: str, object_paths: list) -> dict:
"""Build the dry run request return format.
Args:
bucket: name of bucket to perform operation.
object_paths: paths of object to list.
Returns:
A dict with a list of objects that would be copied/deleted.
"""
response = {}
for path in object_paths:
path = _get_path(bucket, path)
object_list: list = []
object_list = _list_objects(path, object_list)
if object_list:
response[path] = object_list
else:
response[path] = ["No such key"]
return response
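# Illustrative example (hypothetical paths) of the dict returned above:
# {
#     "s3://my-bucket/folder1": [
#         "s3://my-bucket/folder1/a.csv",
#         "s3://my-bucket/folder1/b.csv",
#     ],
#     "s3://my-bucket/empty-folder": ["No such key"],
# }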
def _list_objects(path: str, objects_list: list) -> list:
"""List all the objects in a path.
Args:
path: path to be used to perform the list.
objects_list: A list of object names, empty by default.
Returns:
A list of object names.
"""
from lakehouse_engine.core.exec_env import ExecEnv
ls_objects_list = DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs.ls(path)
for file_or_directory in ls_objects_list:
if file_or_directory.isDir():
_list_objects(file_or_directory.path, objects_list)
else:
objects_list.append(file_or_directory.path)
return objects_list
def _get_path(bucket: str, path: str) -> str:
"""Get complete path.
For an s3 path, the bucket (e.g. bucket-example) and the path
(e.g. folder1/folder2) are combined to build the complete path.
For a dbfs path, the path already holds the complete path
(dbfs:/example) and the bucket is empty.
Args:
bucket: bucket for s3 objects.
path: path to access the directory of file.
Returns:
The complete path with or without bucket.
"""
if bucket.strip():
path = f"s3://{bucket}/{path}".strip()
else:
path = path.strip()
return path
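# Illustrative examples (hypothetical values) of the path resolution above:
#   _get_path("my-bucket", "folder1/folder2") -> "s3://my-bucket/folder1/folder2"
#   _get_path("", "dbfs:/example")            -> "dbfs:/example"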
class DBFSFileManager(FileManager):
"""Set of actions to manipulate dbfs files in several ways."""
_logger = LoggingHandler(__name__).get_logger()
def get_function(self) -> None:
"""Get a specific function to execute."""
available_functions = {
"delete_objects": self.delete_objects,
"copy_objects": self.copy_objects,
"move_objects": self.move_objects,
}
self._logger.info("Function being executed: {}".format(self.function))
if self.function in available_functions.keys():
func = available_functions[self.function]
func()
else:
raise NotImplementedError(
f"The requested function {self.function} is not implemented."
)
@staticmethod
def _delete_objects(bucket: str, objects_paths: list) -> None:
"""Delete objects recursively.
Params:
bucket: name of bucket to perform the delete operation.
objects_paths: objects to be deleted.
"""
from lakehouse_engine.core.exec_env import ExecEnv
for path in objects_paths:
path = _get_path(bucket, path)
DBFSFileManager._logger.info(f"Deleting: {path}")
try:
delete_operation = DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs.rm(
path, True
)
if delete_operation:
DBFSFileManager._logger.info(f"Deleted: {path}")
else:
DBFSFileManager._logger.info(f"Not able to delete: {path}")
except Exception as e:
DBFSFileManager._logger.error(f"Error deleting {path} - {e}")
raise e
def delete_objects(self) -> None:
"""Delete objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be deleted based on the given keys.
"""
bucket = self.configs["bucket"]
objects_paths = self.configs["object_paths"]
dry_run = self.configs["dry_run"]
if dry_run:
response = _dry_run(bucket=bucket, object_paths=objects_paths)
self._logger.info("Paths that would be deleted:")
self._logger.info(response)
else:
self._delete_objects(bucket, objects_paths)
def copy_objects(self) -> None:
"""Copies objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be copied based on the given keys.
"""
source_bucket = self.configs["bucket"]
source_object = self.configs["source_object"]
destination_bucket = self.configs["destination_bucket"]
destination_object = self.configs["destination_object"]
dry_run = self.configs["dry_run"]
if dry_run:
response = _dry_run(bucket=source_bucket, object_paths=[source_object])
self._logger.info("Paths that would be copied:")
self._logger.info(response)
else:
self._copy_objects(
source_bucket=source_bucket,
source_object=source_object,
destination_bucket=destination_bucket,
destination_object=destination_object,
)
@staticmethod
def _copy_objects(
source_bucket: str,
source_object: str,
destination_bucket: str,
destination_object: str,
) -> None:
"""Copies objects and 'directories'.
Args:
source_bucket: name of bucket to perform the copy.
source_object: object/folder to be copied.
destination_bucket: name of the target bucket to copy.
destination_object: target object/folder to copy.
"""
from lakehouse_engine.core.exec_env import ExecEnv
copy_from = _get_path(source_bucket, source_object)
copy_to = _get_path(destination_bucket, destination_object)
DBFSFileManager._logger.info(f"Copying: {copy_from} to {copy_to}")
try:
DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs.cp(
copy_from, copy_to, True
)
DBFSFileManager._logger.info(f"Copied: {copy_from} to {copy_to}")
except Exception as e:
DBFSFileManager._logger.error(
f"Error copying file {copy_from} to {copy_to} - {e}"
)
raise e
def move_objects(self) -> None:
"""Moves objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be moved based on the given keys.
"""
source_bucket = self.configs["bucket"]
source_object = self.configs["source_object"]
destination_bucket = self.configs["destination_bucket"]
destination_object = self.configs["destination_object"]
dry_run = self.configs["dry_run"]
if dry_run:
response = _dry_run(bucket=source_bucket, object_paths=[source_object])
self._logger.info("Paths that would be moved:")
self._logger.info(response)
else:
self._move_objects(
source_bucket=source_bucket,
source_object=source_object,
destination_bucket=destination_bucket,
destination_object=destination_object,
)
@staticmethod
def _move_objects(
source_bucket: str,
source_object: str,
destination_bucket: str,
destination_object: str,
) -> None:
"""Moves objects and 'directories'.
Args:
source_bucket: name of bucket to perform the move.
source_object: object/folder to be moved.
destination_bucket: name of the target bucket to move.
destination_object: target object/folder to move.
"""
from lakehouse_engine.core.exec_env import ExecEnv
move_from = _get_path(source_bucket, source_object)
move_to = _get_path(destination_bucket, destination_object)
DBFSFileManager._logger.info(f"Moving: {move_from} to {move_to}")
try:
DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs.mv(
move_from, move_to, True
)
DBFSFileManager._logger.info(f"Moved: {move_from} to {move_to}")
except Exception as e:
DBFSFileManager._logger.error(
f"Error moving file {move_from} to {move_to} - {e}"
)
raise e
================================================
FILE: lakehouse_engine/core/definitions.py
================================================
"""Definitions of standard values and structures for core components."""
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import ClassVar, Collection, List, Optional, Tuple
from pyspark.sql import DataFrame
from pyspark.sql.types import (
ArrayType,
BooleanType,
StringType,
StructField,
StructType,
TimestampType,
)
from lakehouse_engine.io.exceptions import InputNotFoundException
class CollectEngineUsage(Enum):
"""Options for collecting engine usage stats.
- enabled, enables the collection and storage of Lakehouse Engine
usage statistics for any environment.
- prod_only, enables the collection and storage of Lakehouse Engine
usage statistics for production environment only.
- disabled, disables the collection and storage of Lakehouse Engine
usage statistics, for all environments.
"""
ENABLED = "enabled"
PROD_ONLY = "prod_only"
DISABLED = "disabled"
@dataclass
class EngineConfig(object):
"""Definitions that can come from the Engine Config file.
- dq_bucket: S3 prod bucket used to store data quality related artifacts.
- dq_dev_bucket: S3 dev bucket used to store data quality related artifacts.
- notif_disallowed_email_servers: email servers not allowed to be used
for sending notifications.
- engine_usage_path: path where the engine prod usage stats are stored.
- engine_dev_usage_path: path where the engine dev usage stats are stored.
- collect_engine_usage: whether to enable the collection of lakehouse
engine usage stats or not.
- dq_functions_column_list: list of columns to be added to the meta argument
of GX when using PRISMA.
- raise_on_config_not_available: whether to raise an exception if a spark config
is not available.
- prod_catalog: name of the prod catalog being used. This is useful to derive
whether the environment is prod or dev, so the dev or prod buckets/paths can be
used for storing engine usage stats and dq artifacts.
- environment: environment that the engine is being executed on. Takes precedence
over prod_catalog when defining if the environment is prod or dev.
- sharepoint_authority: authority for the Sharepoint api.
- sharepoint_company_domain: company domain for the Sharepoint api.
- sharepoint_api_domain: api domain for the Sharepoint api.
"""
dq_bucket: Optional[str] = None
dq_dev_bucket: Optional[str] = None
notif_disallowed_email_servers: Optional[list] = None
engine_usage_path: Optional[str] = None
engine_dev_usage_path: Optional[str] = None
collect_engine_usage: str = CollectEngineUsage.ENABLED.value
dq_functions_column_list: Optional[list] = None
dq_result_sink_columns_to_delete: Optional[list] = None
sharepoint_authority: Optional[str] = None
sharepoint_company_domain: Optional[str] = None
sharepoint_api_domain: Optional[str] = None
raise_on_config_not_available: bool = False
prod_catalog: Optional[str] = None
environment: Optional[str] = None
class EngineStats(object):
"""Definitions for collection of Lakehouse Engine Stats.
!!! note
whenever the value comes from a key inside a Spark Config
that returns an array, it can be specified with a '#' so that it
is adequately processed.
"""
CLUSTER_USAGE_TAGS = "spark.databricks.clusterUsageTags"
DEF_SPARK_CONFS = {
"dp_name": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#accountName",
"environment": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#environment",
"workspace_id": f"{CLUSTER_USAGE_TAGS}.orgId",
"job_id": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#JobId",
"job_name": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#RunName",
"run_id": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#ClusterName",
}
DEF_DATABRICKS_CONTEXT_KEYS = {
"environment": "environment",
"dp_name": "jobName",
"run_id": "runId",
"job_id": "jobId",
"job_name": "jobName",
"workspace_id": "workspaceId",
"policy_id": "usagePolicyId",
}
class InputFormat(Enum):
"""Formats of algorithm input."""
JDBC = "jdbc"
AVRO = "avro"
JSON = "json"
CSV = "csv"
PARQUET = "parquet"
DELTAFILES = "delta"
CLOUDFILES = "cloudfiles"
KAFKA = "kafka"
SQL = "sql"
SAP_BW = "sap_bw"
SAP_B4 = "sap_b4"
DATAFRAME = "dataframe"
SFTP = "sftp"
SHAREPOINT = "sharepoint"
@classmethod
def values(cls): # type: ignore
"""Generates a list containing all enum values.
Returns:
A list with all enum values.
"""
return (c.value for c in cls)
@classmethod
def exists(cls, input_format: str) -> bool:
"""Checks if the input format exists in the enum values.
Args:
input_format: format to check if exists.
Returns:
If the input format exists in our enum.
"""
return input_format in cls.values()
# Formats of input that are considered files.
FILE_INPUT_FORMATS = [
InputFormat.AVRO.value,
InputFormat.JSON.value,
InputFormat.PARQUET.value,
InputFormat.CSV.value,
InputFormat.DELTAFILES.value,
InputFormat.CLOUDFILES.value,
]
SHAREPOINT_SUPPORTED_EXTENSIONS = {".csv", ".xlsx"}
@dataclass
class SharepointFile:
"""Represents a file from Sharepoint with metadata and optional content."""
file_name: str
time_created: str
time_modified: str
content: Optional[bytes] = None
_folder: Optional[str] = None
skip_rename: bool = False
_already_archived: bool = False
@property
def file_extension(self) -> str:
"""Returns the file extension of the stored file."""
return Path(self.file_name).suffix
@property
def file_path(self) -> str:
"""Full Sharepoint path including folder and file name."""
if not self._folder:
raise AttributeError("file_path unavailable; _folder not set.")
return f"{self._folder}/{self.file_name}"
@property
def is_csv(self) -> bool:
"""True if file is a CSV."""
return self.file_extension.lower() == ".csv"
@property
def is_excel(self) -> bool:
"""True if file is an Excel file."""
return self.file_extension.lower() == ".xlsx"
@property
def content_size(self) -> int:
"""Size of content in bytes."""
return len(self.content) if self.content else 0
@dataclass
class SharepointOptions(object):
"""Options for Sharepoint I/O (used by both reader and writer).
This dataclass is shared by the Sharepoint reader and writer. Some fields
are required/used only in *read* mode, others only in *write* mode.
Use `validate_for_reader()` / `validate_for_writer()` to enforce the
correct subsets.
Common (reader & writer):
- client_id (str): Azure AD application (client) ID.
- tenant_id (str): Azure AD tenant (directory) ID.
- site_name (str): Sharepoint site name.
- drive_name (str): Document library/drive name.
- secret (str): Client secret.
- local_path (str): Local/volume path for staging (read/write temp).
- api_version (str): Microsoft Graph API version (default: "v1.0").
- conflict_behaviour (Optional[str]): e.g. 'replace', 'fail'.
- allowed_extensions (Optional[Collection[str]]):
Defaults to SHAREPOINT_SUPPORTED_EXTENSIONS {".csv", ".xlsx"}.
Reader-specific:
- folder_relative_path (Optional[str]): Folder (or full file path)
to read from.
- file_name (Optional[str]): Name of a single file inside the folder
to read. If `folder_relative_path` already points to a file,
`file_name` must be None.
- file_type (Optional[str]): "csv" or "xlsx" when reading a folder.
- file_pattern (Optional[str]): Glob (e.g. '*.csv') when reading a folder.
- local_options (Optional[dict]): Spark CSV read options (e.g. header, sep).
- chunk_size (Optional[int]): Download chunk size (bytes).
Writer-specific:
- file_name (Optional[str]): Target file name to upload.
- local_options (Optional[dict]): Spark CSV write options.
- chunk_size (Optional[int]): Upload chunk size (bytes).
Archiving (reader):
- archive_enabled (bool): Whether to move files after a successful/failed read.
Default: True.
- archive_success_subfolder (Optional[str]): Success folder (default "done").
Set None to keep in place.
- archive_error_subfolder (Optional[str]): Error folder (default "error").
Set None to keep in place.
"""
# Common
client_id: str
tenant_id: str
site_name: str
drive_name: str
secret: str
local_path: str
file_name: Optional[str] = None # used by reader (optional) and writer (target)
api_version: str = "v1.0"
conflict_behaviour: Optional[str] = None
allowed_extensions: Optional[Collection[str]] = None
# Reader
file_type: Optional[str] = None
folder_relative_path: Optional[str] = None
file_pattern: Optional[str] = None
chunk_size: Optional[int] = 100 * 1024 * 1024 # 100 MB (read & write)
local_options: Optional[dict] = None # (read & write)
# Reader archiving
archive_enabled: bool = True
archive_success_subfolder: Optional[str] = "done"
archive_error_subfolder: Optional[str] = "error"
REQUIRED_READER_OPTS: ClassVar[Tuple[str, ...]] = (
"site_name",
"drive_name",
"folder_relative_path",
)
REQUIRED_WRITER_OPTS: ClassVar[Tuple[str, ...]] = (
"site_name",
"drive_name",
"local_path",
)
def __post_init__(self) -> None:
"""Normalize and validate Sharepoint options (types, extensions, etc)."""
allowed_extensions = self._get_allowed_extensions()
allowed_file_types = {extension.lstrip(".") for extension in allowed_extensions}
self._validate_file_type(allowed_file_types)
self._normalize_folder_relative_path()
self._validate_folder_relative_path_extension_if_looks_like_file(
allowed_extensions
)
self._validate_single_file_mode_constraints_if_folder_is_file_path(
allowed_extensions
)
self._validate_file_name_and_file_pattern_are_not_both_set()
def _get_allowed_extensions(self) -> set[str]:
"""Return the supported file extensions (lowercased)."""
return {
extension.lower()
for extension in (
self.allowed_extensions or SHAREPOINT_SUPPORTED_EXTENSIONS
)
}
def _validate_file_type(self, allowed_file_types: set[str]) -> None:
"""Validate that `file_type` is supported when provided."""
if not self.file_type:
return
if self.file_type.lower() not in allowed_file_types:
raise ValueError(
f"`file_type` must be one of {sorted(allowed_file_types)}. "
f"Got: '{self.file_type}'"
)
def _normalize_folder_relative_path(self) -> None:
"""Strip leading and trailing slashes from `folder_relative_path`."""
if self.folder_relative_path:
self.folder_relative_path = self.folder_relative_path.strip("/")
def _ends_with_supported_extension(
self,
path_value: str,
allowed_extensions: set[str],
) -> bool:
"""Return True if the path ends with any supported extension."""
lowered_path_value = path_value.lower()
return any(
lowered_path_value.endswith(extension) for extension in allowed_extensions
)
def _validate_single_file_mode_constraints_if_folder_is_file_path(
self,
allowed_extensions: set[str],
) -> None:
"""Forbid file name, pattern, and type when folder_relative_path end is file."""
if not self.folder_relative_path:
return
if not self._ends_with_supported_extension(
self.folder_relative_path, allowed_extensions
):
return
if self.file_name:
raise ValueError(
"When `folder_relative_path` points to a file, `file_name` must "
"be None."
)
if self.file_pattern:
raise ValueError(
"When `folder_relative_path` points to a file, `file_pattern` must "
"be None."
)
if self.file_type:
raise ValueError(
"When `folder_relative_path` points to a file, `file_type` must "
"be None (it's derived from file_path extension)"
)
def _validate_file_name_extension(self, allowed_extensions: set[str]) -> None:
"""Validate that `file_name` ends with a supported extension when provided."""
if not self.file_name:
return
if not self._ends_with_supported_extension(self.file_name, allowed_extensions):
raise ValueError(
f"`file_name` must end with one of {sorted(allowed_extensions)},"
f" got: {self.file_name}"
)
def _validate_file_name_and_file_pattern_are_not_both_set(self) -> None:
"""Validate that `file_name` and `file_pattern` are not both set."""
if self.file_name and self.file_pattern:
raise ValueError(
"Conflicting options: provide either `file_name` or `file_pattern`"
", not both."
)
def _validate_folder_relative_path_extension_if_looks_like_file(
self,
allowed_extensions: set[str],
) -> None:
"""Fail if folder_relative_path is a file path but has unsupported extension."""
if not self.folder_relative_path:
return
last_segment = self.folder_relative_path.split("/")[-1]
looks_like_file = "." in last_segment
if not looks_like_file:
return
if self._ends_with_supported_extension(last_segment, allowed_extensions):
return
raise ValueError(
f"`folder_relative_path` appears to be a file path but does not end "
f"with one of {sorted(allowed_extensions)}: {self.folder_relative_path}"
)
def validate_for_reader(self) -> None:
"""Validate Sharepoint options required for reading."""
missing = [opt for opt in self.REQUIRED_READER_OPTS if not getattr(self, opt)]
if missing:
raise InputNotFoundException(
f"Missing required Sharepoint options for reader: {', '.join(missing)}"
)
allowed_extensions = self._get_allowed_extensions()
if self.file_name and not self._ends_with_supported_extension(
self.file_name, allowed_extensions
):
raise ValueError(
f"`file_name` must end with one of {sorted(allowed_extensions)}, "
"got: {self.file_name}"
)
def validate_for_writer(self) -> None:
"""Validate Sharepoint options required for writing."""
missing = [opt for opt in self.REQUIRED_WRITER_OPTS if not getattr(self, opt)]
if missing:
raise InputNotFoundException(
f"Missing required Sharepoint options for writer: {', '.join(missing)}"
)
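# Illustrative sketch (hypothetical values) of constructing the options above
# for a reader that targets a single CSV file inside a folder:
#
# opts = SharepointOptions(
#     client_id="<azure-ad-app-id>",
#     tenant_id="<azure-ad-tenant-id>",
#     site_name="my-site",
#     drive_name="Documents",
#     secret="<client-secret>",
#     local_path="/Volumes/catalog/schema/volume/staging",
#     folder_relative_path="inbound/sales",
#     file_name="sales.csv",
# )
# opts.validate_for_reader()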
class OutputFormat(Enum):
"""Formats of algorithm output."""
JDBC = "jdbc"
AVRO = "avro"
JSON = "json"
CSV = "csv"
PARQUET = "parquet"
DELTAFILES = "delta"
KAFKA = "kafka"
CONSOLE = "console"
NOOP = "noop"
DATAFRAME = "dataframe"
REST_API = "rest_api"
FILE = "file" # Internal use only
TABLE = "table" # Internal use only
SHAREPOINT = "sharepoint"
@classmethod
def values(cls): # type: ignore
"""Generates a list containing all enum values.
Returns:
A list with all enum values.
"""
return (c.value for c in cls)
@classmethod
def exists(cls, output_format: str) -> bool:
"""Checks if the output format exists in the enum values.
Args:
output_format: format to check if exists.
Returns:
If the output format exists in our enum.
"""
return output_format in cls.values()
# Formats of output that are considered files.
FILE_OUTPUT_FORMATS = [
OutputFormat.AVRO.value,
OutputFormat.JSON.value,
OutputFormat.PARQUET.value,
OutputFormat.CSV.value,
OutputFormat.DELTAFILES.value,
]
class NotifierType(Enum):
"""Type of notifier available."""
EMAIL = "email"
class NotificationRuntimeParameters(Enum):
"""Parameters to be replaced in runtime."""
DATABRICKS_JOB_NAME = "databricks_job_name"
DATABRICKS_WORKSPACE_ID = "databricks_workspace_id"
JOB_EXCEPTION = "exception"
NOTIFICATION_RUNTIME_PARAMETERS = [
NotificationRuntimeParameters.DATABRICKS_JOB_NAME.value,
NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID.value,
NotificationRuntimeParameters.JOB_EXCEPTION.value,
]
class ReadType(Enum):
"""Define the types of read operations.
- BATCH - read the data in batch mode (e.g., Spark batch).
- STREAMING - read the data in streaming mode (e.g., Spark streaming).
"""
BATCH = "batch"
STREAMING = "streaming"
class ReadMode(Enum):
"""Different modes that control how we handle compliance to the provided schema.
These read modes map to Spark's read modes at the moment.
"""
PERMISSIVE = "PERMISSIVE"
FAILFAST = "FAILFAST"
DROPMALFORMED = "DROPMALFORMED"
class DQDefaults(Enum):
"""Defaults used on the data quality process."""
FILE_SYSTEM_STORE = "file_system"
FILE_SYSTEM_S3_STORE = "s3"
DQ_BATCH_IDENTIFIERS = ["spec_id", "input_id", "timestamp"]
DATASOURCE_CLASS_NAME = "Datasource"
DATASOURCE_EXECUTION_ENGINE = "SparkDFExecutionEngine"
DATA_CONNECTORS_CLASS_NAME = "RuntimeDataConnector"
DATA_CONNECTORS_MODULE_NAME = "great_expectations.datasource.data_connector"
STORE_BACKEND = "s3"
EXPECTATIONS_STORE_PREFIX = "dq/expectations/"
VALIDATIONS_STORE_PREFIX = "dq/validations/"
CHECKPOINT_STORE_PREFIX = "dq/checkpoints/"
CUSTOM_EXPECTATION_LIST = [
"expect_column_values_to_be_date_not_older_than",
"expect_column_pair_a_to_be_smaller_or_equal_than_b",
"expect_multicolumn_column_a_must_equal_b_or_c",
"expect_queried_column_agg_value_to_be",
"expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b",
"expect_column_pair_a_to_be_not_equal_to_b",
"expect_column_values_to_not_be_null_or_empty_string",
]
DQ_COLUMNS_TO_KEEP_TYPES = [
"success",
"run_time",
"validation_results",
"expectation_success",
"exception_info",
"meta",
"run_time_year",
"run_time_month",
"run_time_day",
"source_primary_key",
"evaluated_expectations",
"success_percent",
"successful_expectations",
"unsuccessful_expectations",
"unexpected_index_list",
]
DQ_VALIDATIONS_SCHEMA = StructType(
[
StructField(
"dq_validations",
StructType(
[
StructField("run_name", StringType()),
StructField("run_success", BooleanType()),
StructField("raised_exceptions", BooleanType()),
StructField("run_row_success", BooleanType()),
StructField(
"dq_failure_details",
ArrayType(
StructType(
[
StructField("expectation_type", StringType()),
StructField("kwargs", StringType()),
]
),
),
),
]
),
)
]
)
class WriteType(Enum):
"""Types of write operations."""
OVERWRITE = "overwrite"
COMPLETE = "complete"
APPEND = "append"
UPDATE = "update"
MERGE = "merge"
ERROR_IF_EXISTS = "error"
IGNORE_IF_EXISTS = "ignore"
@dataclass
class InputSpec(object):
"""Specification of an algorithm input.
This is very aligned with the way the execution environment connects to the sources
(e.g., spark sources).
- spec_id: spec_id of the input specification
- read_type: ReadType type of read
operation.
- data_format: format of the input.
- sftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp
directory.
- df_name: dataframe name.
- db_table: table name in the form of `<db>.<table>`.
- location: uri that identifies from where to read data in the
specified format.
- sharepoint_opts: Options to apply when reading from Sharepoint.
- enforce_schema_from_table: if we want to enforce the table schema or not,
by providing a table name in the form of `<db>.<table>`.
- query: sql query to execute and return the dataframe. Use it if you do not want
to read from a file system nor from a table, but rather from a sql query.
- schema: dict representation of a schema of the input (e.g., Spark struct
type schema).
- schema_path: path to a file with a representation of a schema of the
input (e.g., Spark struct type schema).
- disable_dbfs_retry: optional flag to disable file storage dbfs.
- with_filepath: if we want to include the path of the file that is being
read. Only
works with the file reader (batch and streaming modes are supported).
- options: dict with other relevant options according to the execution
environment (e.g., spark) possible sources.
- calculate_upper_bound: whether to calculate the upper bound to extract from
SAP BW or not.
- calc_upper_bound_schema: specific schema for the calculated upper_bound.
- generate_predicates: whether to generate predicates to extract from SAP BW or not.
- predicates_add_null: if we want to include an `is null` clause on the
partition by predicates.
- temp_view: optional name of a view to point to the input dataframe to be used
to create or replace a temp view on top of the dataframe.
"""
spec_id: str
read_type: str
data_format: Optional[str] = None
sftp_files_format: Optional[str] = None
df_name: Optional[DataFrame] = None
db_table: Optional[str] = None
location: Optional[str] = None
sharepoint_opts: Optional[SharepointOptions] = None
query: Optional[str] = None
enforce_schema_from_table: Optional[str] = None
schema: Optional[dict] = None
schema_path: Optional[str] = None
disable_dbfs_retry: bool = False
with_filepath: bool = False
options: Optional[dict] = None
jdbc_args: Optional[dict] = None
calculate_upper_bound: bool = False
calc_upper_bound_schema: Optional[str] = None
generate_predicates: bool = False
predicates_add_null: bool = True
temp_view: Optional[str] = None
def __post_init__(self) -> None:
"""Normalize Sharepoint options if passed as a raw dictionary.
Args:
self: Instance of the class where `sharepoint_opts` attribute
may be either a dictionary or a SharepointOptions object.
"""
if isinstance(self.sharepoint_opts, dict):
self.sharepoint_opts = SharepointOptions(**self.sharepoint_opts)
@dataclass
class TransformerSpec(object):
"""Transformer Specification, i.e., a single transformation amongst many.
- function: name of the function (or callable function) to be executed.
- args: (not applicable if using a callable function) dict with the arguments
to pass to the function, as key/value pairs with the name of the parameter of
the function and the respective value.
"""
function: str
args: dict
@dataclass
class TransformSpec(object):
"""Transformation Specification.
I.e., the specification that defines the many transformations to be done to the data
that was read.
- spec_id: id of the transform specification.
- input_id: id of the corresponding input specification.
- transformers: list of transformers to execute.
- force_streaming_foreach_batch_processing: sometimes, when using streaming, we want
to force the transform to be executed in the foreachBatch function to ensure
non-supported streaming operations can be properly executed.
"""
spec_id: str
input_id: str
transformers: List[TransformerSpec]
force_streaming_foreach_batch_processing: bool = False
class DQType(Enum):
"""Available data quality tasks."""
VALIDATOR = "validator"
PRISMA = "prisma"
class DQResultFormat(Enum):
"""Available data quality result formats."""
COMPLETE = "COMPLETE"
class DQExecutionPoint(Enum):
"""Available data quality execution points."""
IN_MOTION = "in_motion"
AT_REST = "at_rest"
class DQTableBaseParameters(Enum):
"""Base parameters for importing DQ rules from a table."""
PRISMA_BASE_PARAMETERS = ["arguments", "dq_tech_function"]
@dataclass
class DQFunctionSpec(object):
"""Defines a data quality function specification.
- function - name of the data quality function (expectation) to execute.
It follows the great_expectations api https://greatexpectations.io/expectations/.
- args - args of the function (expectation). Follow the same api as above.
"""
function: str
args: Optional[dict] = None
@dataclass
class DQSpec(object):
"""Data quality overall specification.
- spec_id - id of the specification.
- input_id - id of the input specification.
- dq_type - type of DQ process to execute (e.g. validator).
- dq_functions - list of function specifications to execute.
- dq_db_table - name of table to derive the dq functions from.
- dq_table_table_filter - name of the table whose rules are to be applied in the
validations (Only used when deriving dq functions).
- dq_table_extra_filters - extra filters to be used when deriving dq functions.
This is a sql expression to be applied to the dq_db_table.
- execution_point - execution point of the dq functions. [at_rest, in_motion].
This is set during the load_data or dq_validator functions.
- unexpected_rows_pk - the list of columns composing the primary key of the
source data, used to identify the rows failing the DQ validations. Note: only one
of the tbl_to_derive_pk or unexpected_rows_pk arguments needs to be provided. It
is mandatory to provide one of these arguments when tag_source_data is True.
When tag_source_data is False, this is not mandatory, but still recommended.
- tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.
Note: only one of the tbl_to_derive_pk or unexpected_rows_pk arguments needs to
be provided. It is mandatory to provide one of these arguments when
tag_source_data is True. When tag_source_data is False, this is not
mandatory, but still recommended.
- gx_result_format - great expectations result format. Default: "COMPLETE".
- tag_source_data - when set to true, this will ensure that the DQ process ends by
tagging the source data with an additional column with information about the
DQ results. This column makes it possible to identify if the DQ run
succeeded in general and, if not, which specific rows made the DQ
validations fail and why. Default: False.
Note: it only works if result_sink_explode is True, gx_result_format is
COMPLETE, fail_on_error is False (which is done automatically when
you specify tag_source_data as True) and tbl_to_derive_pk or
unexpected_rows_pk is configured.
- store_backend - which store_backend to use (e.g. s3 or file_system).
- local_fs_root_dir - path of the root directory. Note: only applicable for
store_backend file_system.
- bucket - the bucket name to consider for the store_backend (store DQ artefacts).
Note: only applicable for store_backend s3.
- expectations_store_prefix - prefix where to store expectations' data. Note: only
applicable for store_backend s3.
- validations_store_prefix - prefix where to store validations' data. Note: only
applicable for store_backend s3.
- checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only
applicable for store_backend s3.
- data_asset_name - name of the data asset to consider when configuring the great
expectations' data source.
- expectation_suite_name - name to consider for great expectations' suite.
- result_sink_db_table - db.table_name indicating the database and table in which
to save the results of the DQ process.
- result_sink_location - file system location in which to save the results of the
DQ process.
- result_sink_chunk_size - number of records per chunk when writing the results of
the DQ process. Default: 1000000 records.
- processed_keys_location - file system location where the keys processed by the
DQ Process are saved. This is specifically used when the DQ Type is PRISMA.
Note that this location is always constructed during the process, so any
value defined in the configuration will be overwritten.
- data_product_name - name of the data product.
- result_sink_partitions - the list of partitions to consider.
- result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
- result_sink_options - extra spark options for configuring the result sink.
E.g: can be used to configure a Kafka sink if result_sink_format is kafka.
- result_sink_explode - flag to determine if the output table/location should have
the columns exploded (as True) or not (as False). Default: True.
- result_sink_extra_columns - list of extra columns to be exploded (following
the pattern "<column>.*") or columns to be selected. It is only used when
result_sink_explode is set to True.
- source - name of the data source, to make it easier to identify in analysis. If not
specified, a default value is set. This will only be used
when result_sink_explode is set to True.
- fail_on_error - whether to fail the algorithm if the validations of your data in
the DQ process failed.
- cache_df - whether to cache the dataframe before running the DQ process or not.
- critical_functions - functions that should not fail. When this argument is
defined, fail_on_error is nullified.
- max_percentage_failure - percentage of failure that should be allowed.
This argument has priority over both fail_on_error and critical_functions.
- enable_row_condition - flag to determine if the row_conditions should be
enabled or not. row_conditions allow you to filter the rows that are
processed by the DQ functions. This is useful when you want to run the
DQ functions only on a subset of the data. Default: False. Note: When using PRISMA,
if you enable this flag, bear in mind that the number of processed keys will be
numerically different from the evaluated keys. This happens because the
row_conditions limit the number of rows that are processed by the DQ functions,
but we consider processed keys to be all the keys that are passed to the dq_spec.
"""
spec_id: str
input_id: str
dq_type: str
dq_functions: Optional[List[DQFunctionSpec]] = None
dq_db_table: Optional[str] = None
dq_table_table_filter: Optional[str] = None
dq_table_extra_filters: Optional[str] = None
execution_point: Optional[str] = None
unexpected_rows_pk: Optional[List[str]] = None
tbl_to_derive_pk: Optional[str] = None
gx_result_format: Optional[str] = DQResultFormat.COMPLETE.value
tag_source_data: Optional[bool] = False
store_backend: str = DQDefaults.STORE_BACKEND.value
local_fs_root_dir: Optional[str] = None
bucket: Optional[str] = None
expectations_store_prefix: str = DQDefaults.EXPECTATIONS_STORE_PREFIX.value
validations_store_prefix: str = DQDefaults.VALIDATIONS_STORE_PREFIX.value
checkpoint_store_prefix: str = DQDefaults.CHECKPOINT_STORE_PREFIX.value
data_asset_name: Optional[str] = None
expectation_suite_name: Optional[str] = None
result_sink_db_table: Optional[str] = None
result_sink_location: Optional[str] = None
result_sink_chunk_size: Optional[int] = 1000000
processed_keys_location: Optional[str] = None
data_product_name: Optional[str] = None
result_sink_partitions: Optional[List[str]] = None
result_sink_format: str = OutputFormat.DELTAFILES.value
result_sink_options: Optional[dict] = None
result_sink_explode: bool = True
result_sink_extra_columns: Optional[List[str]] = None
source: Optional[str] = None
fail_on_error: bool = True
cache_df: bool = False
critical_functions: Optional[List[DQFunctionSpec]] = None
max_percentage_failure: Optional[float] = None
enable_row_condition: bool = False
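# Example (illustrative sketch): a validator DQSpec that tags the source data with
# the DQ results. Tagging requires result_sink_explode=True (default),
# gx_result_format="COMPLETE" (default) and unexpected_rows_pk (or
# tbl_to_derive_pk); fail_on_error is adjusted automatically when
# tag_source_data is True. Ids, table and column names are hypothetical.
#
#   sales_dq = DQSpec(
#       spec_id="sales_dq",
#       input_id="sales_silver_transform",
#       dq_type=DQType.VALIDATOR.value,
#       dq_functions=[
#           DQFunctionSpec(
#               function="expect_column_values_to_not_be_null",
#               args={"column": "order_id"},
#           ),
#       ],
#       unexpected_rows_pk=["order_id"],
#       tag_source_data=True,
#       result_sink_db_table="my_database.dq_results",
#   )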
@dataclass
class MergeOptions(object):
"""Options for a merge operation.
- merge_predicate: predicate to apply to the merge operation so that we can
check if a new record corresponds to a record already included in the
historical data.
- insert_only: indicates if the merge should only insert data (e.g., deduplicate
scenarios).
- delete_predicate: predicate to apply to the delete operation.
- update_predicate: predicate to apply to the update operation.
- insert_predicate: predicate to apply to the insert operation.
- update_column_set: rules to apply to the update operation which allows to
set the value for each column to be updated.
(e.g. {"data": "new.data", "count": "current.count + 1"} )
- insert_column_set: rules to apply to the insert operation which allows to
set the value for each column to be inserted.
(e.g. {"date": "updates.date", "count": "1"} )
"""
merge_predicate: str
insert_only: bool = False
delete_predicate: Optional[str] = None
update_predicate: Optional[str] = None
insert_predicate: Optional[str] = None
update_column_set: Optional[dict] = None
insert_column_set: Optional[dict] = None
@dataclass
class OutputSpec(object):
"""Specification of an algorithm output.
This is very aligned with the way the execution environment connects to the output
systems (e.g., spark outputs).
- spec_id: id of the output specification.
- input_id: id of the corresponding input specification.
- write_type: type of write operation.
- data_format: format of the output. Defaults to DELTA.
- db_table: table name in the form of `db_name.table_name`.
- location: uri that identifies from where to write data in the specified format.
- sharepoint_opts: options to apply on writing on Sharepoint operations.
- partitions: list of partition input_col names.
- merge_opts: options to apply to the merge operation.
- streaming_micro_batch_transformers: transformers to invoke for each streaming
micro batch, before writing (i.e., in Spark's foreachBatch structured
streaming function). Note: the lakehouse engine manages this for you, so we don't
advise manually specifying transformations through this parameter. Supply
them as regular transformers in the transform_specs section of an ACON.
- streaming_once: if the streaming query is to be executed just once, or not,
generating just one micro batch.
- streaming_processing_time: if streaming query is to be kept alive, this indicates
the processing time of each micro batch.
- streaming_available_now: if set to True, set a trigger that processes all
available data in multiple batches then terminates the query.
When using streaming, this is the default trigger that the lakehouse-engine will
use, unless you configure a different one.
- streaming_continuous: set a trigger that runs a continuous query with a given
checkpoint interval.
- streaming_await_termination: whether to wait (True) for the termination of the
streaming query (e.g. timeout or exception) or not (False). Default: True.
- streaming_await_termination_timeout: a timeout to set to the
streaming_await_termination. Default: None.
- with_batch_id: whether to include the streaming batch id in the final data,
or not. It only takes effect in streaming mode.
- options: dict with other relevant options according to the execution environment
(e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for
streaming, etc.
- streaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers
but for the DQ functions to be executed. Used internally by the lakehouse
engine, so you don't have to supply DQ functions through this parameter. Use the
dq_specs of the acon instead.
"""
spec_id: str
input_id: str
write_type: str
data_format: str = OutputFormat.DELTAFILES.value
db_table: Optional[str] = None
location: Optional[str] = None
sharepoint_opts: Optional[SharepointOptions] = None
merge_opts: Optional[MergeOptions] = None
partitions: Optional[List[str]] = None
streaming_micro_batch_transformers: Optional[List[TransformerSpec]] = None
streaming_once: Optional[bool] = None
streaming_processing_time: Optional[str] = None
streaming_available_now: bool = True
streaming_continuous: Optional[str] = None
streaming_await_termination: bool = True
streaming_await_termination_timeout: Optional[int] = None
with_batch_id: bool = False
options: Optional[dict] = None
streaming_micro_batch_dq_processors: Optional[List[DQSpec]] = None
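# Example (illustrative sketch): an OutputSpec performing a merge write into a
# Delta table, using MergeOptions to update matched rows. Ids, table name and
# predicates are hypothetical; the write_type literal is an assumption, so check
# the engine's write type definitions.
#
#   sales_output = OutputSpec(
#       spec_id="sales_silver_output",
#       input_id="sales_silver_transform",
#       write_type="merge",  # assumed literal
#       db_table="my_database.sales_silver",
#       merge_opts=MergeOptions(
#           merge_predicate="current.order_id = new.order_id",
#           update_column_set={"quantity": "new.quantity"},
#       ),
#       partitions=["order_date"],
#   )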
@dataclass
class TerminatorSpec(object):
"""Terminator Specification.
I.e., the specification that defines a terminator operation to be executed. Examples
are compute statistics, vacuum, optimize, etc.
- function: terminator function to execute.
- args: arguments of the terminator function.
- input_id: id of the corresponding output specification (Optional).
"""
function: str
args: Optional[dict] = None
input_id: Optional[str] = None
@dataclass
class ReconciliatorSpec(object):
"""Reconciliator Specification.
- metrics: list of metrics in the form of:
[{
metric: name of the column present in both truth and current datasets,
aggregation: sum, avg, max, min, ...,
type: percentage or absolute,
yellow: value,
red: value
}].
- recon_type: reconciliation type (percentage or absolute). Percentage calculates
the difference between truth and current results as a percentage ((x - y) / x), and
absolute calculates the raw difference (x - y).
- truth_input_spec: input specification of the truth data.
- current_input_spec: input specification of the current results data.
- truth_preprocess_query: additional query on top of the truth input data to
preprocess the truth data before it is fed into the reconciliation process.
Important note: you need to assume that the data coming out of
the truth_input_spec is referenceable via a table called 'truth'.
- truth_preprocess_query_args: optional dict having the functions/transformations to
apply on top of the truth_preprocess_query and respective arguments. Note: cache
is being applied on the Dataframe, by default. For turning the default behavior
off, pass `"truth_preprocess_query_args": []`.
- current_preprocess_query: additional query on top of the current results input
data to preprocess the current results data before it is fed into the
reconciliation process. Important note: you need to assume that the data coming
out of the current_input_spec is referenceable via a table called 'current'.
- current_preprocess_query_args: optional dict having the
functions/transformations to apply on top of the current_preprocess_query
and respective arguments. Note: cache is being applied on the Dataframe,
by default. For turning the default behavior off, pass
`"current_preprocess_query_args": []`.
- ignore_empty_df: optional boolean to skip the recon process if the source and
target dataframes are empty; in that case the recon exits with a success code (passed).
"""
metrics: List[dict]
truth_input_spec: InputSpec
current_input_spec: InputSpec
truth_preprocess_query: Optional[str] = None
truth_preprocess_query_args: Optional[List[dict]] = None
current_preprocess_query: Optional[str] = None
current_preprocess_query_args: Optional[List[dict]] = None
ignore_empty_df: Optional[bool] = False
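# Example (illustrative sketch): reconciling one metric between a truth dataset and
# the current results, with yellow/red thresholds on the percentage difference.
# Input specs, table names and the metric column are hypothetical.
#
#   recon_spec = ReconciliatorSpec(
#       metrics=[
#           {
#               "metric": "net_sales",
#               "aggregation": "sum",
#               "type": "percentage",
#               "yellow": 0.05,
#               "red": 0.1,
#           }
#       ],
#       truth_input_spec=InputSpec(
#           spec_id="truth",
#           read_type=ReadType.BATCH.value,
#           db_table="my_database.sales_truth",
#       ),
#       current_input_spec=InputSpec(
#           spec_id="current",
#           read_type=ReadType.BATCH.value,
#           db_table="my_database.sales_silver",
#       ),
#       truth_preprocess_query="SELECT * FROM truth WHERE order_date >= '2024-01-01'",
#   )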
@dataclass
class DQValidatorSpec(object):
"""Data Quality Validator Specification.
- input_spec: input specification of the data to be checked/validated.
- dq_spec: data quality specification.
- restore_prev_version: specify if, having
delta table/files as input, they should be restored to the
previous version if the data quality process fails. Note: this
is only considered if fail_on_error is kept as True.
"""
input_spec: InputSpec
dq_spec: DQSpec
restore_prev_version: Optional[bool] = False
class SQLDefinitions(Enum):
"""SQL definitions statements."""
compute_table_stats = "ANALYZE TABLE {} COMPUTE STATISTICS"
drop_table_stmt = "DROP TABLE IF EXISTS"
drop_view_stmt = "DROP VIEW IF EXISTS"
truncate_stmt = "TRUNCATE TABLE"
describe_stmt = "DESCRIBE TABLE"
optimize_stmt = "OPTIMIZE"
show_tbl_props_stmt = "SHOW TBLPROPERTIES"
delete_where_stmt = "DELETE FROM {} WHERE {}"
class FileManagerAPIKeys(Enum):
"""File Manager s3 api keys."""
CONTENTS = "Contents"
KEY = "Key"
CONTINUATION = "NextContinuationToken"
BUCKET = "Bucket"
OBJECTS = "Objects"
@dataclass
class SensorSpec(object):
"""Sensor Specification.
- sensor_id: sensor id.
- assets: a list of assets that are considered as available to
consume downstream after this sensor has status
PROCESSED_NEW_DATA.
- control_db_table_name: db.table to store sensor metadata.
- input_spec: input specification of the source to be checked for new data.
- preprocess_query: SQL query to transform/filter the result from the
upstream. Consider that we should refer to 'new_data' whenever
we are referring to the input of the sensor. E.g.:
"SELECT dummy_col FROM new_data WHERE ..."
- checkpoint_location: optional location to store checkpoints to resume
from. These checkpoints use the same strategy as Spark checkpoints.
For Spark readers that do not support checkpoints, use the
preprocess_query parameter to form a SQL query to filter the result
from the upstream accordingly.
- fail_on_empty_result: if the sensor should throw an error if there is no new data
in the upstream. Default: True.
"""
sensor_id: str
assets: List[str]
control_db_table_name: str
input_spec: InputSpec
preprocess_query: Optional[str]
checkpoint_location: Optional[str]
fail_on_empty_result: bool = True
@classmethod
def create_from_acon(cls, acon: dict): # type: ignore
"""Create SensorSpec from acon.
Args:
acon: sensor ACON.
"""
checkpoint_location = acon.get("base_checkpoint_location")
if checkpoint_location:
checkpoint_location = (
f"{checkpoint_location.rstrip('/')}/lakehouse_engine/"
f"sensors/{acon['sensor_id']}"
)
return cls(
sensor_id=acon["sensor_id"],
assets=acon["assets"],
control_db_table_name=acon["control_db_table_name"],
input_spec=InputSpec(**acon["input_spec"]),
preprocess_query=acon.get("preprocess_query"),
checkpoint_location=checkpoint_location,
fail_on_empty_result=acon.get("fail_on_empty_result", True),
)
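# Example (illustrative sketch): building a SensorSpec from a sensor ACON. With the
# hypothetical values below, create_from_acon derives the checkpoint location
# "s3://my-bucket/checkpoints/lakehouse_engine/sensors/sensor_sales".
#
#   sensor_spec = SensorSpec.create_from_acon(
#       {
#           "sensor_id": "sensor_sales",
#           "assets": ["sales_bronze"],
#           "control_db_table_name": "my_database.lakehouse_engine_sensors",
#           "input_spec": {
#               "spec_id": "sensor_upstream",
#               "read_type": ReadType.STREAMING.value,
#               "data_format": InputFormat.DELTAFILES.value,
#               "db_table": "my_database.upstream_sales",
#           },
#           "preprocess_query": "SELECT COUNT(1) AS count FROM new_data",
#           "base_checkpoint_location": "s3://my-bucket/checkpoints",
#       }
#   )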
class SensorStatus(Enum):
"""Status for a sensor."""
ACQUIRED_NEW_DATA = "ACQUIRED_NEW_DATA"
PROCESSED_NEW_DATA = "PROCESSED_NEW_DATA"
SENSOR_SCHEMA = StructType(
[
StructField("sensor_id", StringType(), False),
StructField("assets", ArrayType(StringType(), False), True),
StructField("status", StringType(), False),
StructField("status_change_timestamp", TimestampType(), False),
StructField("checkpoint_location", StringType(), True),
StructField("upstream_key", StringType(), True),
StructField("upstream_value", StringType(), True),
]
)
SENSOR_UPDATE_SET: dict = {
"sensors.sensor_id": "updates.sensor_id",
"sensors.status": "updates.status",
"sensors.status_change_timestamp": "updates.status_change_timestamp",
}
SENSOR_ALLOWED_DATA_FORMATS = {
ReadType.STREAMING.value: [InputFormat.KAFKA.value, *FILE_INPUT_FORMATS],
ReadType.BATCH.value: [
InputFormat.DELTAFILES.value,
InputFormat.JDBC.value,
],
}
class SAPLogchain(Enum):
"""Defaults used on consuming data from SAP Logchain."""
DBTABLE = "SAPPHA.RSPCLOGCHAIN"
GREEN_STATUS = "G"
ENGINE_TABLE = "sensor_new_data"
class RestoreType(Enum):
"""Archive types."""
BULK = "Bulk"
STANDARD = "Standard"
EXPEDITED = "Expedited"
@classmethod
def values(cls): # type: ignore
"""Generates a list containing all enum values.
Returns:
A list with all enum values.
"""
return (c.value for c in cls)
@classmethod
def exists(cls, restore_type: str) -> bool:
"""Checks if the restore type exists in the enum values.
Args:
restore_type: restore type to check if exists.
Returns:
If the restore type exists in our enum.
"""
return restore_type in cls.values()
class RestoreStatus(Enum):
"""Archive types."""
NOT_STARTED = "not_started"
ONGOING = "ongoing"
RESTORED = "restored"
ARCHIVE_STORAGE_CLASS = [
"GLACIER",
"DEEP_ARCHIVE",
"GLACIER_IR",
]
class SQLParser(Enum):
"""Defaults to use for parsing."""
DOUBLE_QUOTES = '"'
SINGLE_QUOTES = "'"
BACKSLASH = "\\"
SINGLE_TRACE = "-"
DOUBLE_TRACES = "--"
SLASH = "/"
OPENING_MULTIPLE_LINE_COMMENT = "/*"
CLOSING_MULTIPLE_LINE_COMMENT = "*/"
PARAGRAPH = "\n"
STAR = "*"
MULTIPLE_LINE_COMMENT = [
OPENING_MULTIPLE_LINE_COMMENT,
CLOSING_MULTIPLE_LINE_COMMENT,
]
class GABDefaults(Enum):
"""Defaults used on the GAB process."""
DATE_FORMAT = "%Y-%m-%d"
DIMENSIONS_DEFAULT_COLUMNS = ["from_date", "to_date"]
DEFAULT_DIMENSION_CALENDAR_TABLE = "dim_calendar"
DEFAULT_LOOKUP_QUERY_BUILDER_TABLE = "lkp_query_builder"
class GABStartOfWeek(Enum):
"""Representation of start of week values on GAB."""
SUNDAY = "S"
MONDAY = "M"
@classmethod
def get_start_of_week(cls) -> dict:
"""Get the start of week enum as a dict.
Returns:
dict containing all enum entries as `{name:value}`.
"""
return {
start_of_week.name: start_of_week.value for start_of_week in GABStartOfWeek
}
@classmethod
def get_values(cls) -> set[str]:
"""Get the start of week enum values as set.
Returns:
set containing all possible values `{value}`.
"""
return {start_of_week.value for start_of_week in GABStartOfWeek}
@dataclass
class GABSpec(object):
"""Gab Specification.
- query_label_filter: query use-case label to execute.
- queue_filter: queue to execute the job.
- cadence_filter: selected cadences to build the asset.
- target_database: target database to write.
- current_date: current date.
- start_date: period start date.
- end_date: period end date.
- rerun_flag: rerun flag.
- target_table: target table to write.
- source_database: source database.
- gab_base_path: base path to read the use cases.
- lookup_table: gab configuration table.
- calendar_table: gab calendar table.
"""
query_label_filter: list[str]
queue_filter: list[str]
cadence_filter: list[str]
target_database: str
current_date: datetime
start_date: datetime
end_date: datetime
rerun_flag: str
target_table: str
source_database: str
gab_base_path: str
lookup_table: str
calendar_table: str
@classmethod
def create_from_acon(cls, acon: dict): # type: ignore
"""Create GabSpec from acon.
Args:
acon: gab ACON.
"""
lookup_table = f"{acon['source_database']}." + (
acon.get(
"lookup_table", GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE.value
)
)
calendar_table = f"{acon['source_database']}." + (
acon.get(
"calendar_table", GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE.value
)
)
def format_date(date_to_format: datetime | str) -> datetime:
if isinstance(date_to_format, str):
return datetime.strptime(date_to_format, GABDefaults.DATE_FORMAT.value)
else:
return date_to_format
return cls(
query_label_filter=acon["query_label_filter"],
queue_filter=acon["queue_filter"],
cadence_filter=acon["cadence_filter"],
target_database=acon["target_database"],
current_date=datetime.now(),
start_date=format_date(acon["start_date"]),
end_date=format_date(acon["end_date"]),
rerun_flag=acon["rerun_flag"],
target_table=acon["target_table"],
source_database=acon["source_database"],
gab_base_path=acon["gab_base_path"],
lookup_table=lookup_table,
calendar_table=calendar_table,
)
class GABCadence(Enum):
"""Representation of the supported cadences on GAB."""
DAY = 1
WEEK = 2
MONTH = 3
QUARTER = 4
YEAR = 5
@classmethod
def get_ordered_cadences(cls) -> dict:
"""Get the cadences ordered by the value.
Returns:
dict containing ordered cadences as `{name:value}`.
"""
return {
cadence.name: cadence.value
for cadence in sorted(GABCadence, key=lambda gab_cadence: gab_cadence.value)
}
@classmethod
def get_cadences(cls) -> set[str]:
"""Get the cadences values as set.
Returns:
set containing all possible cadence values as `{value}`.
"""
return {cadence.name for cadence in GABCadence}
@classmethod
def order_cadences(cls, cadences_to_order: list[str]) -> list[str]:
"""Order a list of cadences by value.
Returns:
ordered list containing the received cadences.
"""
return sorted(
cadences_to_order,
key=lambda item: cls.get_ordered_cadences().get(item), # type: ignore
)
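# Example (illustrative sketch): ordering cadence names by their enum value.
#
#   GABCadence.order_cadences(["YEAR", "DAY", "MONTH"])
#   # -> ["DAY", "MONTH", "YEAR"]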
class GABKeys:
"""Constants used to update pre-configured gab dict key."""
JOIN_SELECT = "join_select"
PROJECT_START = "project_start"
PROJECT_END = "project_end"
class GABReplaceableKeys:
"""Constants used to replace pre-configured gab dict values."""
CADENCE = "${cad}"
DATE_COLUMN = "${date_column}"
CONFIG_WEEK_START = "${config_week_start}"
RECONCILIATION_CADENCE = "${rec_cadence}"
class GABCombinedConfiguration(Enum):
"""GAB combined configuration.
Based on the use case configuration, return the values to override in the SQL file.
This enum aims to exhaustively map each combination of `cadence`, `reconciliation`,
`week_start` and `snap_flag` to the corresponding values of `join_select`,
`project_start` and `project_end`, which replace those values in the stages SQL file.
I.e., it returns the corresponding configuration (join_select, project_start,
project_end) for each combination (cadence x recon x week_start x snap_flag).
"""
_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE = (
"date(date_trunc('${cad}',${date_column}))"
)
_DEFAULT_PROJECT_START = "df_cal.cadence_start_date"
_DEFAULT_PROJECT_END = "df_cal.cadence_end_date"
COMBINED_CONFIGURATION = {
# Combination of:
# - cadence: `DAY`
# - reconciliation_window: `DAY`, `WEEK`, `MONTH`, `QUARTER`, `YEAR`
# - week_start: `S`, `M`
# - snapshot_flag: `Y`, `N`
1: {
"cadence": GABCadence.DAY.name,
"recon": GABCadence.get_cadences(),
"week_start": GABStartOfWeek.get_values(),
"snap_flag": {"Y", "N"},
"join_select": "",
"project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
"project_end": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
},
# Combination of:
# - cadence: `WEEK`
# - reconciliation_window: `DAY`
# - week_start: `S`, `M`
# - snapshot_flag: `Y`
2: {
"cadence": GABCadence.WEEK.name,
"recon": GABCadence.DAY.name,
"week_start": GABStartOfWeek.get_values(),
"snap_flag": "Y",
"join_select": """
select distinct case
when '${config_week_start}' = 'Monday' then weekstart_mon
when '${config_week_start}' = 'Sunday' then weekstart_sun
end as cadence_start_date,
calendar_date as cadence_end_date
""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
# Combination of:
# - cadence: `WEEK`
# - reconciliation_window: `DAY`, `MONTH`, `QUARTER`, `YEAR`
# - week_start: `M`
# - snapshot_flag: `Y`, `N`
3: {
"cadence": GABCadence.WEEK.name,
"recon": {
GABCadence.DAY.name,
GABCadence.MONTH.name,
GABCadence.QUARTER.name,
GABCadence.YEAR.name,
},
"week_start": "M",
"snap_flag": {"Y", "N"},
"join_select": """
select distinct case
when '${config_week_start}' = 'Monday' then weekstart_mon
when '${config_week_start}' = 'Sunday' then weekstart_sun
end as cadence_start_date,
case
when '${config_week_start}' = 'Monday' then weekend_mon
when '${config_week_start}' = 'Sunday' then weekend_sun
end as cadence_end_date""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
4: {
"cadence": GABCadence.MONTH.name,
"recon": GABCadence.DAY.name,
"week_start": GABStartOfWeek.get_values(),
"snap_flag": "Y",
"join_select": """
select distinct month_start as cadence_start_date,
calendar_date as cadence_end_date
""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
5: {
"cadence": GABCadence.MONTH.name,
"recon": GABCadence.WEEK.name,
"week_start": GABStartOfWeek.MONDAY.value,
"snap_flag": "Y",
"join_select": """
select distinct month_start as cadence_start_date,
case
when date(
date_trunc('MONTH',add_months(calendar_date, 1))
)-1 < weekend_mon
then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
else weekend_mon
end as cadence_end_date""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
6: {
"cadence": GABCadence.MONTH.name,
"recon": GABCadence.WEEK.name,
"week_start": GABStartOfWeek.SUNDAY.value,
"snap_flag": "Y",
"join_select": """
select distinct month_start as cadence_start_date,
case
when date(
date_trunc('MONTH',add_months(calendar_date, 1))
)-1 < weekend_sun
then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
else weekend_sun
end as cadence_end_date""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
7: {
"cadence": GABCadence.MONTH.name,
"recon": GABCadence.get_cadences(),
"week_start": GABStartOfWeek.get_values(),
"snap_flag": {"Y", "N"},
"join_select": "",
"project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
"project_end": "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1",
},
8: {
"cadence": GABCadence.QUARTER.name,
"recon": GABCadence.DAY.name,
"week_start": GABStartOfWeek.get_values(),
"snap_flag": "Y",
"join_select": """
select distinct quarter_start as cadence_start_date,
calendar_date as cadence_end_date
""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
9: {
"cadence": GABCadence.QUARTER.name,
"recon": GABCadence.WEEK.name,
"week_start": GABStartOfWeek.MONDAY.value,
"snap_flag": "Y",
"join_select": """
select distinct quarter_start as cadence_start_date,
case
when weekend_mon > date(
date_trunc('QUARTER',add_months(calendar_date, 3))
)-1
then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
else weekend_mon
end as cadence_end_date""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
10: {
"cadence": GABCadence.QUARTER.name,
"recon": GABCadence.WEEK.name,
"week_start": GABStartOfWeek.SUNDAY.value,
"snap_flag": "Y",
"join_select": """
select distinct quarter_start as cadence_start_date,
case
when weekend_sun > date(
date_trunc('QUARTER',add_months(calendar_date, 3))
)-1
then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
else weekend_sun
end as cadence_end_date""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
11: {
"cadence": GABCadence.QUARTER.name,
"recon": GABCadence.MONTH.name,
"week_start": GABStartOfWeek.get_values(),
"snap_flag": "Y",
"join_select": """
select distinct quarter_start as cadence_start_date,
month_end as cadence_end_date
""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
12: {
"cadence": GABCadence.QUARTER.name,
"recon": GABCadence.YEAR.name,
"week_start": GABStartOfWeek.get_values(),
"snap_flag": "N",
"join_select": "",
"project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
"project_end": """
date(
date_trunc(
'${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)
)
)-1
""",
},
13: {
"cadence": GABCadence.QUARTER.name,
"recon": GABCadence.get_cadences(),
"week_start": GABStartOfWeek.get_values(),
"snap_flag": "N",
"join_select": "",
"project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
"project_end": """
date(
date_trunc(
'${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)
)
)-1
""",
},
14: {
"cadence": GABCadence.YEAR.name,
"recon": GABCadence.WEEK.name,
"week_start": GABStartOfWeek.MONDAY.value,
"snap_flag": "Y",
"join_select": """
select distinct year_start as cadence_start_date,
case
when weekend_mon > date(
date_trunc('YEAR',add_months(calendar_date, 12))
)-1
then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
else weekend_mon
end as cadence_end_date""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
15: {
"cadence": GABCadence.YEAR.name,
"recon": GABCadence.WEEK.name,
"week_start": GABStartOfWeek.SUNDAY.value,
"snap_flag": "Y",
"join_select": """
select distinct year_start as cadence_start_date,
case
when weekend_sun > date(
date_trunc('YEAR',add_months(calendar_date, 12))
)-1
then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
else weekend_sun
end as cadence_end_date""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
16: {
"cadence": GABCadence.YEAR.name,
"recon": GABCadence.get_cadences(),
"week_start": GABStartOfWeek.get_values(),
"snap_flag": "N",
"inverse_flag": "Y",
"join_select": "",
"project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
"project_end": """
date(
date_trunc(
'${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)
)
)-1
""",
},
17: {
"cadence": GABCadence.YEAR.name,
"recon": {
GABCadence.DAY.name,
GABCadence.MONTH.name,
GABCadence.QUARTER.name,
},
"week_start": GABStartOfWeek.get_values(),
"snap_flag": "Y",
"join_select": """
select distinct year_start as cadence_start_date,
case
when '${rec_cadence}' = 'DAY' then calendar_date
when '${rec_cadence}' = 'MONTH' then month_end
when '${rec_cadence}' = 'QUARTER' then quarter_end
end as cadence_end_date
""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
18: {
"cadence": GABCadence.get_cadences(),
"recon": GABCadence.get_cadences(),
"week_start": GABStartOfWeek.get_values(),
"snap_flag": {"Y", "N"},
"join_select": """
select distinct
case
when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
then weekstart_mon
when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
then weekstart_sun
else
date(date_trunc('${cad}',calendar_date))
end as cadence_start_date,
case
when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
then weekend_mon
when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
then weekend_sun
when '${cad}' = 'DAY'
then date(date_trunc('${cad}',calendar_date))
when '${cad}' = 'MONTH'
then date(
date_trunc(
'MONTH',
add_months(date(date_trunc('${cad}',calendar_date)), 1)
)
)-1
when '${cad}' = 'QUARTER'
then date(
date_trunc(
'QUARTER',
add_months(date(date_trunc('${cad}',calendar_date)) , 3)
)
)-1
when '${cad}' = 'YEAR'
then date(
date_trunc(
'YEAR',
add_months(date(date_trunc('${cad}',calendar_date)), 12)
)
)-1
end as cadence_end_date
""",
"project_start": _DEFAULT_PROJECT_START,
"project_end": _DEFAULT_PROJECT_END,
},
}
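# Example (illustrative sketch of how a combination could be resolved against this
# mapping; the actual matching logic lives in the GAB utilities, so this is only
# indicative). Each entry may hold either a single value or a set per key:
#
#   def _matches(entry: dict, cadence: str, recon: str, week_start: str, snap: str) -> bool:
#       def _in(value, configured):
#           if isinstance(configured, (set, list, tuple)):
#               return value in configured
#           return value == configured
#       return (
#           _in(cadence, entry["cadence"])
#           and _in(recon, entry["recon"])
#           and _in(week_start, entry["week_start"])
#           and _in(snap, entry["snap_flag"])
#       )
#
#   config = GABCombinedConfiguration.COMBINED_CONFIGURATION.value
#   matched = next(
#       entry for entry in config.values()
#       if _matches(entry, "MONTH", "DAY", "S", "Y")
#   )  # -> combination 4 above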
@dataclass
class HeartbeatConfigSpec(object):
"""Heartbeat Configurations and control table specifications.
This provides the way in which the Heartbeat can pass environment-specific and
quantum-related config information to the sensor ACON.
- sensor_source: specifies the source system of the sensor, e.g.
sap_b4, sap_bw, delta_table, kafka, lmu_delta_table, trigger_file, etc.
It is also part of the heartbeat control table, so it is useful for
filtering data from the Heartbeat control table based on the template source system.
- data_format: format of the input source, e.g. jdbc, delta, kafka, cloudfiles, etc.
- heartbeat_sensor_db_table: heartbeat control table along
with database from config.
- lakehouse_engine_sensor_db_table: control table along with database (config).
- options: dict with other relevant options for reading data from the specified input
data_format. This can vary for each source system.
E.g., for SAP systems, DRIVER, URL, USERNAME and PASSWORD are required, which are
all read from the quantum config file.
- jdbc_db_table: schema and table name of JDBC sources.
- token: token to access the Databricks Job API (read from config).
- domain: workspace domain url for quantum (read from config).
- base_checkpoint_location: checkpoint location for streaming sources (from config).
- kafka_configs: configs required for kafka. It is read from config as JSON.
Config hierarchy is [sensor_kafka --> --> main kafka options].
- kafka_secret_scope: secret scope for kafka (read from config).
- base_trigger_file_location: location where all the trigger files are being
created (read from config).
- schema_dict: dict representation of schema of the trigger file (e.g. Spark struct
type schema).
"""
sensor_source: str
data_format: str
heartbeat_sensor_db_table: str
lakehouse_engine_sensor_db_table: str
token: str
domain: str
options: Optional[dict]
jdbc_db_table: Optional[str]
base_checkpoint_location: Optional[str]
kafka_configs: Optional[dict]
kafka_secret_scope: Optional[str]
base_trigger_file_location: Optional[str]
schema_dict: Optional[dict]
@classmethod
def create_from_acon(cls, acon: dict): # type: ignore
"""Create HeartbeatConfigSpec from acon.
Args:
acon: Heartbeat ACON.
"""
return cls(
sensor_source=acon["sensor_source"],
data_format=acon["data_format"],
heartbeat_sensor_db_table=acon["heartbeat_sensor_db_table"],
lakehouse_engine_sensor_db_table=acon["lakehouse_engine_sensor_db_table"],
token=acon["token"],
domain=acon["domain"],
options=acon.get("options"),
jdbc_db_table=acon.get("jdbc_db_table"),
base_checkpoint_location=acon.get("base_checkpoint_location"),
kafka_configs=acon.get("kafka_configs"),
kafka_secret_scope=acon.get("kafka_secret_scope"),
base_trigger_file_location=acon.get("base_trigger_file_location"),
schema_dict=acon.get("schema_dict"),
)
class HeartbeatSensorSource(Enum):
"""Formats of algorithm input."""
SAP_BW = "sap_bw"
SAP_B4 = "sap_b4"
DELTA_TABLE = "delta_table"
KAFKA = "kafka"
LMU_DELTA_TABLE = "lmu_delta_table"
TRIGGER_FILE = "trigger_file"
@classmethod
def values(cls): # type: ignore
"""Generates a list containing all enum values.
Returns:
A list with all enum values.
"""
return (c.value for c in cls)
class HeartbeatStatus(Enum):
"""Status for a sensor."""
NEW_EVENT_AVAILABLE = "NEW_EVENT_AVAILABLE"
IN_PROGRESS = "IN_PROGRESS"
COMPLETED = "COMPLETED"
HEARTBEAT_SENSOR_UPDATE_SET: dict = {
"target.sensor_source": "src.sensor_source",
"target.sensor_id": "src.sensor_id",
"target.asset_description": "src.asset_description",
"target.upstream_key": "src.upstream_key",
"target.preprocess_query": "src.preprocess_query",
"target.latest_event_fetched_timestamp": "src.latest_event_fetched_timestamp",
"target.trigger_job_id": "src.trigger_job_id",
"target.trigger_job_name": "src.trigger_job_name",
"target.status": "src.status",
"target.status_change_timestamp": "src.status_change_timestamp",
"target.job_start_timestamp": "src.job_start_timestamp",
"target.job_end_timestamp": "src.job_end_timestamp",
"target.job_state": "src.job_state",
"target.dependency_flag": "src.dependency_flag",
"target.sensor_read_type": "src.sensor_read_type",
}
TABLE_MANAGER_OPERATIONS = {
"compute_table_statistics": {"table_or_view": {"type": "str", "mandatory": True}},
"create_table": {
"path": {"type": "str", "mandatory": True},
"disable_dbfs_retry": {"type": "bool", "mandatory": False},
"delimiter": {"type": "str", "mandatory": False},
"advanced_parser": {"type": "bool", "mandatory": False},
},
"create_tables": {
"path": {"type": "str", "mandatory": True},
"disable_dbfs_retry": {"type": "bool", "mandatory": False},
"delimiter": {"type": "str", "mandatory": False},
"advanced_parser": {"type": "bool", "mandatory": False},
},
"create_view": {
"path": {"type": "str", "mandatory": True},
"disable_dbfs_retry": {"type": "bool", "mandatory": False},
"delimiter": {"type": "str", "mandatory": False},
"advanced_parser": {"type": "bool", "mandatory": False},
},
"drop_table": {"table_or_view": {"type": "str", "mandatory": True}},
"drop_view": {"table_or_view": {"type": "str", "mandatory": True}},
"execute_sql": {
"sql": {"type": "str", "mandatory": True},
"delimiter": {"type": "str", "mandatory": False},
"advanced_parser": {"type": "bool", "mandatory": False},
},
"truncate": {"table_or_view": {"type": "str", "mandatory": True}},
"vacuum": {
"table_or_view": {"type": "str", "mandatory": False},
"path": {"type": "str", "mandatory": False},
"vacuum_hours": {"type": "int", "mandatory": False},
},
"describe": {"table_or_view": {"type": "str", "mandatory": True}},
"optimize": {
"table_or_view": {"type": "str", "mandatory": False},
"path": {"type": "str", "mandatory": False},
"where_clause": {"type": "str", "mandatory": False},
"optimize_zorder_col_list": {"type": "str", "mandatory": False},
},
"show_tbl_properties": {"table_or_view": {"type": "str", "mandatory": True}},
"get_tbl_pk": {"table_or_view": {"type": "str", "mandatory": True}},
"repair_table": {
"table_or_view": {"type": "str", "mandatory": True},
"sync_metadata": {"type": "bool", "mandatory": True},
},
"delete_where": {
"table_or_view": {"type": "str", "mandatory": True},
"where_clause": {"type": "str", "mandatory": True},
},
}
FILE_MANAGER_OPERATIONS = {
"delete_objects": {
"bucket": {"type": "str", "mandatory": True},
"object_paths": {"type": "list", "mandatory": True},
"dry_run": {"type": "bool", "mandatory": True},
},
"copy_objects": {
"bucket": {"type": "str", "mandatory": True},
"source_object": {"type": "str", "mandatory": True},
"destination_bucket": {"type": "str", "mandatory": True},
"destination_object": {"type": "str", "mandatory": True},
"dry_run": {"type": "bool", "mandatory": True},
},
"move_objects": {
"bucket": {"type": "str", "mandatory": True},
"source_object": {"type": "str", "mandatory": True},
"destination_bucket": {"type": "str", "mandatory": True},
"destination_object": {"type": "str", "mandatory": True},
"dry_run": {"type": "bool", "mandatory": True},
},
"request_restore": {
"bucket": {"type": "str", "mandatory": True},
"source_object": {"type": "str", "mandatory": True},
"restore_expiration": {"type": "int", "mandatory": True},
"retrieval_tier": {"type": "str", "mandatory": True},
"dry_run": {"type": "bool", "mandatory": True},
},
"check_restore_status": {
"bucket": {"type": "str", "mandatory": True},
"source_object": {"type": "str", "mandatory": True},
},
"request_restore_to_destination_and_wait": {
"bucket": {"type": "str", "mandatory": True},
"source_object": {"type": "str", "mandatory": True},
"destination_bucket": {"type": "str", "mandatory": True},
"destination_object": {"type": "str", "mandatory": True},
"restore_expiration": {"type": "int", "mandatory": True},
"retrieval_tier": {"type": "str", "mandatory": True},
"dry_run": {"type": "bool", "mandatory": True},
},
}
================================================
FILE: lakehouse_engine/core/exec_env.py
================================================
"""Module to take care of creating a singleton of the execution environment class."""
from dataclasses import replace
from pyspark.sql import DataFrame, SparkSession
from lakehouse_engine.core.definitions import EngineConfig
from lakehouse_engine.utils.configs.config_utils import ConfigUtils
from lakehouse_engine.utils.databricks_utils import DatabricksUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
class ExecEnv(object):
"""Represents the basic resources regarding the engine execution environment.
Currently, it is used to encapsulate both the logic to get the Spark
session and the engine configurations.
"""
SESSION: SparkSession
_LOGGER = LoggingHandler(__name__).get_logger()
ENGINE_CONFIG: EngineConfig = EngineConfig(**ConfigUtils.get_config())
IS_SERVERLESS = DatabricksUtils.is_serverless_workload()
@classmethod
def set_default_engine_config(
cls,
package: str = "lakehouse_engine.configs",
custom_configs_dict: dict = None,
custom_configs_file_path: str = None,
) -> None:
"""Set default engine configurations.
The function sets the default engine configurations by reading
them from a specified package and overwrites them if the user
passes a dictionary or a file path with new configurations.
Args:
package: package where the engine default configurations can be found.
custom_configs_dict: a dictionary with custom configurations
to overwrite the default ones.
custom_configs_file_path: path for the file with custom
configurations to overwrite the default ones.
"""
cls.ENGINE_CONFIG = EngineConfig(**ConfigUtils.get_config(package))
if custom_configs_dict:
cls.ENGINE_CONFIG = replace(cls.ENGINE_CONFIG, **custom_configs_dict)
if custom_configs_file_path:
cls.ENGINE_CONFIG = replace(
cls.ENGINE_CONFIG,
**ConfigUtils.get_config_from_file(custom_configs_file_path),
)
@classmethod
def get_or_create(
cls,
session: SparkSession = None,
enable_hive_support: bool = True,
app_name: str = None,
config: dict = None,
) -> None:
"""Get or create an execution environment session (currently Spark).
It instantiates a singleton session that can be accessed anywhere from the
lakehouse engine. By default, if there is an existing Spark Session in
the environment (getActiveSession()), this function re-uses it. It can
be further extended in the future to support forcing the creation of new
isolated sessions even when a Spark Session is already active.
Args:
session: spark session.
enable_hive_support: whether to enable hive support or not.
app_name: application name.
config: extra spark configs to supply to the spark session.
"""
if not cls.IS_SERVERLESS:
default_config = {
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.sql.adaptive.enabled": True,
"spark.databricks.delta.merge.enableLowShuffle": True,
}
cls._LOGGER.info(
f"Using the following default configs you may want to override them "
f"for your job: {default_config}"
)
else:
default_config = {}
final_config: dict = {**default_config, **(config if config else {})}
cls._LOGGER.info(f"Final config is: {final_config}")
if session:
cls.SESSION = session
elif SparkSession.getActiveSession():
cls.SESSION = SparkSession.getActiveSession()
cls._set_spark_configs(final_config)
else:
cls._LOGGER.info("Creating a new Spark Session")
session_builder = SparkSession.builder.appName(app_name)
cls._set_spark_configs(final_config, session_builder)
if enable_hive_support:
session_builder = session_builder.enableHiveSupport()
cls.SESSION = session_builder.getOrCreate()
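# Example (illustrative sketch): bootstrapping the engine session at the start of a
# job, overriding one of the default configs listed above. The app name and config
# value are hypothetical.
#
#   ExecEnv.get_or_create(
#       app_name="my_lakehouse_job",
#       config={"spark.sql.shuffle.partitions": "200"},
#   )
#   spark = ExecEnv.SESSION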
@classmethod
def get_for_each_batch_session(cls, df: DataFrame) -> None:
"""Get the execution environment session for foreachBatch operations.
For Spark connect scenarios, spark is not able to re-use the Spark session
from an external scope as it cannot serialise it, so the session
needs to be retrieved and stored again in the ExecEnv class.
"""
cls.SESSION = df.sparkSession.getActiveSession()
@classmethod
def _set_spark_configs(
cls, final_config: dict, session_builder: SparkSession.Builder = None
) -> None:
"""Set Spark session configurations based on final_config.
This method attempts to set each configuration key-value pair in the provided
final_config dictionary to the Spark session. If a configuration key is not
available in the current environment, it logs a warning and skips that key.
Args:
final_config: dictionary with spark configurations to set.
session_builder: spark session builder.
"""
for key, value in final_config.items():
try:
if session_builder:
session_builder.config(key, value)
else:
cls.SESSION.conf.set(key, value)
except Exception as e:
if (
"[CONFIG_NOT_AVAILABLE]" in str(e)
and not ExecEnv.ENGINE_CONFIG.raise_on_config_not_available
):
cls._LOGGER.warning(
f"Spark config '{key}' is not available in this "
f"environment and will be skipped."
)
else:
raise e
@classmethod
def get_environment(cls) -> str:
"""Get the environment where the process is running.
Returns:
Name of the environment.
"""
if cls.ENGINE_CONFIG.environment:
return cls.ENGINE_CONFIG.environment
catalog = cls.SESSION.sql("SELECT current_catalog()").collect()[0][0]
if catalog.lower() == cls.ENGINE_CONFIG.prod_catalog:
return "prod"
else:
return "dev"
================================================
FILE: lakehouse_engine/core/executable.py
================================================
"""Module representing an executable lakehouse engine component."""
from abc import ABC, abstractmethod
from typing import Any, Optional
class Executable(ABC):
"""Abstract class defining the behaviour of an executable component."""
@abstractmethod
def execute(self) -> Optional[Any]:
"""Define the executable component behaviour.
E.g., the behaviour of an algorithm inheriting from this.
"""
pass
================================================
FILE: lakehouse_engine/core/file_manager.py
================================================
"""Module for abstract representation of a file manager system."""
from abc import ABC, abstractmethod
from typing import Any
from lakehouse_engine.algorithms.exceptions import RestoreTypeNotFoundException
from lakehouse_engine.utils.storage.file_storage_functions import FileStorageFunctions
class FileManager(ABC): # noqa: B024
"""Abstract file manager class.
{{ get_file_manager_operations() }}
"""
def __init__(self, configs: dict):
"""Construct FileManager algorithm instances.
Args:
configs: configurations for the FileManager algorithm.
"""
self.configs = configs
self.function = self.configs["function"]
@abstractmethod
def delete_objects(self) -> None:
"""Delete objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be deleted based on the given keys.
"""
pass
@abstractmethod
def copy_objects(self) -> None:
"""Copies objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be copied based on the given keys.
"""
pass
@abstractmethod
def move_objects(self) -> None:
"""Moves objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be moved based on the given keys.
"""
pass
class FileManagerFactory(ABC): # noqa: B024
"""Class for file manager factory."""
@staticmethod
def execute_function(configs: dict) -> Any:
"""Get a specific File Manager and function to execute."""
from lakehouse_engine.core.dbfs_file_manager import DBFSFileManager
from lakehouse_engine.core.s3_file_manager import S3FileManager
disable_dbfs_retry = (
configs["disable_dbfs_retry"]
if "disable_dbfs_retry" in configs.keys()
else False
)
if disable_dbfs_retry:
S3FileManager(configs).get_function()
elif FileStorageFunctions.is_boto3_configured():
try:
S3FileManager(configs).get_function()
except (ValueError, NotImplementedError, RestoreTypeNotFoundException):
raise
except Exception:
DBFSFileManager(configs).get_function()
else:
DBFSFileManager(configs).get_function()
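# Example (illustrative sketch): dispatching a copy operation through the factory.
# The bucket and object paths are hypothetical; the keys follow the
# FILE_MANAGER_OPERATIONS parameter definitions in core/definitions.py. With
# dry_run=True the selected file manager only prints the paths it would copy.
#
#   FileManagerFactory.execute_function(
#       {
#           "function": "copy_objects",
#           "bucket": "my-source-bucket",
#           "source_object": "bronze/sales/",
#           "destination_bucket": "my-destination-bucket",
#           "destination_object": "backup/sales/",
#           "dry_run": True,
#       }
#   )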
================================================
FILE: lakehouse_engine/core/gab_manager.py
================================================
"""Module to define GAB Manager classes."""
import calendar
from datetime import datetime, timedelta
from typing import Tuple, cast
import pendulum
from pendulum import DateTime
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import GABCadence, GABDefaults
from lakehouse_engine.core.gab_sql_generator import GABViewGenerator
from lakehouse_engine.utils.gab_utils import GABUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
class GABCadenceManager(object):
"""Class to control the GAB Cadence Window."""
_LOGGER = LoggingHandler(__name__).get_logger()
def extended_window_calculator(
self,
cadence: str,
reconciliation_cadence: str,
current_date: datetime,
start_date_str: str,
end_date_str: str,
query_type: str,
rerun_flag: str,
snapshot_flag: str,
) -> tuple[datetime, datetime, datetime, datetime]:
"""extended_window_calculator function.
Calculates the extended window of any cadence despite the user providing
custom dates which are not the exact start and end dates of a cadence.
Args:
cadence: cadence to process
reconciliation_cadence: reconciliation to process.
current_date: current date.
start_date_str: start date of the period to process.
end_date_str: end date of the period to process.
query_type: use case query type.
rerun_flag: flag indicating if it's a rerun or a normal run.
snapshot_flag: flag indicating if for this cadence the snapshot is enabled.
"""
cad_order = GABCadence.get_ordered_cadences()
derived_cadence = self._get_reconciliation_cadence(
cad_order, rerun_flag, cadence, reconciliation_cadence, snapshot_flag
)
self._LOGGER.info(f"cadence passed to extended window: {derived_cadence}")
start_date = datetime.strptime(start_date_str, GABDefaults.DATE_FORMAT.value)
end_date = datetime.strptime(end_date_str, GABDefaults.DATE_FORMAT.value)
bucket_start_date, bucket_end_date = self.get_cadence_start_end_dates(
cadence, derived_cadence, start_date, end_date, query_type, current_date
)
self._LOGGER.info(f"bucket dates: {bucket_start_date} - {bucket_end_date}")
filter_start_date, filter_end_date = self.get_cadence_start_end_dates(
cadence,
(
reconciliation_cadence
if cad_order[cadence] < cad_order[reconciliation_cadence]
else cadence
),
start_date,
end_date,
query_type,
current_date,
)
self._LOGGER.info(f"filter dates: {filter_start_date} - {filter_end_date}")
return bucket_start_date, bucket_end_date, filter_start_date, filter_end_date
@classmethod
def _get_reconciliation_cadence(
cls,
cadence_order: dict,
rerun_flag: str,
cadence: str,
reconciliation_cadence: str,
snapshot_flag: str,
) -> str:
"""Get bigger cadence when rerun_flag or snapshot.
Args:
cadence_order: ordered cadences.
rerun_flag: flag indicating if it's a rerun or a normal run.
cadence: cadence to process.
reconciliation_cadence: reconciliation to process.
snapshot_flag: flag indicating if for this cadence the snapshot is enabled.
"""
derived_cadence = reconciliation_cadence
if rerun_flag == "Y":
if cadence_order[cadence] > cadence_order[reconciliation_cadence]:
derived_cadence = cadence
elif cadence_order[cadence] < cadence_order[reconciliation_cadence]:
derived_cadence = reconciliation_cadence
else:
if (
cadence_order[cadence] > cadence_order[reconciliation_cadence]
and snapshot_flag == "Y"
) or (cadence_order[cadence] < cadence_order[reconciliation_cadence]):
derived_cadence = reconciliation_cadence
elif (
cadence_order[cadence] > cadence_order[reconciliation_cadence]
and snapshot_flag == "N"
):
derived_cadence = cadence
return derived_cadence
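# Worked example (illustrative): for a normal run (rerun_flag="N") with
# cadence="MONTH" and reconciliation_cadence="WEEK", the cadence is the bigger of
# the two, so the snapshot flag decides the outcome:
#
#   GABCadenceManager._get_reconciliation_cadence(
#       cadence_order=GABCadence.get_ordered_cadences(),
#       rerun_flag="N",
#       cadence="MONTH",
#       reconciliation_cadence="WEEK",
#       snapshot_flag="Y",
#   )  # -> "WEEK"; with snapshot_flag="N" it would return "MONTH".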
def get_cadence_start_end_dates(
self,
cadence: str,
derived_cadence: str,
start_date: datetime,
end_date: datetime,
query_type: str,
current_date: datetime,
) -> tuple[datetime, datetime]:
"""Generate the new set of extended start and end dates based on the cadence.
The week cadence is run again to extend to the correct week start and end dates
when a recon window is present for the Week cadence.
For example, for end_date 2022-12-31, when a Quarter recon window is present for the
Week cadence, the start and end dates are recalculated to 2022-10-01 and 2022-12-31.
But these are not the start and end dates of a week. Hence, to correct this, the new
dates are passed again to get the correct dates.
Args:
cadence: cadence to process.
derived_cadence: cadence reconciliation to process.
start_date: start date of the period to process.
end_date: end date of the period to process.
query_type: use case query type.
current_date: current date to be used in the end date, in case the end date
is greater than current date so the end date should be the current date.
"""
new_start_date = self._get_cadence_calculated_date(
derived_cadence=derived_cadence, base_date=start_date, is_start=True
)
new_end_date = self._get_cadence_calculated_date(
derived_cadence=derived_cadence, base_date=end_date, is_start=False
)
if cadence.upper() == "WEEK":
new_start_date = (
pendulum.datetime(
int(new_start_date.strftime("%Y")),
int(new_start_date.strftime("%m")),
int(new_start_date.strftime("%d")),
)
.start_of("week")
.replace(tzinfo=None)
)
new_end_date = (
pendulum.datetime(
int(new_end_date.strftime("%Y")),
int(new_end_date.strftime("%m")),
int(new_end_date.strftime("%d")),
)
.end_of("week")
.replace(hour=0, minute=0, second=0, microsecond=0)
.replace(tzinfo=None)
)
new_end_date = new_end_date + timedelta(days=1)
if new_end_date >= current_date:
new_end_date = current_date
if query_type == "NAM":
new_end_date = new_end_date + timedelta(days=1)
return new_start_date, new_end_date
@classmethod
def _get_cadence_calculated_date(
cls, derived_cadence: str, base_date: datetime, is_start: bool
) -> datetime | DateTime: # type: ignore
cadence_base_date = cls._get_cadence_base_date(derived_cadence, base_date)
cadence_date_calculated: DateTime | datetime
if derived_cadence.upper() == "WEEK":
cadence_date_calculated = cls._get_calculated_week_date(
cast(DateTime, cadence_base_date), is_start
)
elif derived_cadence.upper() == "MONTH":
cadence_date_calculated = cls._get_calculated_month_date(
cast(datetime, cadence_base_date), is_start
)
elif derived_cadence.upper() in ["QUARTER", "YEAR"]:
cadence_date_calculated = cls._get_calculated_quarter_or_year_date(
cast(DateTime, cadence_base_date), is_start, derived_cadence
)
else:
cadence_date_calculated = cadence_base_date # type: ignore
return cadence_date_calculated # type: ignore
@classmethod
def _get_cadence_base_date(
cls, derived_cadence: str, base_date: datetime
) -> datetime | DateTime | str: # type: ignore
"""Get start date for the selected cadence.
Args:
derived_cadence: cadence reconciliation to process.
base_date: base date used to compute the start date of the cadence.
"""
if derived_cadence.upper() in ["DAY", "MONTH"]:
cadence_date_calculated = base_date
elif derived_cadence.upper() in ["WEEK", "QUARTER", "YEAR"]:
cadence_date_calculated = pendulum.datetime(
int(base_date.strftime("%Y")),
int(base_date.strftime("%m")),
int(base_date.strftime("%d")),
)
else:
cadence_date_calculated = "0" # type: ignore
return cadence_date_calculated
@classmethod
def _get_calculated_week_date(
cls, cadence_date_calculated: DateTime, is_start: bool
) -> DateTime:
"""Get WEEK start/end date.
Args:
cadence_date_calculated: base date to compute the week date.
is_start: flag indicating if we should get the start or end for the cadence.
"""
if is_start:
cadence_date_calculated = cadence_date_calculated.start_of("week").replace(
tzinfo=None
)
else:
cadence_date_calculated = (
cadence_date_calculated.end_of("week")
.replace(hour=0, minute=0, second=0, microsecond=0)
.replace(tzinfo=None)
)
return cadence_date_calculated
@classmethod
def _get_calculated_month_date(
cls, cadence_date_calculated: datetime, is_start: bool
) -> datetime:
"""Get MONTH start/end date.
Args:
cadence_date_calculated: base date to compute the month date.
is_start: flag indicating if we should get the start or end for the cadence.
"""
if is_start:
cadence_date_calculated = cadence_date_calculated - timedelta(
days=(int(cadence_date_calculated.strftime("%d")) - 1)
)
else:
cadence_date_calculated = datetime(
int(cadence_date_calculated.strftime("%Y")),
int(cadence_date_calculated.strftime("%m")),
calendar.monthrange(
int(cadence_date_calculated.strftime("%Y")),
int(cadence_date_calculated.strftime("%m")),
)[1],
)
return cadence_date_calculated
@classmethod
def _get_calculated_quarter_or_year_date(
cls, cadence_date_calculated: DateTime, is_start: bool, cadence: str
) -> DateTime:
"""Get QUARTER/YEAR start/end date.
Args:
cadence_date_calculated: base date to compute the quarter/year date.
is_start: flag indicating if we should get the start or end for the cadence.
cadence: selected cadence (possible values: QUARTER or YEAR).
"""
if is_start:
cadence_date_calculated = cadence_date_calculated.first_of(
cadence.lower()
).replace(tzinfo=None)
else:
cadence_date_calculated = cadence_date_calculated.last_of(
cadence.lower()
).replace(tzinfo=None)
return cadence_date_calculated
class GABViewManager(object):
"""Class to control the GAB View creation."""
_LOGGER = LoggingHandler(__name__).get_logger()
def __init__(
self,
query_id: str,
lookup_query_builder: DataFrame,
target_database: str,
target_table: str,
):
"""Construct GABViewManager instances.
Args:
query_id: gab configuration table use case identifier.
lookup_query_builder: gab configuration data.
target_database: target database to write.
target_table: target table to write.
"""
self.query_id = query_id
self.lookup_query_builder = lookup_query_builder
self.target_database = target_database
self.target_table = target_table
def generate_use_case_views(self) -> None:
"""Generate all the use case views.
Generates the DDLs for each of the views. This DDL is dynamically built based on
the mappings provided in the config table.
"""
reconciliation_window = GABUtils.get_json_column_as_dict(
self.lookup_query_builder, self.query_id, "recon_window"
)
cadence_snapshot_status = self._get_cadence_snapshot_status(
reconciliation_window
)
(
cadences_with_snapshot,
cadences_without_snapshot,
) = self._split_cadence_by_snapshot(cadence_snapshot_status)
mappings = GABUtils.get_json_column_as_dict(
self.lookup_query_builder, self.query_id, "mappings"
)
for view_name in mappings.keys():
self._generate_use_case_view(
mappings,
view_name,
cadence_snapshot_status,
cadences_with_snapshot,
cadences_without_snapshot,
self.target_database,
self.target_table,
self.query_id,
)
@classmethod
def _generate_use_case_view(
cls,
mappings: dict,
view_name: str,
cadence_snapshot_status: dict,
cadences_with_snapshot: list[str],
cadences_without_snapshot: list[str],
target_database: str,
target_table: str,
query_id: str,
) -> None:
"""Generate the selected use case views.
Args:
mappings: use case mappings configuration.
view_name: name of the view to be generated.
cadence_snapshot_status: cadences to execute, with the information of
whether each has a snapshot.
cadences_with_snapshot: cadences to execute with snapshot.
cadences_without_snapshot: cadences to execute without snapshot.
target_database: target database to write.
target_table: target table to write.
query_id: gab configuration table use case identifier.
"""
view_configuration = mappings[view_name]
view_dimensions = view_configuration["dimensions"]
view_metrics = view_configuration["metric"]
custom_filter = view_configuration["filter"]
view_filter = " "
if custom_filter:
view_filter = " AND " + custom_filter
(
dimensions,
dimensions_and_metrics,
dimensions_and_metrics_with_alias,
) = cls._get_dimensions_and_metrics_from_use_case_view(
view_dimensions, view_metrics
)
(
final_cols,
final_calculated_script,
final_calculated_script_snapshot,
) = cls._get_calculated_and_derived_metrics_from_use_case_view(
view_metrics, view_dimensions, cadence_snapshot_status
)
GABViewGenerator(
cadence_snapshot_status=cadence_snapshot_status,
target_database=target_database,
view_name=view_name,
final_cols=final_cols,
target_table=target_table,
dimensions_and_metrics_with_alias=dimensions_and_metrics_with_alias,
dimensions=dimensions,
dimensions_and_metrics=dimensions_and_metrics,
final_calculated_script=final_calculated_script,
query_id=query_id,
view_filter=view_filter,
final_calculated_script_snapshot=final_calculated_script_snapshot,
without_snapshot_cadences=cadences_without_snapshot,
with_snapshot_cadences=cadences_with_snapshot,
).generate_sql()
@classmethod
def _get_dimensions_and_metrics_from_use_case_view(
cls, view_dimensions: dict, view_metrics: dict
) -> Tuple[str, str, str]:
"""Get dimensions and metrics from use case.
Args:
view_dimensions: use case configured dimensions.
view_metrics: use case configured metrics.
"""
(
extracted_dimensions_with_alias,
extracted_dimensions_without_alias,
) = GABUtils.extract_columns_from_mapping(
columns=view_dimensions,
is_dimension=True,
extract_column_without_alias=True,
table_alias="a",
is_extracted_value_as_name=False,
)
dimensions_without_default_columns = [
extracted_dimension
for extracted_dimension in extracted_dimensions_without_alias
if extracted_dimension not in GABDefaults.DIMENSIONS_DEFAULT_COLUMNS.value
]
dimensions = ",".join(dimensions_without_default_columns)
dimensions_with_alias = ",".join(extracted_dimensions_with_alias)
(
extracted_metrics_with_alias,
extracted_metrics_without_alias,
) = GABUtils.extract_columns_from_mapping(
columns=view_metrics,
is_dimension=False,
extract_column_without_alias=True,
table_alias="a",
is_extracted_value_as_name=False,
)
metrics = ",".join(extracted_metrics_without_alias)
metrics_with_alias = ",".join(extracted_metrics_with_alias)
dimensions_and_metrics_with_alias = (
dimensions_with_alias + "," + metrics_with_alias
)
dimensions_and_metrics = dimensions + "," + metrics
return dimensions, dimensions_and_metrics, dimensions_and_metrics_with_alias
@classmethod
def _get_calculated_and_derived_metrics_from_use_case_view(
cls, view_metrics: dict, view_dimensions: dict, cadence_snapshot_status: dict
) -> Tuple[str, str, str]:
"""Get calculated and derived metrics from use case.
Args:
view_dimensions: use case configured dimensions.
view_metrics: use case configured metrics.
cadence_snapshot_status: cadences to execute, with the information of
whether each has a snapshot.
"""
calculated_script = []
calculated_script_snapshot = []
derived_script = []
for metric_key, metric_value in view_metrics.items():
(
calculated_metrics_script,
calculated_metrics_script_snapshot,
derived_metrics_script,
) = cls._get_calculated_metrics(
metric_key, metric_value, view_dimensions, cadence_snapshot_status
)
calculated_script += [*calculated_metrics_script]
calculated_script_snapshot += [*calculated_metrics_script_snapshot]
derived_script += [*derived_metrics_script]
joined_calculated_script = cls._join_list_to_string_when_present(
calculated_script
)
joined_calculated_script_snapshot = cls._join_list_to_string_when_present(
calculated_script_snapshot
)
joined_derived = cls._join_list_to_string_when_present(
to_join=derived_script, starting_value="*,", default_value="*"
)
return (
joined_derived,
joined_calculated_script,
joined_calculated_script_snapshot,
)
@classmethod
def _join_list_to_string_when_present(
cls,
to_join: list[str],
separator: str = ",",
starting_value: str = ",",
default_value: str = "",
) -> str:
"""Join list to string when has values, otherwise return the default value.
Args:
to_join: values to join.
separator: separator to be used in the join.
starting_value: value to be prepended to the joined string.
default_value: value to be returned if the list is empty.
"""
return starting_value + separator.join(to_join) if to_join else default_value
@classmethod
def _get_cadence_snapshot_status(cls, result: dict) -> dict:
cadence_snapshot_status = {}
for k, v in result.items():
cadence_snapshot_status[k] = next(
(
next(
(
snap_list["snapshot"]
for snap_list in loop_outer_cad.values()
if snap_list["snapshot"] == "Y"
),
"N",
)
for loop_outer_cad in v.values()
if v
),
"N",
)
return cadence_snapshot_status
@classmethod
def _split_cadence_by_snapshot(
cls, cadence_snapshot_status: dict
) -> tuple[list[str], list[str]]:
"""Split cadences by the snapshot value.
Args:
cadence_snapshot_status: cadences to be split by snapshot status.
"""
with_snapshot_cadences = []
without_snapshot_cadences = []
for key_snap_status, value_snap_status in cadence_snapshot_status.items():
if value_snap_status == "Y":
with_snapshot_cadences.append(key_snap_status)
else:
without_snapshot_cadences.append(key_snap_status)
return with_snapshot_cadences, without_snapshot_cadences
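# Hypothetical example (values invented for illustration): given a status map
# like {"DAY": "N", "WEEK": "N", "MONTH": "Y"}, _split_cadence_by_snapshot
# returns (["MONTH"], ["DAY", "WEEK"]): cadences with snapshot first, followed
# by cadences without snapshot.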
@classmethod
def _get_calculated_metrics(
cls,
metric_key: str,
metric_value: dict,
view_dimensions: dict,
cadence_snapshot_status: dict,
) -> tuple[list[str], list[str], list[str]]:
"""Get calculated metrics from use case.
Args:
metric_key: use case metric name.
metric_value: use case metric value.
view_dimensions: use case configured dimensions.
cadence_snapshot_status: cadences to execute, with the information of
whether each has a snapshot.
"""
dim_partition = ",".join([str(i) for i in view_dimensions.keys()][2:])
dim_partition = "cadence," + dim_partition
calculated_metrics = metric_value["calculated_metric"]
derived_metrics = metric_value["derived_metric"]
calculated_metrics_script: list[str] = []
calculated_metrics_script_snapshot: list[str] = []
derived_metrics_script: list[str] = []
if calculated_metrics:
(
calculated_metrics_script,
calculated_metrics_script_snapshot,
) = cls._get_calculated_metric(
metric_key, calculated_metrics, dim_partition, cadence_snapshot_status
)
if derived_metrics:
derived_metrics_script = cls._get_derived_metrics(derived_metrics)
return (
calculated_metrics_script,
calculated_metrics_script_snapshot,
derived_metrics_script,
)
@classmethod
def _get_derived_metrics(cls, derived_metric: dict) -> list[str]:
"""Get derived metrics from use case.
Args:
derived_metric: use case derived metrics.
"""
derived_metric_script = []
for i in range(0, len(derived_metric)):
derived_formula = str(derived_metric[i]["formula"])
derived_label = derived_metric[i]["label"]
derived_metric_script.append(derived_formula + " AS " + derived_label)
return derived_metric_script
@classmethod
def _get_calculated_metric(
cls,
metric_key: str,
calculated_metric: dict,
dimension_partition: str,
cadence_snapshot_status: dict,
) -> tuple[list[str], list[str]]:
"""Get calculated metrics from use case.
Args:
metric_key: use case metric name.
calculated_metric: use case calculated metrics.
dimension_partition: dimension partition.
cadence_snapshot_status: cadences to execute, with the information of
whether each has a snapshot.
"""
last_cadence_script: list[str] = []
last_year_cadence_script: list[str] = []
window_script: list[str] = []
last_cadence_script_snapshot: list[str] = []
last_year_cadence_script_snapshot: list[str] = []
window_script_snapshot: list[str] = []
if "last_cadence" in calculated_metric:
(
last_cadence_script,
last_cadence_script_snapshot,
) = cls._get_cadence_calculated_metric(
metric_key,
dimension_partition,
calculated_metric,
cadence_snapshot_status,
"last_cadence",
)
if "last_year_cadence" in calculated_metric:
(
last_year_cadence_script,
last_year_cadence_script_snapshot,
) = cls._get_cadence_calculated_metric(
metric_key,
dimension_partition,
calculated_metric,
cadence_snapshot_status,
"last_year_cadence",
)
if "window_function" in calculated_metric:
window_script, window_script_snapshot = cls._get_window_calculated_metric(
metric_key,
dimension_partition,
calculated_metric,
cadence_snapshot_status,
)
calculated_script = [
*last_cadence_script,
*last_year_cadence_script,
*window_script,
]
calculated_script_snapshot = [
*last_cadence_script_snapshot,
*last_year_cadence_script_snapshot,
*window_script_snapshot,
]
return calculated_script, calculated_script_snapshot
@classmethod
def _get_window_calculated_metric(
cls,
metric_key: str,
dimension_partition: str,
calculated_metric: dict,
cadence_snapshot_status: dict,
) -> tuple[list, list]:
"""Get window calculated metrics from use case.
Args:
metric_key: use case metric name.
dimension_partition: dimension partition.
calculated_metric: use case calculated metrics.
cadence_snapshot_status: cadences to execute, with the information of
whether each has a snapshot.
"""
calculated_script = []
calculated_script_snapshot = []
for i in range(0, len(calculated_metric["window_function"])):
window_function = calculated_metric["window_function"][i]["agg_func"]
window_function_start = calculated_metric["window_function"][i]["window"][0]
window_function_end = calculated_metric["window_function"][i]["window"][1]
window_label = calculated_metric["window_function"][i]["label"]
calculated_script.append(
f"""
NVL(
{window_function}({metric_key}) OVER
(
PARTITION BY {dimension_partition}
order by from_date ROWS BETWEEN
{str(window_function_start)} PRECEDING
AND {str(window_function_end)} PRECEDING
),
0
) AS
{window_label}
"""
)
if "Y" in cadence_snapshot_status.values():
calculated_script_snapshot.append(
f"""
NVL(
{window_function}({metric_key}) OVER
(
PARTITION BY {dimension_partition} ,rn
order by from_date ROWS BETWEEN
{str(window_function_start)} PRECEDING
AND {str(window_function_end)} PRECEDING
),
0
) AS
{window_label}
"""
)
return calculated_script, calculated_script_snapshot
@classmethod
def _get_cadence_calculated_metric(
cls,
metric_key: str,
dimension_partition: str,
calculated_metric: dict,
cadence_snapshot_status: dict,
cadence: str,
) -> tuple[list, list]:
"""Get cadence calculated metrics from use case.
Args:
metric_key: use case metric name.
calculated_metric: use case calculated metrics.
dimension_partition: dimension partition.
cadence_snapshot_status: cadences to execute, with the information of
whether each has a snapshot.
cadence: cadence to process.
"""
calculated_script = []
calculated_script_snapshot = []
for i in range(0, len(calculated_metric[cadence])):
cadence_lag = cls._get_cadence_item_lag(calculated_metric, cadence, i)
cadence_label = calculated_metric[cadence][i]["label"]
calculated_script.append(
cls._get_cadence_lag_statement(
metric_key,
cadence_lag,
dimension_partition,
cadence_label,
snapshot=False,
cadence=cadence,
)
)
if "Y" in cadence_snapshot_status.values():
calculated_script_snapshot.append(
cls._get_cadence_lag_statement(
metric_key,
cadence_lag,
dimension_partition,
cadence_label,
snapshot=True,
cadence=cadence,
)
)
return calculated_script, calculated_script_snapshot
@classmethod
def _get_cadence_item_lag(
cls, calculated_metric: dict, cadence: str, item: int
) -> str:
"""Get calculated metric item lag.
Args:
calculated_metric: use case calculated metrics.
cadence: cadence to process.
item: metric item.
"""
return str(calculated_metric[cadence][item]["window"])
@classmethod
def _get_cadence_lag_statement(
cls,
metric_key: str,
cadence_lag: str,
dimension_partition: str,
cadence_label: str,
snapshot: bool,
cadence: str,
) -> str:
"""Get cadence lag statement.
Args:
metric_key: use case metric name.
cadence_lag: cadence window lag.
dimension_partition: dimension partition.
cadence_label: cadence name.
snapshot: indicate if the snapshot is enabled.
cadence: cadence to process.
"""
cadence_lag_statement = ""
if cadence == "last_cadence":
cadence_lag_statement = (
"NVL(LAG("
+ metric_key
+ ","
+ cadence_lag
+ ") OVER(PARTITION BY "
+ dimension_partition
+ (",rn" if snapshot else "")
+ " order by from_date),0) AS "
+ cadence_label
)
elif cadence == "last_year_cadence":
cadence_lag_statement = (
"NVL(LAG("
+ metric_key
+ ","
+ cadence_lag
+ ") OVER(PARTITION BY "
+ dimension_partition
+ (",rn" if snapshot else "")
+ """,
case
when cadence in ('DAY','MONTH','QUARTER')
then struct(month(from_date), day(from_date))
when cadence in('WEEK')
then struct(weekofyear(from_date+1),1)
end order by from_date),0) AS """
+ cadence_label
)
else:
cls._LOGGER.error(f"Cadence {cadence} not implemented yet")
return cadence_lag_statement
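# Minimal usage sketch (assumption, not part of the original module; database
# and table names are hypothetical): generating the consumption views for a
# use case that is already configured in the lookup table could look like this:
#   GABViewManager(
#       query_id="1",
#       lookup_query_builder=lookup_query_builder_df,  # gab configuration data
#       target_database="my_database",
#       target_table="gab_insights",
#   ).generate_use_case_views()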
================================================
FILE: lakehouse_engine/core/gab_sql_generator.py
================================================
"""Module to define GAB SQL classes."""
import ast
import json
from abc import ABC, abstractmethod
from typing import Any, Callable, Optional
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, struct, to_json
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.gab_utils import GABUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
def _execute_sql(func) -> Callable: # type: ignore
"""Execute the SQL resulting from the function.
This function is protected to be used just in this module.
It's used to decorate functions that return a SQL statement.
Args:
func: function that will return the SQL to execute.
"""
def inner(*args: Any) -> None:
generated_sql = func(*args)
if generated_sql:
ExecEnv.SESSION.sql(generated_sql)
return inner
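# Illustrative note (assumption): a generator method decorated with
# @_execute_sql only needs to return the SQL string; the decorator then runs it
# through ExecEnv.SESSION.sql, e.g.:
#   @_execute_sql
#   def generate_sql(self) -> Optional[str]:
#       return "SELECT 1"  # hypothetical statement, executed by the decorator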
class GABSQLGenerator(ABC):
"""Abstract class defining the behaviour of a GAB SQL Generator."""
@abstractmethod
def generate_sql(self) -> Optional[str]:
"""Define the generate sql command.
E.g., the behaviour of gab generate sql inheriting from this.
"""
pass
class GABInsertGenerator(GABSQLGenerator):
"""GAB insert generator.
Creates the insert statement based on the dimensions and metrics provided in
the configuration table.
"""
_LOGGER = LoggingHandler(__name__).get_logger()
def __init__(
self,
query_id: str,
cadence: str,
final_stage_table: str,
lookup_query_builder: DataFrame,
target_database: str,
target_table: str,
):
"""Construct GABInsertGenerator instances.
Args:
query_id: gab configuration table use case identifier.
cadence: inputted cadence to process.
final_stage_table: stage view name.
lookup_query_builder: gab configuration data.
target_database: target database to write.
target_table: target table to write.
"""
self.query_id = query_id
self.cadence = cadence
self.final_stage_table = final_stage_table
self.lookup_query_builder = lookup_query_builder
self.target_database = target_database
self.target_table = target_table
def generate_sql(self) -> Optional[str]:
"""Generate insert sql statement to the insights table."""
insert_sql_statement = self._insert_statement_generator()
return insert_sql_statement
def _insert_statement_generator(self) -> str:
"""Generate GAB insert statement.
Creates the insert statement based on the dimensions and metrics provided in
the configuration table.
"""
result = GABUtils.get_json_column_as_dict(
self.lookup_query_builder, self.query_id, "mappings"
)
for result_key in result.keys():
joined_dimensions, joined_metrics = self._get_mapping_columns(
mapping=result[result_key]
)
gen_ins = f"""
INSERT INTO {self.target_database}.{self.target_table}
SELECT
{self.query_id} as query_id,
'{self.cadence}' as cadence,
{joined_dimensions},
{joined_metrics},
current_timestamp() as lh_created_on
FROM {self.final_stage_table}
""" # nosec: B608
return gen_ins
@classmethod
def _get_mapping_columns(cls, mapping: dict) -> tuple[str, str]:
"""Get mapping columns(dimensions and metrics) as joined string.
Args:
mapping: use case mappings configuration.
"""
dimensions_mapping = mapping["dimensions"]
metrics_mapping = mapping["metric"]
joined_dimensions = cls._join_extracted_column_with_filled_columns(
columns=dimensions_mapping, is_dimension=True
)
joined_metrics = cls._join_extracted_column_with_filled_columns(
columns=metrics_mapping, is_dimension=False
)
return joined_dimensions, joined_metrics
@classmethod
def _join_extracted_column_with_filled_columns(
cls, columns: dict, is_dimension: bool
) -> str:
"""Join extracted columns with empty filled columns.
Args:
columns: use case columns and values.
is_dimension: flag identifying if it is a dimension or a metric.
"""
extracted_columns_with_alias = (
GABUtils.extract_columns_from_mapping( # type: ignore
columns=columns, is_dimension=is_dimension
)
)
filled_columns = cls._fill_empty_columns(
extracted_columns=extracted_columns_with_alias, # type: ignore
is_dimension=is_dimension,
)
joined_columns = [*extracted_columns_with_alias, *filled_columns]
return ",".join(joined_columns)
@classmethod
def _fill_empty_columns(
cls, extracted_columns: list[str], is_dimension: bool
) -> list[str]:
"""Fill empty columns as null.
As the data is expected to have 40 columns, we have to fill the unused columns.
Args:
extracted_columns: use case extracted columns.
is_dimension: flag identifying if it is a dimension or a metric.
"""
filled_columns = []
for ins in range(
(
len(extracted_columns) - 1
if is_dimension
else len(extracted_columns) + 1
),
41,
):
filled_columns.append(
" null as {}{}".format("d" if is_dimension else "m", ins)
)
return filled_columns
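# Worked example (hypothetical sizes, derived from the logic above): if three
# dimension columns were extracted, _fill_empty_columns pads the insert with
# "null as d2" ... "null as d40"; if two metric columns were extracted, it pads
# with "null as m3" ... "null as m40", so the generated SELECT always matches
# the 40-column layout expected by the insights table.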
class GABViewGenerator(GABSQLGenerator):
"""GAB view generator.
Creates the use case view statement to be consumed.
"""
_LOGGER = LoggingHandler(__name__).get_logger()
def __init__(
self,
cadence_snapshot_status: dict,
target_database: str,
view_name: str,
final_cols: str,
target_table: str,
dimensions_and_metrics_with_alias: str,
dimensions: str,
dimensions_and_metrics: str,
final_calculated_script: str,
query_id: str,
view_filter: str,
final_calculated_script_snapshot: str,
without_snapshot_cadences: list[str],
with_snapshot_cadences: list[str],
):
"""Construct GABViewGenerator instances.
Args:
cadence_snapshot_status: each cadence with the corresponding snapshot
status.
target_database: target database to write.
view_name: name of the view to be generated.
final_cols: columns to return in the view.
target_table: target table to write.
dimensions_and_metrics_with_alias: configured dimensions and metrics with
alias to compute in the view.
dimensions: use case configured dimensions.
dimensions_and_metrics: use case configured dimensions and metrics.
final_calculated_script: use case calculated metrics.
query_id: gab configuration table use case identifier.
view_filter: filter to add in the view.
final_calculated_script_snapshot: use case calculated metrics with snapshot.
without_snapshot_cadences: cadences without snapshot.
with_snapshot_cadences: cadences with snapshot.
"""
self.cadence_snapshot_status = cadence_snapshot_status
self.target_database = target_database
self.result_key = view_name
self.final_cols = final_cols
self.target_table = target_table
self.dimensions_and_metrics_with_alias = dimensions_and_metrics_with_alias
self.dimensions = dimensions
self.dimensions_and_metrics = dimensions_and_metrics
self.final_calculated_script = final_calculated_script
self.query_id = query_id
self.view_filter = view_filter
self.final_calculated_script_snapshot = final_calculated_script_snapshot
self.without_snapshot_cadences = without_snapshot_cadences
self.with_snapshot_cadences = with_snapshot_cadences
@_execute_sql
def generate_sql(self) -> Optional[str]:
"""Generate use case view sql statement."""
consumption_view_sql = self._create_consumption_view()
return consumption_view_sql
def _create_consumption_view(self) -> str:
"""Create consumption view."""
final_view_query = self._generate_consumption_view_statement(
self.cadence_snapshot_status,
self.target_database,
self.final_cols,
self.target_table,
self.dimensions_and_metrics_with_alias,
self.dimensions,
self.dimensions_and_metrics,
self.final_calculated_script,
self.query_id,
self.view_filter,
self.final_calculated_script_snapshot,
without_snapshot_cadences=",".join(
f'"{w}"' for w in self.without_snapshot_cadences
),
with_snapshot_cadences=",".join(
f'"{w}"' for w in self.with_snapshot_cadences
),
)
rendered_query = """
CREATE OR REPLACE VIEW {database}.{view_name} AS {final_view_query}
""".format(
database=self.target_database,
view_name=self.result_key,
final_view_query=final_view_query,
)
self._LOGGER.info(f"Consumption view statement: {rendered_query}")
return rendered_query
@classmethod
def _generate_consumption_view_statement(
cls,
cadence_snapshot_status: dict,
target_database: str,
final_cols: str,
target_table: str,
dimensions_and_metrics_with_alias: str,
dimensions: str,
dimensions_and_metrics: str,
final_calculated_script: str,
query_id: str,
view_filter: str,
final_calculated_script_snapshot: str,
without_snapshot_cadences: str,
with_snapshot_cadences: str,
) -> str:
"""Generate consumption view.
Args:
cadence_snapshot_status: cadences to execute, with the information of
whether each has a snapshot.
target_database: target database to write.
final_cols: use case columns exposed in the consumption view.
target_table: target table to write.
dimensions_and_metrics_with_alias: dimensions and metrics as string columns
with alias.
dimensions: dimensions as string columns.
dimensions_and_metrics: dimensions and metrics as string columns
without alias.
final_calculated_script: final calculated metrics script.
query_id: gab configuration table use case identifier.
view_filter: filter to execute on the view.
final_calculated_script_snapshot: final calculated metrics with snapshot
script.
without_snapshot_cadences: cadences without snapshot.
with_snapshot_cadences: cadences with snapshot.
"""
cls._LOGGER.info("Generating consumption view statement...")
cls._LOGGER.info(
f"""
{{
target_database: {target_database},
target_table: {target_table},
query_id: {query_id},
cadence_and_snapshot_status: {cadence_snapshot_status},
cadences_without_snapshot: [{without_snapshot_cadences}],
cadences_with_snapshot: [{with_snapshot_cadences}],
final_cols: {final_cols},
dimensions_and_metrics_with_alias: {dimensions_and_metrics_with_alias},
dimensions: {dimensions},
dimensions_with_metrics: {dimensions_and_metrics},
final_calculated_script: {final_calculated_script},
final_calculated_script_snapshot: {final_calculated_script_snapshot},
view_filter: {view_filter}
}}"""
)
if (
"Y" in cadence_snapshot_status.values()
and "N" in cadence_snapshot_status.values()
):
consumption_view_query = f"""
WITH TEMP1 AS (
SELECT
a.cadence,
{dimensions_and_metrics_with_alias}{final_calculated_script}
FROM {target_database}.{target_table} a
WHERE a.query_id = {query_id}
AND cadence IN ({without_snapshot_cadences})
{view_filter}
),
TEMP_RN AS (
SELECT
a.cadence,
a.from_date,
a.to_date,
{dimensions_and_metrics},
row_number() over(
PARTITION BY
a.cadence,
{dimensions},
a.from_date
order by to_date
) as rn
FROM {target_database}.{target_table} a
WHERE a.query_id = {query_id}
AND cadence IN ({with_snapshot_cadences})
{view_filter}
),
TEMP2 AS (
SELECT
a.cadence,
{dimensions_and_metrics_with_alias}{final_calculated_script_snapshot}
FROM TEMP_RN a
),
TEMP3 AS (SELECT * FROM TEMP1 UNION SELECT * from TEMP2)
SELECT {final_cols} FROM TEMP3
""" # nosec: B608
elif "N" in cadence_snapshot_status.values():
consumption_view_query = f"""
WITH TEMP1 AS (
SELECT
a.cadence,
{dimensions_and_metrics_with_alias}{final_calculated_script}
FROM {target_database}.{target_table} a
WHERE a.query_id = {query_id}
AND cadence IN ({without_snapshot_cadences}) {view_filter}
)
SELECT {final_cols} FROM TEMP1
""" # nosec: B608
else:
consumption_view_query = f"""
WITH TEMP_RN AS (
SELECT
a.cadence,
a.from_date,
a.to_date,
{dimensions_and_metrics},
row_number() over(
PARTITION BY
a.cadence,
a.from_date,
a.to_date,
{dimensions},
a.from_date
order by to_date) as rn
FROM {target_database}.{target_table} a
WHERE a.query_id = {query_id}
AND cadence IN ({with_snapshot_cadences})
{view_filter}
),
TEMP2 AS (
SELECT
a.cadence,
{dimensions_and_metrics_with_alias}{final_calculated_script_snapshot}
FROM TEMP_RN a
)
SELECT {final_cols} FROM TEMP2
""" # nosec: B608
return consumption_view_query
class GABDeleteGenerator(GABSQLGenerator):
"""GAB delete generator.
Creates the delete statement to clean the use case base data on the insights table.
"""
_LOGGER = LoggingHandler(__name__).get_logger()
def __init__(
self,
query_id: str,
cadence: str,
temp_stage_view_name: str,
lookup_query_builder: DataFrame,
target_database: str,
target_table: str,
):
"""Construct GABViewGenerator instances.
Args:
query_id: gab configuration table use case identifier.
cadence: inputted cadence to process.
temp_stage_view_name: stage view name.
lookup_query_builder: gab configuration data.
target_database: target database to write.
target_table: target table to write.
"""
self.query_id = query_id
self.cadence = cadence
self.temp_stage_view_name = temp_stage_view_name
self.lookup_query_builder = lookup_query_builder
self.target_database = target_database
self.target_table = target_table
@_execute_sql
def generate_sql(self) -> Optional[str]:
"""Generate delete sql statement.
This statement is to clean the insights table for the corresponding use case.
"""
delete_sql_statement = self._delete_statement_generator()
return delete_sql_statement
def _delete_statement_generator(self) -> str:
df_filtered = self.lookup_query_builder.filter(
col("query_id") == lit(self.query_id)
)
df_map = df_filtered.select(col("mappings"))
view_df = df_map.select(
to_json(struct([df_map[x] for x in df_map.columns]))
).collect()[0][0]
line = json.loads(view_df)
for line_v in line.values():
result = ast.literal_eval(line_v)
for result_key in result.keys():
result_new = result[result_key]
dim_from_date = result_new["dimensions"]["from_date"]
dim_to_date = result_new["dimensions"]["to_date"]
self._LOGGER.info(f"temp stage view name: {self.temp_stage_view_name}")
min_from_date = ExecEnv.SESSION.sql(
"""
SELECT
MIN({from_date}) as min_from_date
FROM {iter_stages}""".format( # nosec: B608
iter_stages=self.temp_stage_view_name, from_date=dim_from_date
)
).collect()[0][0]
max_from_date = ExecEnv.SESSION.sql(
"""
SELECT
MAX({from_date}) as max_from_date
FROM {iter_stages}""".format( # nosec: B608
iter_stages=self.temp_stage_view_name, from_date=dim_from_date
)
).collect()[0][0]
min_to_date = ExecEnv.SESSION.sql(
"""
SELECT
MIN({to_date}) as min_to_date
FROM {iter_stages}""".format( # nosec: B608
iter_stages=self.temp_stage_view_name, to_date=dim_to_date
)
).collect()[0][0]
max_to_date = ExecEnv.SESSION.sql(
"""
SELECT
MAX({to_date}) as max_to_date
FROM {iter_stages}""".format( # nosec: B608
iter_stages=self.temp_stage_view_name, to_date=dim_to_date
)
).collect()[0][0]
gen_del = """
DELETE FROM {target_database}.{target_table} a
WHERE query_id = {query_id}
AND cadence = '{cadence}'
AND from_date BETWEEN '{min_from_date}' AND '{max_from_date}'
AND to_date BETWEEN '{min_to_date}' AND '{max_to_date}'
""".format( # nosec: B608
target_database=self.target_database,
target_table=self.target_table,
query_id=self.query_id,
cadence=self.cadence,
min_from_date=min_from_date,
max_from_date=max_from_date,
min_to_date=min_to_date,
max_to_date=max_to_date,
)
return gen_del
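# Minimal usage sketch (assumption, not part of the original module; names are
# hypothetical): the delete generator can be used to clean a cadence window on
# the insights table before the insert generator reloads it:
#   GABDeleteGenerator(
#       query_id="1",
#       cadence="DAY",
#       temp_stage_view_name="temp_stage_view",
#       lookup_query_builder=lookup_query_builder_df,
#       target_database="my_database",
#       target_table="gab_insights",
#   ).generate_sql()  # decorated with @_execute_sql, so the DELETE runs here
#   insert_sql = GABInsertGenerator(
#       query_id="1",
#       cadence="DAY",
#       final_stage_table="final_stage_view",
#       lookup_query_builder=lookup_query_builder_df,
#       target_database="my_database",
#       target_table="gab_insights",
#   ).generate_sql()  # returns the INSERT statement without executing it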
================================================
FILE: lakehouse_engine/core/s3_file_manager.py
================================================
"""File manager module using boto3."""
import time
from typing import Any, Optional, Tuple
import boto3
from lakehouse_engine.algorithms.exceptions import RestoreTypeNotFoundException
from lakehouse_engine.core.definitions import (
ARCHIVE_STORAGE_CLASS,
FileManagerAPIKeys,
RestoreStatus,
RestoreType,
)
from lakehouse_engine.core.file_manager import FileManager
from lakehouse_engine.utils.file_utils import get_directory_path
from lakehouse_engine.utils.logging_handler import LoggingHandler
def _dry_run(bucket: str, object_paths: list) -> dict:
"""Build the dry run request return format.
Args:
bucket: name of bucket to perform operation.
object_paths: paths of objects to list.
Returns:
A dict with a list of objects that would be copied/deleted.
"""
response = {}
for path in object_paths:
if _check_directory(bucket, path):
path = get_directory_path(path)
res = _list_objects_recursively(bucket=bucket, path=path)
if res:
response[path] = res
else:
response[path] = ["No such key"]
return response
def _list_objects(
s3_client: Any, bucket: str, path: str, paginator: str = ""
) -> Tuple[list, str]:
"""List 1000 objects in a bucket given a prefix and paginator in s3.
Args:
bucket: name of bucket to perform the list.
path: path to be used as a prefix.
paginator: paginator token to be used.
Returns:
A list of object names.
"""
object_list = []
if not paginator:
list_response = s3_client.list_objects_v2(Bucket=bucket, Prefix=path)
else:
list_response = s3_client.list_objects_v2(
Bucket=bucket,
Prefix=path,
ContinuationToken=paginator,
)
if FileManagerAPIKeys.CONTENTS.value in list_response:
for obj in list_response[FileManagerAPIKeys.CONTENTS.value]:
object_list.append(obj[FileManagerAPIKeys.KEY.value])
if FileManagerAPIKeys.CONTINUATION.value in list_response:
pagination = list_response[FileManagerAPIKeys.CONTINUATION.value]
else:
pagination = ""
return object_list, pagination
def _list_objects_recursively(bucket: str, path: str) -> list:
"""Recursively list all objects given a prefix in s3.
Args:
bucket: name of bucket to perform the list.
path: path to be used as a prefix.
Returns:
A list of object names fetched recursively.
"""
object_list = []
more_objects = True
paginator = ""
s3 = boto3.client("s3")
while more_objects:
temp_list, paginator = _list_objects(s3, bucket, path, paginator)
object_list.extend(temp_list)
if not paginator:
more_objects = False
return object_list
def _check_directory(bucket: str, path: str) -> bool:
"""Checks if the object is a 'directory' in s3.
Args:
bucket: name of bucket to perform the check.
path: path to be used as a prefix.
Returns:
True if the path represents a 'directory', False otherwise.
"""
s3 = boto3.client("s3")
objects, _ = _list_objects(s3, bucket, path)
return len(objects) > 1
class S3FileManager(FileManager):
"""Set of actions to manipulate s3 files in several ways."""
_logger = LoggingHandler(__name__).get_logger()
def get_function(self) -> None:
"""Get a specific function to execute."""
available_functions = {
"delete_objects": self.delete_objects,
"copy_objects": self.copy_objects,
"request_restore": self.request_restore,
"check_restore_status": self.check_restore_status,
"request_restore_to_destination_and_wait": (
self.request_restore_to_destination_and_wait
),
}
self._logger.info("Function being executed: {}".format(self.function))
if self.function in available_functions.keys():
func = available_functions[self.function]
func()
else:
raise NotImplementedError(
f"The requested function {self.function} is not implemented."
)
def _delete_objects(self, bucket: str, objects_paths: list) -> None:
"""Delete objects recursively in s3.
Args:
bucket: name of bucket to perform the delete operation.
objects_paths: objects to be deleted.
"""
s3 = boto3.client("s3")
for path in objects_paths:
if _check_directory(bucket, path):
path = get_directory_path(path)
else:
path = path.strip()
more_objects = True
paginator = ""
objects_to_delete = []
while more_objects:
objects_found, paginator = _list_objects(
s3_client=s3, bucket=bucket, path=path, paginator=paginator
)
for obj in objects_found:
objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj})
if not paginator:
more_objects = False
response = s3.delete_objects(
Bucket=bucket,
Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete},
)
self._logger.info(response)
objects_to_delete = []
def delete_objects(self) -> None:
"""Delete objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be deleted based on the given keys.
"""
bucket = self.configs["bucket"]
objects_paths = self.configs["object_paths"]
dry_run = self.configs["dry_run"]
if dry_run:
response = _dry_run(bucket=bucket, object_paths=objects_paths)
self._logger.info("Paths that would be deleted:")
self._logger.info(response)
else:
self._delete_objects(bucket, objects_paths)
def copy_objects(self) -> None:
"""Copies objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be copied based on the given keys.
"""
source_bucket = self.configs["bucket"]
source_object = self.configs["source_object"]
destination_bucket = self.configs["destination_bucket"]
destination_object = self.configs["destination_object"]
dry_run = self.configs["dry_run"]
S3FileManager._copy_objects(
source_bucket=source_bucket,
source_object=source_object,
destination_bucket=destination_bucket,
destination_object=destination_object,
dry_run=dry_run,
)
def move_objects(self) -> None:
"""Moves objects and 'directories'.
If dry_run is set to True the function will print a dict with all the
paths that would be moved based on the given keys.
"""
pass
def request_restore(self) -> None:
"""Request the restore of archived data."""
source_bucket = self.configs["bucket"]
source_object = self.configs["source_object"]
restore_expiration = self.configs["restore_expiration"]
retrieval_tier = self.configs["retrieval_tier"]
dry_run = self.configs["dry_run"]
ArchiveFileManager.request_restore(
source_bucket,
source_object,
restore_expiration,
retrieval_tier,
dry_run,
)
def check_restore_status(self) -> None:
"""Check the restore status of archived data."""
source_bucket = self.configs["bucket"]
source_object = self.configs["source_object"]
restore_status = ArchiveFileManager.check_restore_status(
source_bucket, source_object
)
self._logger.info(
f"""
Restore status:
- Not Started: {restore_status.get('not_started_objects')}
- Ongoing: {restore_status.get('ongoing_objects')}
- Restored: {restore_status.get('restored_objects')}
Total objects in this restore process: {restore_status.get('total_objects')}
"""
)
def request_restore_to_destination_and_wait(self) -> None:
"""Request and wait for the restore to complete, polling the restore status.
After the restore is done, copy the restored files to the destination.
"""
source_bucket = self.configs["bucket"]
source_object = self.configs["source_object"]
destination_bucket = self.configs["destination_bucket"]
destination_object = self.configs["destination_object"]
restore_expiration = self.configs["restore_expiration"]
retrieval_tier = self.configs["retrieval_tier"]
dry_run = self.configs["dry_run"]
ArchiveFileManager.request_restore_and_wait(
source_bucket=source_bucket,
source_object=source_object,
restore_expiration=restore_expiration,
retrieval_tier=retrieval_tier,
dry_run=dry_run,
)
S3FileManager._logger.info(
f"Restoration complete for {source_bucket} and {source_object}"
)
S3FileManager._logger.info(
f"Starting to copy data from {source_bucket}/{source_object} to "
f"{destination_bucket}/{destination_object}"
)
S3FileManager._copy_objects(
source_bucket=source_bucket,
source_object=source_object,
destination_bucket=destination_bucket,
destination_object=destination_object,
dry_run=dry_run,
)
S3FileManager._logger.info(
f"Finished copying data, data should be available on {destination_bucket}/"
f"{destination_object}"
)
@staticmethod
def _copy_objects(
source_bucket: str,
source_object: str,
destination_bucket: str,
destination_object: str,
dry_run: bool,
) -> None:
"""Copies objects and 'directories' in s3.
Args:
source_bucket: name of bucket to perform the copy.
source_object: object/folder to be copied.
destination_bucket: name of the target bucket to copy.
destination_object: target object/folder to copy.
dry_run: if dry_run is set to True the function will print a dict with
all the paths that would be copied based on the given keys.
"""
s3 = boto3.client("s3")
if dry_run:
response = _dry_run(bucket=source_bucket, object_paths=[source_object])
S3FileManager._logger.info("Paths that would be copied:")
S3FileManager._logger.info(response)
else:
original_object_name = source_object.split("/")[-1]
if _check_directory(source_bucket, source_object):
source_object = get_directory_path(source_object)
copy_object = _list_objects_recursively(
bucket=source_bucket, path=source_object
)
for obj in copy_object:
S3FileManager._logger.info(f"Copying obj: {obj}")
final_path = obj.replace(source_object, "")
response = s3.copy_object(
Bucket=destination_bucket,
CopySource={
FileManagerAPIKeys.BUCKET.value: source_bucket,
FileManagerAPIKeys.KEY.value: obj,
},
Key=f"{destination_object}/{original_object_name}/{final_path}",
)
S3FileManager._logger.info(response)
else:
S3FileManager._logger.info(f"Copying obj: {source_object}")
response = s3.copy_object(
Bucket=destination_bucket,
CopySource={
FileManagerAPIKeys.BUCKET.value: source_bucket,
FileManagerAPIKeys.KEY.value: source_object,
},
Key=f"""{destination_object}/{original_object_name}""",
)
S3FileManager._logger.info(response)
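# Minimal usage sketch (assumption; bucket and paths are hypothetical, and the
# base FileManager constructor is assumed to accept the configs dict):
#   S3FileManager(configs={
#       "function": "delete_objects",
#       "bucket": "my-bucket",
#       "object_paths": ["path/to/folder", "path/to/file.parquet"],
#       "dry_run": True,  # only logs the paths that would be deleted
#   }).get_function()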
class ArchiveFileManager(object):
"""Set of actions to restore archives."""
_logger = LoggingHandler(__name__).get_logger()
@staticmethod
def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]:
"""Get the archived object if it's an object.
Args:
bucket: name of bucket from which to get the object.
object_key: object to get.
Returns:
S3 Object if it's an archived object, otherwise None.
"""
s3 = boto3.resource("s3")
object_to_restore = s3.Object(bucket, object_key)
if (
object_to_restore.storage_class is not None
and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS
):
return object_to_restore
else:
return None
@staticmethod
def _check_object_restore_status(
bucket: str, object_key: str
) -> Optional[RestoreStatus]:
"""Check the restore status of the archive.
Args:
bucket: name of bucket to check the restore status.
object_key: object to check the restore status.
Returns:
The restore status represented by an enum, possible values are:
NOT_STARTED, ONGOING or RESTORED
"""
archived_object = ArchiveFileManager._get_archived_object(bucket, object_key)
if archived_object is None:
status = None
elif archived_object.restore is None:
status = RestoreStatus.NOT_STARTED
elif 'ongoing-request="true"' in archived_object.restore:
status = RestoreStatus.ONGOING
else:
status = RestoreStatus.RESTORED
return status
@staticmethod
def check_restore_status(source_bucket: str, source_object: str) -> dict:
"""Check the restore status of archived data.
Args:
source_bucket: name of bucket to check the restore status.
source_object: object to check the restore status.
Returns:
A dict containing the number of objects in each status.
"""
not_started_objects = 0
ongoing_objects = 0
restored_objects = 0
total_objects = 0
if _check_directory(source_bucket, source_object):
source_object = get_directory_path(source_object)
objects_to_restore = _list_objects_recursively(
bucket=source_bucket, path=source_object
)
for obj in objects_to_restore:
ArchiveFileManager._logger.info(f"Checking restore status for: {obj}")
restore_status = ArchiveFileManager._check_object_restore_status(
source_bucket, obj
)
if not restore_status:
ArchiveFileManager._logger.warning(
f"Restore status not found for {source_bucket}/{obj}"
)
else:
total_objects += 1
if RestoreStatus.NOT_STARTED == restore_status:
not_started_objects += 1
elif RestoreStatus.ONGOING == restore_status:
ongoing_objects += 1
else:
restored_objects += 1
ArchiveFileManager._logger.info(
f"{obj} restore status is {restore_status.value}"
)
return {
"total_objects": total_objects,
"not_started_objects": not_started_objects,
"ongoing_objects": ongoing_objects,
"restored_objects": restored_objects,
}
@staticmethod
def _request_restore_object(
bucket: str, object_key: str, expiration: int, retrieval_tier: str
) -> None:
"""Request a restore of the archive.
Args:
bucket: name of bucket to perform the restore.
object_key: object to be restored.
expiration: restore expiration in days.
retrieval_tier: type of restore, possible values are:
Bulk, Standard or Expedited.
"""
if not RestoreType.exists(retrieval_tier):
raise RestoreTypeNotFoundException(
f"Restore type {retrieval_tier} not supported."
)
if _check_directory(bucket, object_key):
object_key = get_directory_path(object_key)
archived_object = ArchiveFileManager._get_archived_object(bucket, object_key)
if archived_object and archived_object.restore is None:
ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.")
archived_object.restore_object(
RestoreRequest={
"Days": expiration,
"GlacierJobParameters": {"Tier": retrieval_tier},
}
)
else:
ArchiveFileManager._logger.info(
f"Restore request for {bucket}/{object_key} not performed."
)
@staticmethod
def request_restore(
source_bucket: str,
source_object: str,
restore_expiration: int,
retrieval_tier: str,
dry_run: bool,
) -> None:
"""Request the restore of archived data.
Args:
source_bucket: name of bucket to perform the restore.
source_object: object to be restored.
restore_expiration: restore expiration in days.
retrieval_tier: type of restore, possible values are:
Bulk, Standard or Expedited.
dry_run: if dry_run is set to True the function will print a dict with
all the paths that would be restored based on the given keys.
"""
if _check_directory(source_bucket, source_object):
source_object = get_directory_path(source_object)
if dry_run:
response = _dry_run(bucket=source_bucket, object_paths=[source_object])
ArchiveFileManager._logger.info("Paths that would be restored:")
ArchiveFileManager._logger.info(response)
else:
objects_to_restore = _list_objects_recursively(
bucket=source_bucket, path=source_object
)
for obj in objects_to_restore:
ArchiveFileManager._request_restore_object(
source_bucket,
obj,
restore_expiration,
retrieval_tier,
)
@staticmethod
def request_restore_and_wait(
source_bucket: str,
source_object: str,
restore_expiration: int,
retrieval_tier: str,
dry_run: bool,
) -> None:
"""Request and wait for the restore to complete, polling the restore status.
Args:
source_bucket: name of bucket to perform the restore.
source_object: object to be restored.
restore_expiration: restore expiration in days.
retrieval_tier: type of restore, possible values are:
Bulk, Standard or Expedited.
dry_run: if dry_run is set to True the function will print a dict with
all the paths that would be restored based on the given keys.
"""
if retrieval_tier != RestoreType.EXPEDITED.value:
ArchiveFileManager._logger.error(
f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
"kind of restore should be used just with `Expedited` retrieval tier "
"to save cluster costs."
)
raise ValueError(
f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
"kind of restore should be used just with `Expedited` retrieval tier "
"to save cluster costs."
)
ArchiveFileManager.request_restore(
source_bucket=source_bucket,
source_object=source_object,
restore_expiration=restore_expiration,
retrieval_tier=retrieval_tier,
dry_run=dry_run,
)
restore_status = ArchiveFileManager.check_restore_status(
source_bucket, source_object
)
ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
if not dry_run:
ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.")
wait_time = 300
while restore_status.get("total_objects") > restore_status.get(
"restored_objects"
):
ArchiveFileManager._logger.info(
"Not all objects have been restored yet, checking the status again "
f"in {wait_time} seconds."
)
time.sleep(wait_time)
wait_time = 30
restore_status = ArchiveFileManager.check_restore_status(
source_bucket, source_object
)
ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
================================================
FILE: lakehouse_engine/core/sensor_manager.py
================================================
"""Module to define Sensor Manager classes."""
import json
from datetime import datetime
from typing import List, Optional, Tuple
import requests
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, Row
from pyspark.sql.functions import array, col, lit
from lakehouse_engine.core.definitions import (
SENSOR_SCHEMA,
SENSOR_UPDATE_SET,
SAPLogchain,
SensorSpec,
SensorStatus,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader_factory import ReaderFactory
from lakehouse_engine.utils.logging_handler import LoggingHandler
class SensorControlTableManager(object):
"""Class to control the Sensor execution."""
_LOGGER = LoggingHandler(__name__).get_logger()
@classmethod
def check_if_sensor_has_acquired_data(
cls,
sensor_id: str,
control_db_table_name: str,
) -> bool:
"""Check if sensor has acquired new data.
Args:
sensor_id: sensor id.
control_db_table_name: `db.table` to control sensor runs.
Returns:
True if acquired new data, otherwise False
"""
sensor_table_data = cls.read_sensor_table_data(
sensor_id=sensor_id, control_db_table_name=control_db_table_name
)
cls._LOGGER.info(f"sensor_table_data = {sensor_table_data}")
return (
sensor_table_data is not None
and sensor_table_data.status == SensorStatus.ACQUIRED_NEW_DATA.value
)
@classmethod
def update_sensor_status(
cls,
sensor_spec: SensorSpec,
status: str,
upstream_key: str = None,
upstream_value: str = None,
) -> None:
"""Control sensor execution storing the execution data in a delta table.
Args:
sensor_spec: sensor spec containing all sensor
information we need to update the control status.
status: status of the sensor.
upstream_key: upstream key (e.g., used to store an attribute
name from the upstream so that new data can be detected
automatically).
upstream_value: upstream value (e.g., used to store the max
attribute value from the upstream so that new data can be
detected automatically).
"""
cls._LOGGER.info(
f"Updating sensor status for sensor {sensor_spec.sensor_id}..."
)
data = cls._convert_sensor_to_data(
spec=sensor_spec,
status=status,
upstream_key=upstream_key,
upstream_value=upstream_value,
)
sensor_update_set = cls._get_sensor_update_set(
assets=sensor_spec.assets,
checkpoint_location=sensor_spec.checkpoint_location,
upstream_key=upstream_key,
upstream_value=upstream_value,
)
cls._update_sensor_control(
data=data,
sensor_update_set=sensor_update_set,
sensor_control_table=sensor_spec.control_db_table_name,
sensor_id=sensor_spec.sensor_id,
)
@classmethod
def _update_sensor_control(
cls,
data: List[dict],
sensor_update_set: dict,
sensor_control_table: str,
sensor_id: str,
) -> None:
"""Update sensor control delta table.
Args:
data: to be updated.
sensor_update_set: columns which we have to update.
sensor_control_table: control table name.
sensor_id: sensor_id to be updated.
"""
sensors_delta_table = DeltaTable.forName(
ExecEnv.SESSION,
sensor_control_table,
)
sensors_updates = ExecEnv.SESSION.createDataFrame(data, SENSOR_SCHEMA)
sensors_delta_table.alias("sensors").merge(
sensors_updates.alias("updates"),
f"sensors.sensor_id = '{sensor_id}' AND "
"sensors.sensor_id = updates.sensor_id",
).whenMatchedUpdate(set=sensor_update_set).whenNotMatchedInsertAll().execute()
@classmethod
def _convert_sensor_to_data(
cls,
spec: SensorSpec,
status: str,
upstream_key: str,
upstream_value: str,
status_change_timestamp: Optional[datetime] = None,
) -> List[dict]:
"""Convert sensor data to dataframe input data.
Args:
spec: sensor spec containing sensor identifier data.
status: new sensor data status.
upstream_key: key used to acquire data from the upstream.
upstream_value: max value from the upstream_key
acquired from the upstream.
status_change_timestamp: timestamp at which we commit
this change in the sensor control table.
Returns:
Sensor data as list[dict], used to create a
dataframe to store the data into the sensor_control_table.
"""
status_change_timestamp = (
datetime.now()
if status_change_timestamp is None
else status_change_timestamp
)
return [
{
"sensor_id": spec.sensor_id,
"assets": spec.assets,
"status": status,
"status_change_timestamp": status_change_timestamp,
"checkpoint_location": spec.checkpoint_location,
"upstream_key": str(upstream_key),
"upstream_value": str(upstream_value),
}
]
@classmethod
def _get_sensor_update_set(cls, **kwargs: Optional[str] | List[str]) -> dict:
"""Get the sensor update set.
Args:
kwargs: Containing the following keys:
- assets
- checkpoint_location
- upstream_key
- upstream_value
Returns:
A dict containing the fields to update in the control_table.
"""
sensor_update_set = dict(SENSOR_UPDATE_SET)
for key, value in kwargs.items():
if value:
sensor_update_set[f"sensors.{key}"] = f"updates.{key}"
return sensor_update_set
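# Hypothetical example (values invented for illustration): calling
# _get_sensor_update_set(assets=["asset_a"], checkpoint_location=None,
# upstream_key="load_date", upstream_value="20240101") extends the base
# SENSOR_UPDATE_SET with:
#   {"sensors.assets": "updates.assets",
#    "sensors.upstream_key": "updates.upstream_key",
#    "sensors.upstream_value": "updates.upstream_value"}
# while the empty checkpoint_location is skipped.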
@classmethod
def read_sensor_table_data(
cls,
control_db_table_name: str,
sensor_id: str = None,
assets: list = None,
) -> Optional[Row]:
"""Read data from delta table containing sensor status info.
Args:
sensor_id: sensor id. If this parameter is defined, the search occurs
only considering this parameter. Otherwise, it considers sensor
assets and checkpoint location.
control_db_table_name: db.table to control sensor runs.
assets: list of assets that are fueled by the pipeline
where this sensor is.
Returns:
Row containing the data for the provided sensor_id.
"""
df = DeltaTable.forName(
ExecEnv.SESSION,
control_db_table_name,
).toDF()
if sensor_id:
df = df.where(col("sensor_id") == sensor_id)
elif assets:
df = df.where(col("assets") == array(*[lit(asset) for asset in assets]))
else:
raise ValueError(
"Either sensor_id or assets need to be provided as arguments."
)
return df.first()
class SensorUpstreamManager(object):
"""Class to deal with Sensor Upstream data."""
_LOGGER = LoggingHandler(__name__).get_logger()
@classmethod
def generate_filter_exp_query(
cls,
sensor_id: str,
filter_exp: str,
control_db_table_name: str = None,
upstream_key: str = None,
upstream_value: str = None,
upstream_table_name: str = None,
) -> str:
"""Generates a sensor preprocess query based on timestamp logic.
Args:
sensor_id: sensor id.
filter_exp: expression to filter incoming new data.
You can use the placeholder `?upstream_value` so that
it can be replaced by the upstream_value in the
control_db_table_name for this specific sensor_id.
control_db_table_name: db.table to retrieve the last status change
timestamp. This is only relevant for the jdbc sensor.
upstream_key: the key of custom sensor information
to control how to identify new data from the
upstream (e.g., a time column in the upstream).
upstream_value: value for custom sensor
to identify new data from the upstream
(e.g., the value of a time present in the upstream)
If None, we will set the default value.
Note: This parameter is used just to override the
default value `-2147483647`.
upstream_table_name: value for custom sensor
to query new data from the upstream.
If None, we will set the default value,
our `sensor_new_data` view.
Returns:
The query string.
"""
source_table = upstream_table_name if upstream_table_name else "sensor_new_data"
select_exp = "SELECT COUNT(1) as count"
if control_db_table_name:
if not upstream_key:
raise ValueError(
"If control_db_table_name is defined, upstream_key should "
"also be defined!"
)
default_upstream_value: str = "-2147483647"
trigger_name = upstream_key
trigger_value = (
default_upstream_value if upstream_value is None else upstream_value
)
sensor_table_data = SensorControlTableManager.read_sensor_table_data(
sensor_id=sensor_id, control_db_table_name=control_db_table_name
)
if sensor_table_data and sensor_table_data.upstream_value:
trigger_value = sensor_table_data.upstream_value
filter_exp = filter_exp.replace("?upstream_key", trigger_name).replace(
"?upstream_value", trigger_value
)
select_exp = (
f"SELECT COUNT(1) as count, '{trigger_name}' as UPSTREAM_KEY, "
f"max({trigger_name}) as UPSTREAM_VALUE"
)
query = (
f"{select_exp} "
f"FROM {source_table} "
f"WHERE {filter_exp} "
f"HAVING COUNT(1) > 0"
)
return query
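# Hypothetical example (sensor, column and table names invented): with
# control_db_table_name defined, filter_exp="?upstream_key > '?upstream_value'",
# upstream_key="load_date" and an upstream_value of "20240101" stored in the
# control table, the generated query would be roughly:
#   SELECT COUNT(1) as count, 'load_date' as UPSTREAM_KEY,
#          max(load_date) as UPSTREAM_VALUE
#   FROM sensor_new_data
#   WHERE load_date > '20240101'
#   HAVING COUNT(1) > 0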
@classmethod
def generate_sensor_table_preprocess_query(
cls,
sensor_id: str,
) -> str:
"""Generates a query to be used for a sensor having other sensor as upstream.
Args:
sensor_id: sensor id.
Returns:
The query string.
"""
query = (
f"SELECT * " # nosec
f"FROM sensor_new_data "
f"WHERE"
f" _change_type in ('insert', 'update_postimage')"
f" and sensor_id = '{sensor_id}'"
f" and status = '{SensorStatus.PROCESSED_NEW_DATA.value}'"
)
return query
@classmethod
def read_new_data(cls, sensor_spec: SensorSpec) -> DataFrame:
"""Read new data from the upstream into the sensor 'new_data_df'.
Args:
sensor_spec: sensor spec containing all sensor information.
Returns:
An empty dataframe if it doesn't have new data, otherwise the new data.
"""
new_data_df = ReaderFactory.get_data(sensor_spec.input_spec)
if sensor_spec.preprocess_query:
new_data_df.createOrReplaceTempView("sensor_new_data")
new_data_df = ExecEnv.SESSION.sql(sensor_spec.preprocess_query)
return new_data_df
@classmethod
def get_new_data(
cls,
new_data_df: DataFrame,
) -> Optional[Row]:
"""Get new data from upstream df if it's present.
Args:
new_data_df: DataFrame possibly containing new data.
Returns:
Optional row, present if there is new data in the upstream,
absent otherwise.
"""
return new_data_df.first()
@classmethod
def generate_sensor_sap_logchain_query(
cls,
chain_id: str,
dbtable: str = SAPLogchain.DBTABLE.value,
status: str = SAPLogchain.GREEN_STATUS.value,
engine_table_name: str = SAPLogchain.ENGINE_TABLE.value,
) -> str:
"""Generates a sensor query based in the SAP Logchain table.
Args:
chain_id: chain id to query the status on SAP.
dbtable: db.table to retrieve the data to
check if the sap chain is already finished.
status: analyzed status value that marks the SAP chain
as successfully finished.
engine_table_name: table name exposed with the SAP LOGCHAIN data.
This table will be used in the jdbc query.
Returns:
The query string.
"""
if not chain_id:
raise ValueError(
"To query on log chain SAP table the chain id should be defined!"
)
select_exp = (
"SELECT CHAIN_ID, CONCAT(DATUM, ZEIT) AS LOAD_DATE, ANALYZED_STATUS"
)
filter_exp = (
f"UPPER(CHAIN_ID) = UPPER('{chain_id}') "
f"AND UPPER(ANALYZED_STATUS) = UPPER('{status}')"
)
query = (
f"WITH {engine_table_name} AS ("
f"{select_exp} "
f"FROM {dbtable} "
f"WHERE {filter_exp}"
")"
)
return query
class SensorJobRunManager(object):
"""Class to manage triggering of Jobs via Job Run API."""
_LOGGER = LoggingHandler(__name__).get_logger()
@classmethod
def run_job(cls, job_id: str, token: str, host: str) -> Tuple[int, Optional[str]]:
"""Trigger the job based on its id.
Args:
job_id: the id of the job to trigger.
token: token required to access Databricks API.
host: host for workspace.
"""
run_id = None
ex = None
headers = {"Authorization": f"Bearer {token}"}
body = json.dumps(
{
"job_id": job_id,
"notebook_params": {"msg": "triggered via heartbeat sensor"},
}
)
res = requests.post(
f"https://{host}/api/2.1/jobs/run-now",
data=body,
headers=headers,
timeout=3600,
)
if res.status_code == 200:
run_id = (json.loads(res.text))["run_id"]
cls._LOGGER.info(
f"Job : {str(job_id)} triggered successfully... RUN ID : {str(run_id)}"
)
else:
ex = str(res.json()["error_code"]) + " " + res.json()["message"]
cls._LOGGER.error(f"An error has occurred: {ex}")
return run_id, ex
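# A minimal usage sketch (illustrative, not part of the engine): the job id,
# token and host below are placeholders; in practice the token typically comes
# from a secret scope.
def _example_trigger_downstream_job() -> None:
    run_id, error = SensorJobRunManager.run_job(
        job_id="123456",  # placeholder Databricks job id
        token="<databricks-personal-access-token>",  # placeholder token
        host="adb-1234567890123456.7.azuredatabricks.net",  # placeholder host
    )
    if error:
        raise RuntimeError(f"Failed to trigger job: {error}")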
================================================
FILE: lakehouse_engine/core/table_manager.py
================================================
"""Table manager module."""
from typing import List
from delta.tables import DeltaTable
from pyspark.sql import DataFrame
from pyspark.sql.functions import translate
from lakehouse_engine.core.definitions import SQLDefinitions
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.configs.config_utils import ConfigUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.sql_parser_utils import SQLParserUtils
class TableManager(object):
"""Set of actions to manipulate tables/views in several ways.
{{ get_table_manager_operations() }}
"""
def __init__(self, configs: dict):
"""Construct TableManager algorithm instances.
Args:
configs: configurations for the TableManager algorithm.
"""
self._logger = LoggingHandler(__name__).get_logger()
self.configs = configs
self.function = self.configs["function"]
def get_function(self) -> None:
"""Get a specific function to execute."""
available_functions = {
"compute_table_statistics": self.compute_table_statistics,
"create_table": self.create,
"create_tables": self.create_many,
"create_view": self.create,
"drop_table": self.drop_table,
"drop_view": self.drop_view,
"execute_sql": self.execute_sql,
"truncate": self.truncate,
"vacuum": self.vacuum,
"describe": self.describe,
"optimize": self.optimize,
"show_tbl_properties": self.show_tbl_properties,
"get_tbl_pk": self.get_tbl_pk,
"repair_table": self.repair_table,
"delete_where": self.delete_where,
}
self._logger.info("Function being executed: {}".format(self.function))
if self.function in available_functions.keys():
func = available_functions[self.function]
func()
else:
raise NotImplementedError(
f"The requested function {self.function} is not implemented."
)
def create(self) -> None:
"""Create a new table or view on metastore."""
disable_dbfs_retry = (
self.configs["disable_dbfs_retry"]
if "disable_dbfs_retry" in self.configs.keys()
else False
)
sql = ConfigUtils.read_sql(self.configs["path"], disable_dbfs_retry)
try:
sql_commands = SQLParserUtils().split_sql_commands(
sql_commands=sql,
delimiter=self.configs.get("delimiter", ";"),
advanced_parser=self.configs.get("advanced_parser", False),
)
for command in sql_commands:
if command.strip():
self._logger.info(f"sql command: {command}")
ExecEnv.SESSION.sql(command)
self._logger.info(f"{self.function} successfully executed!")
except Exception as e:
self._logger.error(e)
raise
def create_many(self) -> None:
"""Create multiple tables or views on metastore.
In this function the paths to the DDL files can be separated by commas.
"""
self.execute_multiple_sql_files()
def compute_table_statistics(self) -> None:
"""Compute table statistics."""
sql = SQLDefinitions.compute_table_stats.value.format(
self.configs["table_or_view"]
)
try:
self._logger.info(f"sql command: {sql}")
ExecEnv.SESSION.sql(sql)
self._logger.info(f"{self.function} successfully executed!")
except Exception as e:
self._logger.error(e)
raise
def drop_table(self) -> None:
"""Delete table function deletes table from metastore and erases all data."""
drop_stmt = "{} {}".format(
SQLDefinitions.drop_table_stmt.value,
self.configs["table_or_view"],
)
self._logger.info(f"sql command: {drop_stmt}")
ExecEnv.SESSION.sql(drop_stmt)
self._logger.info("Table successfully dropped!")
def drop_view(self) -> None:
"""Delete view function deletes view from metastore and erases all data."""
drop_stmt = "{} {}".format(
SQLDefinitions.drop_view_stmt.value,
self.configs["table_or_view"],
)
self._logger.info(f"sql command: {drop_stmt}")
ExecEnv.SESSION.sql(drop_stmt)
self._logger.info("View successfully dropped!")
def truncate(self) -> None:
"""Truncate function erases all data but keeps metadata."""
truncate_stmt = "{} {}".format(
SQLDefinitions.truncate_stmt.value,
self.configs["table_or_view"],
)
self._logger.info(f"sql command: {truncate_stmt}")
ExecEnv.SESSION.sql(truncate_stmt)
self._logger.info("Table successfully truncated!")
def vacuum(self) -> None:
"""Vacuum function erases older versions from Delta Lake tables or locations."""
if not self.configs.get("table_or_view", None):
delta_table = DeltaTable.forPath(ExecEnv.SESSION, self.configs["path"])
self._logger.info(f"Vacuuming location: {self.configs['path']}")
delta_table.vacuum(self.configs.get("vacuum_hours", 168))
else:
delta_table = DeltaTable.forName(
ExecEnv.SESSION, self.configs["table_or_view"]
)
self._logger.info(f"Vacuuming table: {self.configs['table_or_view']}")
delta_table.vacuum(self.configs.get("vacuum_hours", 168))
def describe(self) -> None:
"""Describe function describes metadata from some table or view."""
describe_stmt = "{} {}".format(
SQLDefinitions.describe_stmt.value,
self.configs["table_or_view"],
)
self._logger.info(f"sql command: {describe_stmt}")
output = ExecEnv.SESSION.sql(describe_stmt)
self._logger.info(output)
def optimize(self) -> None:
"""Optimize function optimizes the layout of Delta Lake data."""
if self.configs.get("where_clause", None):
where_exp = "WHERE {}".format(self.configs["where_clause"].strip())
else:
where_exp = ""
if self.configs.get("optimize_zorder_col_list", None):
zorder_exp = "ZORDER BY ({})".format(
self.configs["optimize_zorder_col_list"].strip()
)
else:
zorder_exp = ""
optimize_stmt = "{} {} {} {}".format(
SQLDefinitions.optimize_stmt.value,
(
f"delta.`{self.configs.get('path', None)}`"
if not self.configs.get("table_or_view", None)
else self.configs.get("table_or_view", None)
),
where_exp,
zorder_exp,
)
self._logger.info(f"sql command: {optimize_stmt}")
output = ExecEnv.SESSION.sql(optimize_stmt)
self._logger.info(output)
def execute_multiple_sql_files(self) -> None:
"""Execute multiple statements in multiple sql files.
In this function the paths to the files are separated by commas.
"""
for table_metadata_file in self.configs["path"].split(","):
disable_dbfs_retry = (
self.configs["disable_dbfs_retry"]
if "disable_dbfs_retry" in self.configs.keys()
else False
)
sql = ConfigUtils.read_sql(table_metadata_file.strip(), disable_dbfs_retry)
sql_commands = SQLParserUtils().split_sql_commands(
sql_commands=sql,
delimiter=self.configs.get("delimiter", ";"),
advanced_parser=self.configs.get("advanced_parser", False),
)
for command in sql_commands:
if command.strip():
self._logger.info(f"sql command: {command}")
ExecEnv.SESSION.sql(command)
self._logger.info("sql file successfully executed!")
def execute_sql(self) -> None:
"""Execute sql commands separated by semicolon (;)."""
sql_commands = SQLParserUtils().split_sql_commands(
sql_commands=self.configs.get("sql"),
delimiter=self.configs.get("delimiter", ";"),
advanced_parser=self.configs.get("advanced_parser", False),
)
for command in sql_commands:
if command.strip():
self._logger.info(f"sql command: {command}")
ExecEnv.SESSION.sql(command)
self._logger.info("sql successfully executed!")
def show_tbl_properties(self) -> DataFrame:
"""Show Table Properties.
Returns:
A dataframe with the table properties.
"""
show_tbl_props_stmt = "{} {}".format(
SQLDefinitions.show_tbl_props_stmt.value,
self.configs["table_or_view"],
)
self._logger.info(f"sql command: {show_tbl_props_stmt}")
output = ExecEnv.SESSION.sql(show_tbl_props_stmt)
self._logger.info(output)
return output
def get_tbl_pk(self) -> List[str]:
"""Get the primary key of a particular table.
Returns:
The list of columns that are part of the primary key.
"""
output: List[str] = (
self.show_tbl_properties()
.filter("key == 'lakehouse.primary_key'")
.select("value")
.withColumn("value", translate("value", " `", ""))
.first()[0]
.split(",")
)
self._logger.info(output)
return output
def repair_table(self) -> None:
"""Run the repair table command."""
table_name = self.configs["table_or_view"]
sync_metadata = self.configs["sync_metadata"]
repair_stmt = (
f"MSCK REPAIR TABLE {table_name} "
f"{'SYNC METADATA' if sync_metadata else ''}"
)
self._logger.info(f"sql command: {repair_stmt}")
output = ExecEnv.SESSION.sql(repair_stmt)
self._logger.info(output)
def delete_where(self) -> None:
"""Run the delete where command."""
table_name = self.configs["table_or_view"]
delete_where = self.configs["where_clause"].strip()
delete_stmt = SQLDefinitions.delete_where_stmt.value.format(
table_name, delete_where
)
self._logger.info(f"sql command: {delete_stmt}")
output = ExecEnv.SESSION.sql(delete_stmt)
self._logger.info(output)
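# A minimal usage sketch (illustrative, not part of the engine): the TableManager
# is driven entirely by its configuration dict, whose "function" key selects one
# of the operations registered in get_function(). Table and column names below
# are placeholders.
def _example_optimize_table() -> None:
    configs = {
        "function": "optimize",
        "table_or_view": "my_db.my_table",
        "where_clause": "year = 2024",
        "optimize_zorder_col_list": "customer_id",
    }
    TableManager(configs).get_function()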
================================================
FILE: lakehouse_engine/dq_processors/__init__.py
================================================
"""Package to define data quality processes available in the lakehouse engine."""
================================================
FILE: lakehouse_engine/dq_processors/custom_expectations/__init__.py
================================================
"""Package containing custom DQ expectations available in the lakehouse engine."""
================================================
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b.py
================================================
"""Expectation to check if column 'a' is not equal to column 'b'."""
from typing import Any, Dict, Optional
from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine
from great_expectations.expectations.expectation import ColumnPairMapExpectation
from great_expectations.expectations.metrics.map_metric_provider import (
ColumnPairMapMetricProvider,
column_pair_condition_partial,
)
from lakehouse_engine.utils.expectations_utils import validate_result
class ColumnPairCustom(ColumnPairMapMetricProvider):
"""Asserts that column 'A' is not equal to column 'B'.
Additionally, it also takes null values into account.
"""
condition_metric_name = "column_pair_values.a_not_equal_to_b"
condition_domain_keys = (
"batch_id",
"table",
"column_A",
"column_B",
"ignore_row_if",
)
condition_value_keys = ()
@column_pair_condition_partial(engine=SparkDFExecutionEngine)
def _spark(
self: ColumnPairMapMetricProvider,
column_A: Any,
column_B: Any,
**kwargs: dict,
) -> Any:
"""Implementation of the expectation's logic.
Args:
column_A: Value of the row of column_A.
column_B: Value of the row of column_B.
kwargs: dict with additional parameters.
Returns:
If the condition is met.
"""
return ((column_A.isNotNull()) | (column_B.isNotNull())) & (
column_A != column_B
) # noqa: E501
class ExpectColumnPairAToBeNotEqualToB(ColumnPairMapExpectation):
"""Expect values in column A to be not equal to column B.
Args:
column_A: The first column name.
column_B: The second column name.
Keyword Args:
allow_cross_type_comparisons: If True, allow
comparisons between types (e.g. integer and string).
Otherwise, attempting such comparisons will raise an exception.
ignore_row_if: "both_values_are_missing",
"either_value_is_missing", "neither" (default).
result_format: Which output mode to use:
`BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
include_config: If True (default), then include the expectation config
as part of the result object.
catch_exceptions: If True, then catch exceptions and
include them as part of the result object. Default: False.
meta: A JSON-serializable dictionary (nesting allowed)
that will be included in the output without modification.
Returns:
An ExpectationSuiteValidationResult.
"""
mostly: float = 1.0
ignore_row_if: str = "neither"
result_format: dict = {"result_format": "BASIC"}
include_config: bool = True
catch_exceptions: bool = False
column_A: Any = None
column_B: Any = None
examples = [
{
"dataset_name": "Test Dataset",
"data": [
{
"data": {
"a": ["IE4019", "IM6092", "IE1405"],
"b": ["IE4019", "IM6092", "IE1405"],
"c": ["IE1404", "IN6192", "842075"],
},
"schemas": {
"spark": {
"a": "StringType",
"b": "StringType",
"c": "StringType",
}
},
}
],
"tests": [
{
"title": "negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_A": "a",
"column_B": "b",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["b"],
},
},
"out": {
"success": False,
"unexpected_index_list": [
{"b": "IE4019", "a": "IE4019"},
{"b": "IM6092", "a": "IM6092"},
{"b": "IE1405", "a": "IE1405"},
],
},
},
{
"title": "positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_A": "a",
"column_B": "c",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["a"],
},
},
"out": {
"success": True,
"unexpected_index_list": [],
},
},
],
},
]
map_metric = "column_pair_values.a_not_equal_to_b"
success_keys = (
"column_A",
"column_B",
"ignore_row_if",
"mostly",
)
def _validate(
self,
metrics: Dict,
runtime_configuration: Optional[dict] = None,
execution_engine: Optional[ExecutionEngine] = None,
) -> Any:
"""Custom implementation of the GE _validate method.
This method is used on the tests to validate both the result
of the tests themselves and if the unexpected index list
is correctly generated.
The GE test logic does not do this validation, and thus
we need to make it manually.
Args:
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution Engine where the expectation was run.
Returns:
Dictionary with the result of the validation.
"""
validate_result(
self,
metrics,
)
return super()._validate(metrics, runtime_configuration, execution_engine)
"""Mandatory block of code. If it is removed the expectation will not be available."""
if __name__ == "__main__":
# test the custom expectation with the function `print_diagnostic_checklist()`
ExpectColumnPairAToBeNotEqualToB().print_diagnostic_checklist()
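# A minimal sketch of the kwargs this expectation takes when added to a suite
# (column names are placeholders). In the lakehouse engine these kwargs are
# typically carried in the `args` of a dq function spec and turned into an
# ExpectationConfiguration by the DQ factory (see dq_factory.py).
_example_not_equal_args = {
    "column_A": "previous_status",
    "column_B": "current_status",
}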
================================================
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b.py
================================================
"""Expectation to check if column 'a' is lower or equal than column 'b'."""
from typing import Any, Dict, Optional
from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine
from great_expectations.expectations.expectation import ColumnPairMapExpectation
from great_expectations.expectations.metrics.map_metric_provider import (
ColumnPairMapMetricProvider,
column_pair_condition_partial,
)
from lakehouse_engine.utils.expectations_utils import validate_result
class ColumnPairCustom(ColumnPairMapMetricProvider):
"""Asserts that column 'A' is lower or equal than column 'B'.
Additionally, the 'margin' parameter can be used to add a margin to the
check between column 'A' and 'B': 'A' <= 'B' + 'margin'.
"""
condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"
condition_domain_keys = (
"batch_id",
"table",
"column_A",
"column_B",
"ignore_row_if",
)
condition_value_keys = ("margin",)
@column_pair_condition_partial(engine=SparkDFExecutionEngine)
def _spark(
self: ColumnPairMapMetricProvider,
column_A: Any,
column_B: Any,
**kwargs: dict,
) -> Any:
"""Implementation of the expectation's logic.
Args:
column_A: Value of the row of column_A.
column_B: Value of the row of column_B.
kwargs: dict with additional parameters.
Returns:
If the condition is met.
"""
margin = kwargs.get("margin") or None
if margin is None:
approx = 0
elif not isinstance(margin, (int, float, complex)):
raise TypeError(
f"margin must be one of int, float, complex."
f" Found: {margin} as {type(margin)}"
)
else:
approx = margin # type: ignore
return column_A <= column_B + approx # type: ignore
class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):
"""Expect values in column A to be lower or equal than column B.
Args:
column_A: The first column name.
column_B: The second column name.
margin: additional approximation to column B value.
Keyword Args:
allow_cross_type_comparisons: If True, allow
comparisons between types (e.g. integer and string).
Otherwise, attempting such comparisons will raise an exception.
ignore_row_if: "both_values_are_missing",
"either_value_is_missing", "neither" (default).
result_format: Which output mode to use:
`BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
include_config: If True (default), then include the expectation config
as part of the result object.
catch_exceptions: If True, then catch exceptions and
include them as part of the result object. Default: False.
meta: A JSON-serializable dictionary (nesting allowed)
that will be included in the output without modification.
Returns:
An ExpectationSuiteValidationResult.
"""
mostly: float = 1.0
ignore_row_if: str = "neither"
result_format: dict = {"result_format": "BASIC"}
include_config: bool = True
catch_exceptions: bool = False
margin: Any = None
column_A: Any = None
column_B: Any = None
examples = [
{
"dataset_name": "Test Dataset",
"data": [
{
"data": {
"a": [11, 22, 50],
"b": [10, 21, 100],
"c": [9, 21, 30],
},
"schemas": {
"spark": {
"a": "IntegerType",
"b": "IntegerType",
"c": "IntegerType",
}
},
}
],
"tests": [
{
"title": "negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_A": "a",
"column_B": "c",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["c"],
},
},
"out": {
"success": False,
"unexpected_index_list": [
{"c": 9, "a": 11},
{"c": 21, "a": 22},
{"c": 30, "a": 50},
],
},
},
{
"title": "positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_A": "a",
"column_B": "b",
"margin": 1,
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["a"],
},
},
"out": {
"success": True,
"unexpected_index_list": [],
},
},
],
},
]
map_metric = "column_pair_values.a_smaller_or_equal_than_b"
success_keys = (
"column_A",
"column_B",
"ignore_row_if",
"margin",
"mostly",
)
def _validate(
self,
metrics: Dict,
runtime_configuration: Optional[dict] = None,
execution_engine: Optional[ExecutionEngine] = None,
) -> Any:
"""Custom implementation of the GE _validate method.
This method is used on the tests to validate both the result
of the tests themselves and if the unexpected index list
is correctly generated.
The GE test logic does not do this validation, and thus
we need to make it manually.
Args:
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution Engine where the expectation was run.
Returns:
Dictionary with the result of the validation.
"""
validate_result(
self,
metrics,
)
return super()._validate(metrics, runtime_configuration, execution_engine)
"""Mandatory block of code. If it is removed the expectation will not be available."""
if __name__ == "__main__":
# test the custom expectation with the function `print_diagnostic_checklist()`
ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()
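# A minimal sketch of the kwargs for this expectation (column names and the
# tolerance are placeholders); "margin" is the optional extra allowance added
# to column B before the comparison.
_example_smaller_or_equal_args = {
    "column_A": "net_amount",
    "column_B": "gross_amount",
    "margin": 0.01,
}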
================================================
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b.py
================================================
"""Expectation to check if date column 'a' is greater or equal to date column 'b'."""
import datetime
from typing import Any, Dict, Optional
from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine
from great_expectations.expectations.expectation import ColumnPairMapExpectation
from great_expectations.expectations.metrics.map_metric_provider import (
ColumnPairMapMetricProvider,
column_pair_condition_partial,
)
from lakehouse_engine.utils.expectations_utils import validate_result
# This class defines a Metric to support your Expectation
class ColumnPairDateAToBeGreaterOrEqualToDateB(ColumnPairMapMetricProvider):
"""Asserts that date column 'A' is greater or equal to date column 'B'."""
# This is the id string that will be used to refer your metric.
condition_metric_name = "column_pair_values.date_a_greater_or_equal_to_date_b"
condition_domain_keys = (
"batch_id",
"table",
"column_A",
"column_B",
"ignore_row_if",
)
@column_pair_condition_partial(engine=SparkDFExecutionEngine)
def _spark(
self: ColumnPairMapMetricProvider,
column_A: Any,
column_B: Any,
**kwargs: dict,
) -> Any:
"""Implementation of the expectation's logic.
Args:
column_A: Value of the row of column_A.
column_B: Value of the row of column_B.
kwargs: dict with additional parameters.
Returns:
Boolean on the basis of condition.
"""
return (
(column_A.isNotNull()) & (column_B.isNotNull()) & (column_A >= column_B)
) # type: ignore
class ExpectColumnPairDateAToBeGreaterThanOrEqualToDateB(ColumnPairMapExpectation):
"""Expect values in date column A to be greater than or equal to date column B.
Args:
column_A: The first date column name.
column_B: The second date column name.
Keyword Args:
ignore_row_if: "both_values_are_missing",
"either_value_is_missing", "neither" (default).
result_format: Which output mode to use:
`BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
include_config: If True (default), then include the
expectation config as part of the result object.
catch_exceptions: If True, then catch exceptions and
include them as part of the result object. Default: False.
meta: A JSON-serializable dictionary (nesting allowed)
that will be included in the output without modification.
Returns:
An ExpectationSuiteValidationResult.
"""
mostly: float = 1.0
ignore_row_if: str = "neither"
result_format: dict = {"result_format": "BASIC"}
include_config: bool = True
catch_exceptions: bool = True
column_A: Any = None
column_B: Any = None
examples = [
{
"dataset_name": "Test Dataset",
"data": [
{
"data": {
"a": [
"2029-01-12",
"2024-11-21",
"2022-01-01",
],
"b": [
"2019-02-11",
"2014-12-22",
"2012-09-09",
],
"c": [
"2010-02-11",
"2015-12-22",
"2022-09-09",
],
},
"schemas": {
"spark": {
"a": "DateType",
"b": "DateType",
"c": "DateType",
}
},
}
],
"tests": [
{
"title": "positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_A": "a",
"column_B": "b",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["a", "b"],
},
},
"out": {"success": True, "unexpected_index_list": []},
},
{
"title": "negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_A": "b",
"column_B": "c",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["a"],
},
},
"out": {
"success": False,
"unexpected_index_list": [
{
"a": datetime.date(2024, 11, 21),
"b": datetime.date(2014, 12, 22),
"c": datetime.date(2015, 12, 22),
},
{
"a": datetime.date(2022, 1, 1),
"b": datetime.date(2012, 9, 9),
"c": datetime.date(2022, 9, 9),
},
],
},
},
],
}
]
map_metric = "column_pair_values.date_a_greater_or_equal_to_date_b"
success_keys = (
"column_A",
"column_B",
"ignore_row_if",
"mostly",
)
def _validate(
self,
metrics: Dict,
runtime_configuration: Optional[dict] = None,
execution_engine: Optional[ExecutionEngine] = None,
) -> Any:
"""Custom implementation of the GE _validate method.
This method is used on the tests to validate both the result
of the tests themselves and if the unexpected index list
is correctly generated.
The GE test logic does not do this validation, and thus
we need to make it manually.
Args:
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution Engine where the expectation was run.
Returns:
Dictionary with the result of the validation.
"""
validate_result(
self,
metrics,
)
return super()._validate(metrics, runtime_configuration, execution_engine)
"""Mandatory block of code. If it is removed the expectation will not be available."""
if __name__ == "__main__":
# test the custom expectation with the function `print_diagnostic_checklist()`
ExpectColumnPairDateAToBeGreaterThanOrEqualToDateB().print_diagnostic_checklist()
================================================
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_values_to_be_date_not_older_than.py
================================================
"""Expectation to check if column value is a date within a timeframe."""
import datetime
from datetime import timedelta
from typing import Any, Dict, Optional
from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine
from great_expectations.expectations.expectation import ColumnMapExpectation
from great_expectations.expectations.metrics import ColumnMapMetricProvider
from great_expectations.expectations.metrics.map_metric_provider import (
column_condition_partial,
)
from lakehouse_engine.utils.expectations_utils import validate_result
class ColumnValuesDateNotOlderThan(ColumnMapMetricProvider):
"""Asserts that column values are a date that isn't older than a given date."""
condition_metric_name = "column_values.date_is_not_older_than"
condition_domain_keys = (
"batch_id",
"table",
"column",
"ignore_row_if",
) # type: ignore
condition_value_keys = ("timeframe",)
@column_condition_partial(engine=SparkDFExecutionEngine)
def _spark(
self: ColumnMapMetricProvider,
column: Any,
**kwargs: dict,
) -> Any:
"""Implementation of the expectation's logic.
Since timedelta can only define an interval up to weeks, a month is defined
as 4 weeks and a year is defined as 52 weeks.
Args:
column: Name of column to validate.
kwargs: dict with additional parameters.
Returns:
If the condition is met.
"""
timeframe = kwargs.get("timeframe") or {}
weeks = (
timeframe.get("weeks", 0)
+ (timeframe.get("months", 0) * 4)
+ (timeframe.get("years", 0) * 52)
)
delta = timedelta(
days=timeframe.get("days", 0),
seconds=timeframe.get("seconds", 0),
microseconds=timeframe.get("microseconds", 0),
milliseconds=timeframe.get("milliseconds", 0),
minutes=timeframe.get("minutes", 0),
hours=timeframe.get("hours", 0),
weeks=weeks,
)
return delta > (datetime.datetime.now() - column)
class ExpectColumnValuesToBeDateNotOlderThan(ColumnMapExpectation):
"""Expect value in column to be date that is not older than a given time.
Since timedelta can only define an interval up to weeks, a month is defined
as 4 weeks and a year is defined as 52 weeks.
Args:
column: Name of column to validate
Note: Column must be of type Date, Timestamp or String (with Timestamp format).
Format: yyyy-MM-ddTHH:mm:ss
timeframe: dict with the definition of the timeframe.
kwargs: dict with additional parameters.
Keyword Args:
allow_cross_type_comparisons: If True, allow
comparisons between types (e.g. integer and string).
Otherwise, attempting such comparisons will raise an exception.
ignore_row_if: "both_values_are_missing",
"either_value_is_missing", "neither" (default).
result_format: Which output mode to use:
`BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
include_config: If True (default), then include the expectation config
as part of the result object.
catch_exceptions: If True, then catch exceptions and
include them as part of the result object. Default: False.
meta: A JSON-serializable dictionary (nesting allowed)
that will be included in the output without modification.
Returns:
An ExpectationSuiteValidationResult.
"""
mostly: float = 1.0
ignore_row_if: str = "neither"
result_format: dict = {"result_format": "BASIC"}
include_config: bool = True
catch_exceptions: bool = False
timeframe: Any = {}
column: Any = None
examples = [
{
"dataset_name": "Test Dataset",
"data": [
{
"data": {
"a": [
datetime.datetime(2023, 6, 1, 12, 0, 0),
datetime.datetime(2023, 6, 2, 12, 0, 0),
datetime.datetime(2023, 6, 3, 12, 0, 0),
],
"b": [
datetime.datetime(1800, 6, 1, 12, 0, 0),
datetime.datetime(2023, 6, 2, 12, 0, 0),
datetime.datetime(1800, 6, 3, 12, 0, 0),
],
}
}
],
"schemas": {"spark": {"a": "TimestampType", "b": "TimestampType"}},
"tests": [
{
"title": "positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column": "a",
"timeframe": {"years": 100},
"result_format": {
"result_format": "BASIC",
"unexpected_index_column_names": ["b"],
},
},
"out": {
"success": True,
"unexpected_index_list": [],
},
},
{
"title": "negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column": "b",
"timeframe": {"years": 100},
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["a"],
},
},
"out": {
"success": False,
"unexpected_index_list": [
{
"a": datetime.datetime(2023, 6, 1, 12, 0),
"b": datetime.datetime(1800, 6, 1, 12, 0),
},
{
"a": datetime.datetime(2023, 6, 3, 12, 0),
"b": datetime.datetime(1800, 6, 3, 12, 0),
},
],
},
},
],
},
]
map_metric = "column_values.date_is_not_older_than"
success_keys = ("column", "ignore_row_if", "timeframe", "mostly")
def _validate(
self,
metrics: Dict,
runtime_configuration: Optional[dict] = None,
execution_engine: Optional[ExecutionEngine] = None,
) -> Any:
"""Custom implementation of the GE _validate method.
This method is used on the tests to validate both the result
of the tests themselves and if the unexpected index list
is correctly generated.
The GE test logic does not do this validation, and thus
we need to make it manually.
Args:
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution Engine where the expectation was run.
Returns:
Dictionary with the result of the validation.
"""
validate_result(
self,
metrics,
)
return super()._validate(metrics, runtime_configuration, execution_engine)
"""Mandatory block of code. If it is removed the expectation will not be available."""
if __name__ == "__main__":
# test the custom expectation with the function `print_diagnostic_checklist()`
ExpectColumnValuesToBeDateNotOlderThan().print_diagnostic_checklist()
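# A minimal sketch of the kwargs for this expectation (column name and values
# are placeholders); "timeframe" accepts the timedelta-like keys handled above
# (days, seconds, microseconds, milliseconds, minutes, hours, weeks, months
# and years).
_example_freshness_args = {
    "column": "last_updated_at",
    "timeframe": {"days": 2, "hours": 12},
}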
================================================
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_values_to_not_be_null_or_empty_string.py
================================================
"""Expectation to check if column value is not null or empty string."""
from typing import Any, Dict, Optional
from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine
from great_expectations.expectations.expectation import ColumnMapExpectation
from great_expectations.expectations.metrics import ColumnMapMetricProvider
from great_expectations.expectations.metrics.map_metric_provider import (
column_condition_partial,
)
from lakehouse_engine.utils.expectations_utils import validate_result
class ColumnValuesNotNullOrEpmtyString(ColumnMapMetricProvider):
"""Asserts that column values are not null or empty string."""
condition_metric_name = "column_values.not_null_or_empty_string"
filter_column_isnull = False
condition_domain_keys = (
"batch_id",
"table",
"column",
"ignore_row_if",
) # type: ignore
condition_value_keys = ()
@column_condition_partial(engine=SparkDFExecutionEngine)
def _spark(
self: ColumnMapMetricProvider,
column: Any,
**kwargs: dict,
) -> Any:
"""Implementation of the expectation's logic.
Args:
column: Name of column to validate.
kwargs: dict with additional parameters.
Returns:
If the condition is met.
"""
return (column.isNotNull()) & (column != "")
class ExpectColumnValuesToNotBeNullOrEmptyString(ColumnMapExpectation):
"""Expect value in column to be not null or empty string.
Args:
column: Name of column to validate.
kwargs: dict with additional parameters.
Keyword Args:
allow_cross_type_comparisons: If True, allow
comparisons between types (e.g. integer and string).
Otherwise, attempting such comparisons will raise an exception.
ignore_row_if: "both_values_are_missing",
"either_value_is_missing", "neither" (default).
result_format: Which output mode to use:
`BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
include_config: If True (default), then include the expectation config
as part of the result object.
catch_exceptions: If True, then catch exceptions and
include them as part of the result object. Default: False.
meta: A JSON-serializable dictionary (nesting allowed)
that will be included in the output without modification.
Returns:
An ExpectationSuiteValidationResult.
"""
mostly: float = 1.0
ignore_row_if: str = "neither"
result_format: dict = {"result_format": "BASIC"}
include_config: bool = True
catch_exceptions: bool = False
column: Any = None
examples = [
{
"dataset_name": "Test Dataset",
"data": [
{
"data": {
"a": [
"4061622965678",
"4061622965679",
"4061622965680",
],
"b": [
"4061622965678",
"",
"4061622965680",
],
}
}
],
"schemas": {"spark": {"a": "StringType", "b": "StringType"}},
"tests": [
{
"title": "positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column": "a",
"result_format": {
"result_format": "BASIC",
"unexpected_index_column_names": ["b"],
},
},
"out": {
"success": True,
"unexpected_index_list": [],
},
},
{
"title": "negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column": "b",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["a"],
},
},
"out": {
"success": False,
"unexpected_index_list": [
{
"a": "4061622965679",
"b": "",
}
],
},
},
],
},
]
map_metric = "column_values.not_null_or_empty_string"
success_keys = ("column", "ignore_row_if", "mostly")
def _validate(
self,
metrics: Dict,
runtime_configuration: Optional[dict] = None,
execution_engine: Optional[ExecutionEngine] = None,
) -> Any:
"""Custom implementation of the GE _validate method.
This method is used on the tests to validate both the result
of the tests themselves and if the unexpected index list
is correctly generated.
The GE test logic does not do this validation, and thus
we need to make it manually.
Args:
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution Engine where the expectation was run.
Returns:
Dictionary with the result of the validation.
"""
validate_result(
self,
metrics,
)
return super()._validate(metrics, runtime_configuration, execution_engine)
"""Mandatory block of code. If it is removed the expectation will not be available."""
if __name__ == "__main__":
# test the custom expectation with the function `print_diagnostic_checklist()`
ExpectColumnValuesToNotBeNullOrEmptyString().print_diagnostic_checklist()
================================================
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c.py
================================================
"""Expectation to check if column 'a' equals 'b', or 'c'."""
from typing import Any, Dict, Literal, Optional
from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine
from great_expectations.expectations.expectation import MulticolumnMapExpectation
from great_expectations.expectations.metrics.map_metric_provider import (
MulticolumnMapMetricProvider,
multicolumn_condition_partial,
)
from lakehouse_engine.utils.expectations_utils import validate_result
class MulticolumnCustomMetric(MulticolumnMapMetricProvider):
"""Expectation metric definition.
This expectation asserts that column 'a' must equal column 'b' or column 'c'.
In addition, it is possible to validate that column 'b' or 'c' matches a regex.
"""
condition_metric_name = "multicolumn_values.column_a_must_equal_b_or_c"
condition_domain_keys = (
"batch_id",
"table",
"column_list",
"ignore_row_if",
)
condition_value_keys = ("validation_regex_b", "validation_regex_c")
@multicolumn_condition_partial(engine=SparkDFExecutionEngine)
def _spark(
self: MulticolumnMapMetricProvider, column_list: list, **kwargs: dict
) -> Any:
    """Implementation of the expectation's logic.
    Args:
        column_list: List with the values of the columns in column_list.
        kwargs: dict with additional parameters.
    Returns:
        If the condition is met.
    """
validation_regex_b = (
kwargs.get("validation_regex_b") if "validation_regex_b" in kwargs else ".*"
)
validation_regex_c = (
kwargs.get("validation_regex_c") if "validation_regex_c" in kwargs else ".*"
)
return (column_list[0].isNotNull()) & (
(
column_list[1].isNotNull()
& (column_list[1].rlike(validation_regex_b))
& (column_list[0] == column_list[1])
)
| (
(column_list[1].isNull())
& (column_list[2].rlike(validation_regex_c))
& (column_list[0] == column_list[2])
)
)
class ExpectMulticolumnColumnAMustEqualBOrC(MulticolumnMapExpectation):
"""Expect that the column 'a' is equal to 'b' when this is not empty; otherwise 'a' must be equal to 'c'.
Args:
column_list: The column names to evaluate.
Keyword Args:
ignore_row_if: default to "never".
result_format: Which output mode to use:
`BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
Default set to `BASIC`.
include_config: If True, then include the expectation
config as part of the result object.
Default set to True.
catch_exceptions: If True, then catch exceptions
and include them as part of the result object.
Default set to False.
Returns:
An ExpectationSuiteValidationResult.
""" # noqa: E501
ignore_row_if: Literal[
"all_values_are_missing", "any_value_is_missing", "never"
] = "never"
result_format: dict = {"result_format": "BASIC"}
include_config: bool = True
catch_exceptions: bool = False
mostly: float = 1.0
column_list: Any = None
validation_regex_c: Any = None
examples = [
{
"dataset_name": "Test Dataset",
"data": [
{
"data": {
"a": ["d001", "1000", "1001"],
"b": [None, "1000", "1001"],
"c": ["d001", "d002", "d002"],
"d": ["d001", "d002", "1001"],
},
"schemas": {
"spark": {
"a": "StringType",
"b": "StringType",
"c": "StringType",
"d": "StringType",
}
},
}
],
"tests": [
{
"title": "negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_list": ["d", "b", "c"],
"validation_regex_c": "d[0-9]{3}$",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["d", "b", "c"],
},
},
"out": {
"success": False,
"unexpected_index_list": [
{
"d": "d002",
"b": "1000",
"c": "d002",
}
],
},
},
{
"title": "positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_list": ["a", "b", "c"],
"validation_regex_c": "d[0-9]{3}$",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["a", "b", "c"],
},
},
"out": {"success": True},
},
],
},
]
map_metric = "multicolumn_values.column_a_must_equal_b_or_c"
success_keys = (
"validation_regex_b",
"validation_regex_c",
"mostly",
) # type: ignore
def _validate(
self,
metrics: Dict,
runtime_configuration: Optional[dict] = None,
execution_engine: Optional[ExecutionEngine] = None,
) -> Any:
"""Custom implementation of the GE _validate method.
This method is used on the tests to validate both the result
of the tests themselves and if the unexpected index list
is correctly generated.
The GE test logic does not do this validation, and thus
we need to make it manually.
Args:
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution Engine where the expectation was run.
Returns:
Dictionary with the result of the validation.
"""
validate_result(
self,
metrics,
)
return super()._validate(metrics, runtime_configuration, execution_engine)
if __name__ == "__main__":
# test the custom expectation with the function `print_diagnostic_checklist()`
ExpectMulticolumnColumnAMustEqualBOrC().print_diagnostic_checklist()
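# A minimal sketch of the kwargs for this expectation (column names and the
# regex are placeholders); column_list is ordered as [a, b, c] and the
# validation_regex_b/validation_regex_c kwargs are optional (default ".*").
_example_a_equals_b_or_c_args = {
    "column_list": ["posted_id", "invoice_id", "delivery_id"],
    "validation_regex_c": "d[0-9]{3}$",
}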
================================================
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_queried_column_agg_value_to_be.py
================================================
"""Expectation to check if aggregated column satisfy the condition."""
from typing import Any, Dict, Optional
from great_expectations.execution_engine import ExecutionEngine
from great_expectations.expectations.expectation import (
ExpectationValidationResult,
QueryExpectation,
)
from great_expectations.expectations.expectation_configuration import (
ExpectationConfiguration,
)
class ExpectQueriedColumnAggValueToBe(QueryExpectation):
"""Expect agg of column to satisfy the condition specified.
Args:
template_dict: dict with the following keys:
- column (column to check sum).
- group_column_list (group by column names to be listed).
- condition (how to validate the aggregated value eg: between,
greater, lesser).
- max_value (maximum allowed value).
- min_value (minimum allowed value).
- agg_type (sum/count/max/min).
"""
metric_dependencies = ("query.template_values",)
query_temp = """
SELECT {group_column_list}, {agg_type}({column})
FROM {batch}
GROUP BY {group_column_list}
"""
include_config: bool = True
mostly: float = 1.0
result_format: dict = {"result_format": "BASIC"}
catch_exceptions: bool = False
meta: Any = None
query: str = query_temp
template_dict: Any = None
success_keys = ("template_dict", "query")
condition_domain_keys = (
"query",
"template_dict",
"batch_id",
"row_condition",
"condition_parser",
)
def validate_configuration(
self, configuration: Optional[ExpectationConfiguration] = None
) -> None:
"""Validates that a configuration has been set.
Args:
configuration (OPTIONAL[ExpectationConfiguration]):
An optional Expectation Configuration entry.
Returns:
None. Raises InvalidExpectationConfigurationError
"""
super().validate_configuration(configuration)
@staticmethod
def _validate_between(
x: str, y: int, expected_max_value: int, expected_min_value: int
) -> dict:
"""Method to check whether value satisfy the between condition.
Args:
x: contains key of dict(query_result).
y: contains value of dict(query_result).
expected_max_value: max value passed.
expected_min_value: min value passed.
Returns:
dict with the results after being validated.
"""
if expected_min_value <= y <= expected_max_value:
return {
"info": f"Value is within range\
{expected_min_value} and {expected_max_value}",
"success": True,
}
else:
return {
"success": False,
"result": {
"info": f"Value not in range\
{expected_min_value} and {expected_max_value}",
"observed_value": (x, y),
},
}
@staticmethod
def _validate_lesser(x: str, y: int, expected_max_value: int) -> dict:
"""Method to check whether value satisfy the less condition.
Args:
x: contains key of dict(query_result).
y: contains value of dict(query_result).
expected_max_value: max value passed.
Returns:
dict with the results after being validated.
"""
if y < expected_max_value:
return {
"info": f"Value is lesser than {expected_max_value}",
"success": True,
}
else:
return {
"success": False,
"result": {
"info": f"Value is greater than {expected_max_value}",
"observed_value": (x, y),
},
}
@staticmethod
def _validate_greater(x: str, y: int, expected_min_value: int) -> dict:
"""Method to check whether value satisfy the greater condition.
Args:
x: contains key of dict(query_result).
y: contains value of dict(query_result).
expected_min_value: min value passed.
Returns:
dict with the results after being validated.
"""
if y > expected_min_value:
return {
"info": f"Value is greater than {expected_min_value}",
"success": True,
}
else:
return {
"success": False,
"result": {
"info": f"Value is less than {expected_min_value}",
"observed_value": (x, y),
},
}
def _validate_condition(self, query_result: dict, template_dict: dict) -> dict:
"""Method to check whether value satisfy the expected result.
Args:
query_result: contains dict of key and value.
template_dict: contains dict of input provided.
Returns:
dict with the results after being validated.
"""
result: Dict[Any, Any] = {}
for x, y in query_result.items():
condition_check = template_dict["condition"]
if condition_check == "between":
_max = template_dict["max_value"]
_min = template_dict["min_value"]
result = self._validate_between(x, y, _max, _min)
elif condition_check == "lesser":
_max = template_dict["max_value"]
result = self._validate_lesser(x, y, _max)
else:
_min = template_dict["min_value"]
result = self._validate_greater(x, y, _min)
return result
@staticmethod
def _generate_dict(query_result: list) -> dict:
"""Generate a dict from a list of dicts and merge the group by columns values.
Args:
query_result: contains list of dict values obtained from query.
Returns:
Dict
Example:
input: [dict_values(['Male', 25, 3500]), dict_values(['Female', 25, 6200]),
dict_values(['Female', 20, 3500]), dict_values(['Male', 20, 6900])].
output: {'Male|25': 3500, 'Female|25': 6200,
'Female|20': 3500, 'Male|20': 6900}.
"""
intermediate_list = []
final_list = []
for i in range(len(query_result)):
intermediate_list.append(list(query_result[i]))
for element in intermediate_list:
if type(element) is list:
output = "|".join(map(str, element))
key = "|".join(map(str, element[0:-1]))
value = output.replace(key + "|", "")
final_list.append(key)
final_list.append(value)
new_result = {
final_list[i]: int(final_list[i + 1]) for i in range(0, len(final_list), 2)
}
return new_result
def _validate(
self,
metrics: dict,
runtime_configuration: Optional[dict] = None,
execution_engine: Optional[ExecutionEngine] = None,
) -> ExpectationValidationResult | dict:
"""Implementation of the GE _validate method.
This method is used on the tests to validate the result
of the query output.
Args:
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution Engine where the expectation was run.
Returns:
Dictionary with the result of the validation.
"""
query_result = metrics.get("query.template_values")
query_result = [element.values() for element in query_result]
query_result = self._generate_dict(query_result)
template_dict = self._validate_template_dict(self)
output = self._validate_condition(query_result, template_dict)
return output
@staticmethod
def _validate_template_dict(self: Any) -> dict:
"""Validate the template dict.
Returns:
Dict. Raises TypeError and KeyError
"""
template_dict = self.template_dict
if not isinstance(template_dict, dict):
raise TypeError("template_dict must be supplied as a dict")
if not all(
[
"column" in template_dict,
"group_column_list" in template_dict,
"agg_type" in template_dict,
"condition" in template_dict,
]
):
raise KeyError(
"The following keys have to be in the \
template dict: column, group_column_list, condition, agg_type"
)
return template_dict
examples = [
{
"dataset_name": "Test Dataset",
"data": [
{
"data": {
"ID": [1, 2, 3, 4, 5, 6],
"Names": [
"Ramesh",
"Nasser",
"Jessica",
"Komal",
"Jude",
"Muffy",
],
"Age": [25, 25, 25, 20, 20, 25],
"Gender": [
"Male",
"Male",
"Female",
"Female",
"Male",
"Female",
],
"Salary": [1000, 2500, 5000, 3500, 6900, 1200],
},
"schemas": {
"spark": {
"ID": "IntegerType",
"Names": "StringType",
"Age": "IntegerType",
"Gender": "StringType",
"Salary": "IntegerType",
}
},
}
],
"tests": [
{
"title": "basic_positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"template_dict": {
"column": "Salary",
"group_column_list": "Gender",
"agg_type": "sum",
"condition": "greater",
"min_value": 2000,
},
"result_format": {
"result_format": "COMPLETE",
},
},
"out": {"success": True},
"only_for": ["spark"],
},
{
"title": "basic_positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"template_dict": {
"column": "Salary",
"group_column_list": "Gender,Age",
"agg_type": "sum",
"condition": "between",
"max_value": 7000,
"min_value": 2000,
},
"result_format": {
"result_format": "COMPLETE",
},
},
"out": {"success": True},
"only_for": ["spark"],
},
{
"title": "basic_positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"template_dict": {
"column": "Salary",
"group_column_list": "Age",
"agg_type": "max",
"condition": "lesser",
"max_value": 10000,
},
"result_format": {
"result_format": "COMPLETE",
},
},
"out": {"success": True},
"only_for": ["spark"],
},
{
"title": "basic_negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"template_dict": {
"column": "Salary",
"group_column_list": "Gender",
"agg_type": "count",
"condition": "greater",
"min_value": 4,
},
"result_format": {
"result_format": "COMPLETE",
},
},
"out": {"success": False},
"only_for": ["sqlite", "spark"],
},
{
"title": "basic_negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"template_dict": {
"column": "Salary",
"group_column_list": "Gender,Age",
"agg_type": "sum",
"condition": "between",
"max_value": 2000,
"min_value": 1000,
},
"result_format": {
"result_format": "COMPLETE",
},
},
"out": {"success": False},
"only_for": ["spark"],
},
],
},
]
library_metadata = {
"tags": ["query-based"],
}
if __name__ == "__main__":
ExpectQueriedColumnAggValueToBe().print_diagnostic_checklist()
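# A minimal sketch of the kwargs for this query expectation (values mirror the
# in-gallery examples above); template_dict must contain at least column,
# group_column_list, agg_type and condition, plus min_value/max_value depending
# on the condition.
_example_agg_value_args = {
    "template_dict": {
        "column": "Salary",
        "group_column_list": "Gender,Age",
        "agg_type": "sum",
        "condition": "between",
        "min_value": 2000,
        "max_value": 7000,
    }
}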
================================================
FILE: lakehouse_engine/dq_processors/dq_factory.py
================================================
"""Module containing the class definition of the Data Quality Factory."""
import importlib
import json
import random
from copy import deepcopy
from datetime import datetime, timezone
from json import dumps, loads
from typing import Optional, Tuple
import great_expectations as gx
from great_expectations import ExpectationSuite
from great_expectations.checkpoint import CheckpointResult
from great_expectations.core.batch_definition import BatchDefinition
from great_expectations.core.run_identifier import RunIdentifier
from great_expectations.data_context import EphemeralDataContext
from great_expectations.data_context.types.base import (
DataContextConfig,
FilesystemStoreBackendDefaults,
S3StoreBackendDefaults,
)
from great_expectations.expectations.expectation_configuration import (
ExpectationConfiguration,
)
from pyspark.sql import DataFrame
from pyspark.sql.functions import (
col,
dayofmonth,
explode,
from_json,
lit,
month,
schema_of_json,
struct,
to_json,
to_timestamp,
transform,
year,
)
from pyspark.sql.types import FloatType, StringType
from lakehouse_engine.core.definitions import (
DQDefaults,
DQFunctionSpec,
DQResultFormat,
DQSpec,
DQType,
OutputSpec,
WriteType,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.table_manager import TableManager
from lakehouse_engine.dq_processors.exceptions import DQValidationsFailedException
from lakehouse_engine.dq_processors.validator import Validator
from lakehouse_engine.io.writer_factory import WriterFactory
from lakehouse_engine.utils.logging_handler import LoggingHandler
class DQFactory(object):
"""Class for the Data Quality Factory."""
_LOGGER = LoggingHandler(__name__).get_logger()
_TIMESTAMP = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
@classmethod
def _add_critical_function_tag(cls, args: dict) -> dict:
"""Add tags to function considered critical.
Adds a tag to each of the functions passed on the dq_specs to
denote that they are critical_functions. This means that if any
of them fails, the dq process will fail, even if the threshold
is not surpassed.
This is done by adding a tag to the meta dictionary of the
expectation configuration.
Args:
args: arguments passed on the dq_spec
Returns:
A dictionary with the args with the critical function tag.
"""
if "meta" in args.keys():
meta = args["meta"]
if isinstance(meta["notes"], str):
meta["notes"] = meta["notes"] + " **Critical function**."
else:
meta["notes"]["content"] = (
meta["notes"]["content"] + " **Critical function**."
)
args["meta"] = meta
return args
else:
args["meta"] = {
"notes": {
"format": "markdown",
"content": "**Critical function**.",
}
}
return args
@classmethod
def _configure_checkpoint(
cls,
context: EphemeralDataContext,
dataframe_bd: BatchDefinition,
suite: ExpectationSuite,
dq_spec: DQSpec,
data: DataFrame,
checkpoint_run_time: str,
) -> Tuple[CheckpointResult, Optional[list]]:
"""Create and configure the validation checkpoint.
Creates and configures a validation definition based on the suite
and then creates, configures and runs the checkpoint returning,
at the end, the result as well as the primary key from the dq_specs.
Args:
context: The data context from GX.
dataframe_bd: The batch definition of the dataframe to validate.
suite: A group of expectations to validate.
dq_spec: The arguments directly passed from the acon in the dq_spec key.
data: Input dataframe to run the dq process on.
checkpoint_run_time: A timestamp string (with sub-second precision)
    used to uniquely identify the checkpoint run.
Returns:
A tuple with the result from the checkpoint run and the primary key
from the dq_spec.
"""
validation_definition = context.validation_definitions.add(
gx.ValidationDefinition(
data=dataframe_bd,
suite=suite,
name=f"{dq_spec.spec_id}-{dq_spec.input_id}"
f"-validation-{checkpoint_run_time}",
)
)
source_pk = cls._get_unexpected_rows_pk(dq_spec)
result_format: dict = {
"result_format": DQResultFormat.COMPLETE.value,
}
# If the source primary key is defined, we add it to the result format
# so that it is included in the results from GX.
if source_pk:
result_format = {
**result_format,
"unexpected_index_column_names": source_pk,
}
checkpoint = context.checkpoints.add(
gx.Checkpoint(
name=f"{dq_spec.spec_id}-{dq_spec.input_id}"
f"-checkpoint-{checkpoint_run_time}"
f"-{str(random.randint(1, 100))}", # nosec B311
validation_definitions=[validation_definition],
actions=[],
result_format=result_format,
)
)
result = checkpoint.run(
batch_parameters={"dataframe": data},
run_id=RunIdentifier(
run_name=f"{checkpoint_run_time}"
f"-{dq_spec.spec_id}-{dq_spec.input_id}"
f"-{str(random.randint(1, 100))}-checkpoint", # nosec B311
run_time=datetime.strptime(checkpoint_run_time, "%Y%m%d-%H%M%S%f"),
),
)
return result, source_pk
@classmethod
def _check_row_condition(
cls, dq_spec: DQSpec, dq_function: DQFunctionSpec
) -> DQFunctionSpec:
"""Enables/disables row_conditions.
Checks for row_codition arguments in the definition of expectations
and enables/disables their usage based on the enable_row_condition
argument. row_conditions allow you to filter the rows that are
processed by the DQ functions. This is useful when you want to run the
DQ functions only on a subset of the data.
Args:
dq_spec: The arguments directly passed from the acon in the dq_spec key
dq_function: A DQFunctionSpec with the definition of a dq function.
Returns:
The definition of a dq_function with or without the row_condition key.
"""
if (
not dq_spec.enable_row_condition
and "row_condition" in dq_function.args.keys()
):
del dq_function.args["row_condition"]
cls._LOGGER.info(
f"Disabling row_condition for function: {dq_function.function}"
)
return dq_function
@classmethod
def _add_suite(
cls, context: EphemeralDataContext, dq_spec: DQSpec, checkpoint_run_time: str
) -> ExpectationSuite:
"""Create and configure an ExpectationSuite.
Creates and configures an expectation suite, adding the dq functions
passed on the dq_spec as well as the dq_critical_functions also passed
on the dq_spec, if they exist. Finally return the configured suite.
Args:
context: The data context from GX
dq_spec: The arguments directly passed from the acon in the dq_spec key
checkpoint_run_time: A timestamp string (with sub-second precision).
Returns:
A configured ExpectationSuite object.
"""
expectation_suite_name = (
dq_spec.expectation_suite_name
if dq_spec.expectation_suite_name
else f"{dq_spec.spec_id}-{dq_spec.input_id}"
f"-{dq_spec.dq_type}-{checkpoint_run_time}"
)
suite = context.suites.add(gx.ExpectationSuite(name=expectation_suite_name))
for dq_function in dq_spec.dq_functions:
dq_function = cls._check_row_condition(dq_spec, dq_function)
suite.add_expectation_configuration(
ExpectationConfiguration(
type=dq_function.function,
kwargs=dq_function.args if dq_function.args else {},
meta=dq_function.args.get("meta") if dq_function.args else {},
)
)
if dq_spec.critical_functions:
for critical_function in dq_spec.critical_functions:
meta_args = cls._add_critical_function_tag(critical_function.args)
suite.add_expectation_configuration(
ExpectationConfiguration(
type=critical_function.function,
kwargs=(
critical_function.args if critical_function.args else {}
),
meta=meta_args,
)
)
suite.save()
return suite
@classmethod
def _check_expectation_result(cls, result_dict: dict) -> dict:
"""Add an empty dict if the unexpected_index_list key is empty.
Checks whether the unexpected_index_list key has any element; if it does
not, an empty dictionary is assigned to the result key. This is needed to
handle edge cases introduced by the GX update to version 1.3.13, where the
unexpected_index_list would sometimes exist even for successful
validation runs.
Args:
result_dict: A dict with the result_dict from a checkpoint run.
Returns:
The configured result_dict
"""
for expectation_result in result_dict["results"]:
if "unexpected_index_list" in expectation_result["result"].keys():
if len(expectation_result["result"]["unexpected_index_list"]) < 1:
expectation_result["result"] = {}
return result_dict
@classmethod
def run_dq_process(cls, dq_spec: DQSpec, data: DataFrame) -> DataFrame:
"""Run the specified data quality process on a dataframe.
Based on the dq_specs we apply the defined expectations on top of the dataframe
in order to apply the necessary validations and then output the result of
the data quality process.
The logic of the function is as follows:
1. Import the custom expectations defined in the engine.
2. Create the context based on the dq_spec. - The context is the entry point for
GX; an ephemeral context means that it does not store/load the
configuration of the environment in a configuration file.
3. Add the data source to the context. - This is the data source that will be
used to run the dq process, in our case Spark.
4. Create the dataframe asset and batch definition. - The asset represents the
data where the expectations are applied, and the batch definition defines
how the data should be split; in the case of dataframes it is always
the whole dataframe.
5. Create the expectation suite. - This is the group of expectations that will
be applied to the data.
6. Create the checkpoint and run it. - The checkpoint is the object that will
run the expectations on the data and return the results.
7. Transform the results and write them to the result sink. - The results are
transformed to a more readable format and then written to the result sink.
8. Log the results and raise an exception if needed. - The results are logged
and if there are any failed expectations the process will raise an exception
based on the dq_spec.
9. Tag the source data if needed. - If the dq_spec has the tag_source_data
argument set to True, the source data will be tagged with the dq results.
Args:
dq_spec: data quality specification.
data: input dataframe to run the dq process on.
Returns:
The DataFrame containing the results of the DQ process.
"""
# Creating the context
if dq_spec.dq_type == "validator" or dq_spec.dq_type == "prisma":
for expectation in DQDefaults.CUSTOM_EXPECTATION_LIST.value:
importlib.__import__(
"lakehouse_engine.dq_processors.custom_expectations." + expectation
)
context = gx.get_context(
cls._get_data_context_config(dq_spec), mode="ephemeral"
)
# Adding data source to context
dataframe_data_source = context.data_sources.add_spark(
name=f"{dq_spec.spec_id}-{dq_spec.input_id}-datasource",
persist=False,
)
dataframe_asset = dataframe_data_source.add_dataframe_asset(
name=f"{dq_spec.spec_id}-{dq_spec.input_id}-asset"
)
dataframe_bd = dataframe_asset.add_batch_definition_whole_dataframe(
name=f"{dq_spec.spec_id}-{dq_spec.input_id}-batch"
)
checkpoint_run_time = datetime.today().strftime("%Y%m%d-%H%M%S%f")
suite = cls._add_suite(context, dq_spec, checkpoint_run_time)
result, source_pk = cls._configure_checkpoint(
context, dataframe_bd, suite, dq_spec, data, checkpoint_run_time
)
expectation_result_key = list(result.run_results.keys())[0]
result_dict = result.run_results[expectation_result_key].to_json_dict()
result_dict = cls._check_expectation_result(result_dict)
data = cls._transform_checkpoint_results(
data, source_pk, result_dict, dq_spec
)
# Processed keys are only added for the PRISMA dq type
# because they are being used to calculate the good
# records that were processed in a run.
if dq_spec.dq_type == DQType.PRISMA.value:
keys = data.select(
[col(c).cast(StringType()).alias(c) for c in source_pk]
)
keys = keys.withColumn(
"run_name", lit(result_dict["meta"]["run_id"]["run_name"])
)
cls._write_to_location(dq_spec, keys, processed_keys=True)
else:
raise TypeError(
f"Type of Data Quality '{dq_spec.dq_type}' is not supported."
)
return data
@classmethod
def _check_critical_functions_tags(cls, failed_expectations: dict) -> list:
critical_failure = []
for expectation in failed_expectations.values():
meta = expectation["meta"]
if meta and (
("notes" in meta.keys() and "Critical function" in meta["notes"])
or (
"content" in meta["notes"].keys()
and "Critical function" in meta["notes"]["content"]
)
):
critical_failure.append(expectation["type"])
return critical_failure
@classmethod
def _check_chunk_usage(cls, results_dict: dict, dq_spec: DQSpec) -> bool:
"""Check if the results should be split into chunks.
If the size of the results dictionary is too big, we will split it into
smaller chunks. This is needed to avoid memory issues when processing
large datasets.
Args:
results_dict: The results dictionary to be checked.
dq_spec: data quality specification.
Returns:
True if the results dictionary is too big, False otherwise.
"""
for ele in results_dict["results"]:
if (
"unexpected_index_list" in ele["result"].keys()
and len(ele["result"]["unexpected_index_list"])
> dq_spec.result_sink_chunk_size
):
return True
return False
@classmethod
def _explode_results(
cls,
df: DataFrame,
dq_spec: DQSpec,
) -> DataFrame:
"""Transform dq results dataframe exploding a set of columns.
Args:
df: dataframe with dq results to be exploded.
dq_spec: data quality specification.
"""
df = df.withColumn("validation_results", explode("results")).withColumn(
"source", lit(dq_spec.source)
)
if (
not df.schema["validation_results"]
.dataType.fieldNames() # type: ignore
.__contains__("result")
):
df = df.withColumn(
"validation_results",
col("validation_results").withField(
"result", struct(lit(None).alias("observed_value"))
),
)
kwargs_columns = [
f"validation_results.expectation_config.kwargs.{col_name}"
for col_name in df.select(
"validation_results.expectation_config.kwargs.*"
).columns
]
cols_to_cast = ["max_value", "min_value", "sum_total"]
for col_name in kwargs_columns:
if col_name.split(".")[-1] in cols_to_cast:
df = df.withColumn(
"validation_results",
col("validation_results").withField(
"expectation_config",
col("validation_results.expectation_config").withField(
"kwargs",
col(
"validation_results.expectation_config.kwargs"
).withField(
col_name.split(".")[-1],
col(col_name).cast(FloatType()),
),
),
),
)
new_columns = [
"validation_results.expectation_config.kwargs.*",
"validation_results.expectation_config.type as expectation_type",
"validation_results.success as expectation_success",
"validation_results.exception_info",
"statistics.*",
] + dq_spec.result_sink_extra_columns
df_exploded = df.selectExpr(*df.columns, *new_columns).drop(
*[c.replace(".*", "").split(" as")[0] for c in new_columns]
)
df_exploded = df_exploded.drop(
"statistics", "id", "results", "meta", "suite_name"
)
if (
"meta"
in df_exploded.select("validation_results.expectation_config.*").columns
):
df_exploded = df_exploded.withColumn(
"meta", col("validation_results.expectation_config.meta")
)
schema = df_exploded.schema.simpleString()
if (
dq_spec.gx_result_format.upper() == DQResultFormat.COMPLETE.value
and "unexpected_index_list" in schema
):
df_exploded = df_exploded.withColumn(
"unexpected_index_list",
transform(
col("validation_results.result.unexpected_index_list"),
lambda y: y.withField("run_success", lit(False)),
),
)
if "observed_value" in schema:
df_exploded = df_exploded.withColumn(
"observed_value", col("validation_results.result.observed_value")
)
return (
df_exploded.withColumn("run_time_year", year(to_timestamp("run_time")))
.withColumn("run_time_month", month(to_timestamp("run_time")))
.withColumn("run_time_day", dayofmonth(to_timestamp("run_time")))
.withColumn(
"kwargs", to_json(col("validation_results.expectation_config.kwargs"))
)
.withColumn("validation_results", to_json(col("validation_results")))
)
@classmethod
def _get_data_context_config(cls, dq_spec: DQSpec) -> DataContextConfig:
"""Get the configuration of the data context.
Based on the configuration it is possible to define the backend to be
the file system (e.g. local file system) or S3, meaning that the DQ artefacts
will be stored according to this configuration.
Args:
dq_spec: data quality process specification.
Returns:
The DataContextConfig object configuration.
"""
store_backend: FilesystemStoreBackendDefaults | S3StoreBackendDefaults
if dq_spec.store_backend == DQDefaults.FILE_SYSTEM_STORE.value:
store_backend = FilesystemStoreBackendDefaults(
root_directory=dq_spec.local_fs_root_dir
)
elif dq_spec.store_backend == DQDefaults.FILE_SYSTEM_S3_STORE.value:
store_backend = S3StoreBackendDefaults(
default_bucket_name=dq_spec.bucket,
validation_results_store_prefix=dq_spec.validations_store_prefix,
checkpoint_store_prefix=dq_spec.checkpoint_store_prefix,
expectations_store_prefix=dq_spec.expectations_store_prefix,
)
return DataContextConfig(
store_backend_defaults=store_backend,
analytics_enabled=False,
)
@classmethod
def _get_data_source_defaults(cls, dq_spec: DQSpec) -> dict:
"""Get the configuration for a datasource.
Args:
dq_spec: data quality specification.
Returns:
The python dictionary with the datasource configuration.
"""
return {
"name": f"{dq_spec.spec_id}-{dq_spec.input_id}-datasource",
"class_name": DQDefaults.DATASOURCE_CLASS_NAME.value,
"execution_engine": {
"class_name": DQDefaults.DATASOURCE_EXECUTION_ENGINE.value,
"persist": False,
},
"data_connectors": {
f"{dq_spec.spec_id}-{dq_spec.input_id}-data_connector": {
"module_name": DQDefaults.DATA_CONNECTORS_MODULE_NAME.value,
"class_name": DQDefaults.DATA_CONNECTORS_CLASS_NAME.value,
"assets": {
(
dq_spec.data_asset_name
if dq_spec.data_asset_name
else f"{dq_spec.spec_id}-{dq_spec.input_id}"
): {"batch_identifiers": DQDefaults.DQ_BATCH_IDENTIFIERS.value}
},
}
},
}
@classmethod
def _get_failed_expectations(
cls,
results: dict,
dq_spec: DQSpec,
failed_expectations: dict,
evaluated_expectations: dict,
is_final_chunk: bool,
) -> Tuple[dict, dict]:
"""Get the failed expectations of a Checkpoint result.
Args:
results: the results of the DQ process.
dq_spec: data quality specification.
failed_expectations: dict of failed expectations.
evaluated_expectations: dict of evaluated expectations.
is_final_chunk: boolean indicating if this is the final chunk.
Returns: a tuple with a dict of failed expectations
and a dict of evaluated expectations.
"""
expectations_results = results["results"]
for result in expectations_results:
evaluated_expectations[result["expectation_config"]["id"]] = result[
"expectation_config"
]
if not result["success"]:
failed_expectations[result["expectation_config"]["id"]] = result[
"expectation_config"
]
if result["exception_info"]["raised_exception"]:
cls._LOGGER.error(
f"""The expectation {str(result["expectation_config"])}
raised the following exception:
{result["exception_info"]["exception_message"]}"""
)
cls._LOGGER.error(
f"{len(failed_expectations)} out of {len(evaluated_expectations)} "
f"Data Quality Expectation(s) have failed! Failed Expectations: "
f"{failed_expectations}"
)
percentage_failure = 1 - (results["statistics"]["success_percent"] / 100)
if (
dq_spec.max_percentage_failure is not None
and dq_spec.max_percentage_failure < percentage_failure
and is_final_chunk
):
raise DQValidationsFailedException(
f"Max error threshold is being surpassed! "
f"Expected: {dq_spec.max_percentage_failure} "
f"Got: {percentage_failure}"
)
return failed_expectations, evaluated_expectations
@classmethod
def _get_unexpected_rows_pk(cls, dq_spec: DQSpec) -> Optional[list]:
"""Get primary key for using on rows failing DQ validations.
Args:
dq_spec: data quality specification.
Returns: the list of columns that are part of the primary key.
"""
if dq_spec.unexpected_rows_pk:
return dq_spec.unexpected_rows_pk
elif dq_spec.tbl_to_derive_pk:
return TableManager(
{"function": "get_tbl_pk", "table_or_view": dq_spec.tbl_to_derive_pk}
).get_tbl_pk()
elif dq_spec.tag_source_data:
raise ValueError(
"You need to provide either the argument "
"'unexpected_rows_pk' or 'tbl_to_derive_pk'."
)
else:
return None
@classmethod
def _log_or_fail(
cls,
results: dict,
dq_spec: DQSpec,
failed_expectations: dict,
evaluated_expectations: dict,
is_final_chunk: bool,
) -> Tuple[dict, dict]:
"""Log the execution of the Data Quality process.
Args:
results: the results of the DQ process.
dq_spec: data quality specification.
failed_expectations: dict of failed expectations.
evaluated_expectations: dict of evaluated expectations.
is_final_chunk: boolean indicating if this is the final chunk.
Returns: a tuple with a dict of failed expectations
and a dict of evaluated expectations.
"""
if results["success"]:
cls._LOGGER.info(
"The data passed all the expectations defined. Everything looks good!"
)
else:
failed_expectations, evaluated_expectations = cls._get_failed_expectations(
results,
dq_spec,
failed_expectations,
evaluated_expectations,
is_final_chunk,
)
if dq_spec.critical_functions and is_final_chunk:
critical_failure = cls._check_critical_functions_tags(failed_expectations)
if critical_failure:
raise DQValidationsFailedException(
f"Data Quality Validations Failed, the following critical "
f"expectations failed: {critical_failure}."
)
if dq_spec.fail_on_error and is_final_chunk and failed_expectations:
raise DQValidationsFailedException("Data Quality Validations Failed!")
return failed_expectations, evaluated_expectations
@classmethod
def _transform_checkpoint_results(
cls,
data: DataFrame,
source_pk: list,
checkpoint_results: dict,
dq_spec: DQSpec,
) -> DataFrame:
"""Transforms the checkpoint results and creates new entries.
All the items of the dictionary are cast to a JSON-like format.
After that the dictionary is converted into a dataframe.
Args:
data: input dataframe to run the dq process on.
source_pk: list of columns that are part of the primary key.
checkpoint_results: dict with results of the checkpoint run.
dq_spec: data quality specification.
Returns:
Transformed results dataframe.
"""
results_dict = loads(dumps(checkpoint_results))
# Check the size of the results dictionary, if it is too big
# we will split it into smaller chunks.
results_dict_list = cls._generate_chunks(results_dict, dq_spec)
index = 0
failed_expectations: dict = {}
evaluated_expectations: dict = {}
# The processed chunk is removed from the list of results
# so the memory is freed as soon as possible.
while index < len(results_dict_list):
is_final_chunk = len(results_dict_list) == 1
data, failed_expectations, evaluated_expectations = cls._process_chunk(
dq_spec,
source_pk,
results_dict_list[index],
data,
failed_expectations,
evaluated_expectations,
is_final_chunk,
)
del results_dict_list[index]
return data
@classmethod
def _process_chunk(
cls,
dq_spec: DQSpec,
source_pk: list[str],
ele: dict,
data: DataFrame,
failed_expectations: dict,
evaluated_expectations: dict,
is_final_chunk: bool,
) -> Tuple[DataFrame, dict, dict]:
"""Process a chunk of the results.
Args:
dq_spec: data quality specification.
source_pk: list of columns that are part of the primary key.
ele: dictionary with the results of the dq process.
data: input dataframe to run the dq process on.
failed_expectations: dict of failed expectations.
evaluated_expectations: dict of evaluated expectations.
is_final_chunk: boolean indicating if this is the final chunk.
Returns:
A tuple with the processed data, failed expectations and evaluated
expectations.
"""
df = ExecEnv.SESSION.createDataFrame([json.dumps(ele)], schema=StringType())
schema = schema_of_json(lit(json.dumps(ele)))
df = (
df.withColumn("value", from_json("value", schema))
.select("value.*")
.withColumn("spec_id", lit(dq_spec.spec_id))
.withColumn("input_id", lit(dq_spec.input_id))
.withColumn("run_name", col("meta.run_id.run_name"))
.withColumn("run_time", col("meta.run_id.run_time"))
)
exploded_df = (
cls._explode_results(df, dq_spec)
if dq_spec.result_sink_explode
else df.withColumn("validation_results", to_json(col("results"))).drop(
"statistics", "meta", "suite_name", "results", "id"
)
)
exploded_df = exploded_df.withColumn("source_primary_key", lit(source_pk))
exploded_df = cls._cast_columns_to_string(exploded_df)
cls._write_to_location(dq_spec, exploded_df)
failed_expectations, evaluated_expectations = cls._log_or_fail(
ele, dq_spec, failed_expectations, evaluated_expectations, is_final_chunk
)
if (
dq_spec.tag_source_data
and dq_spec.result_sink_explode
and dq_spec.fail_on_error is not True
):
data = Validator.tag_source_with_dq(source_pk, data, exploded_df)
return data, failed_expectations, evaluated_expectations
@classmethod
def _cast_columns_to_string(cls, df: DataFrame) -> DataFrame:
"""Cast selected columns of the dataframe to string type.
Args:
df: The input dataframe.
Returns:
A new dataframe with selected columns cast to string type.
"""
for col_name in df.columns:
if col_name not in DQDefaults.DQ_COLUMNS_TO_KEEP_TYPES.value:
df = df.withColumn(col_name, df[col_name].cast(StringType()))
return df
@classmethod
def _generate_chunks(cls, results_dict: dict, dq_spec: DQSpec) -> list:
"""Split the results dictionary into smaller chunks.
This is needed to avoid memory issues when processing large datasets.
The size of the chunks is defined by the dq_spec.result_sink_chunk_size.
Args:
results_dict: The results dictionary to be split.
dq_spec: data quality specification.
Returns:
A list of dictionaries, where each dictionary is a chunk of the original
results dictionary.
"""
results_dict_list = []
split = cls._check_chunk_usage(results_dict, dq_spec)
if split:
# Here we are splitting the results into chunks per expectation
# and then we are splitting the unexpected_index_list into
# chunks of size dq_spec.result_sink_chunk_size.
results_dict_list = cls._split_into_chunks(results_dict, dq_spec)
else:
# If the results are not too big, we can process them all at once.
results_dict_list = [results_dict]
return results_dict_list
@classmethod
def _split_into_chunks(cls, results_dict: dict, dq_spec: DQSpec) -> list:
"""Split the results into smaller chunks.
This is needed to avoid memory issues when processing large datasets.
The size of the chunks is defined by the dq_spec.result_sink_chunk_size.
Args:
results_dict: The results dictionary to be split.
dq_spec: data quality specification.
Returns:
A list of dictionaries, where each dictionary is a chunk of the original
results.
"""
results_dict_list = []
for ele in results_dict["results"]:
base_result = deepcopy(results_dict)
if "unexpected_index_list" in ele["result"].keys():
for key in ExecEnv.ENGINE_CONFIG.dq_result_sink_columns_to_delete:
del ele["result"][key]
unexpected_index_list = ele["result"]["unexpected_index_list"]
unexpected_index_list_chunks = cls.split_into_chunks(
unexpected_index_list, dq_spec.result_sink_chunk_size
)
del ele["result"]["unexpected_index_list"]
for chunk in unexpected_index_list_chunks:
ele["result"]["unexpected_index_list"] = chunk
base_result["results"] = [ele]
results_dict_list.append(deepcopy(base_result))
else:
base_result["results"] = [ele]
results_dict_list.append(base_result)
return results_dict_list
@classmethod
def _write_to_location(
cls,
dq_spec: DQSpec,
df: DataFrame,
processed_keys: bool = False,
) -> None:
"""Write dq results dataframe to a table or location.
The results can be written as:
- a raw output (having result_sink_explode set as False).
- an exploded output (having result_sink_explode set as True), which
is better prepared for analysis, with some columns exploded, flattened and
transformed. Additionally, result_sink_extra_columns can be set with other
columns desired in the output table or location.
- processed keys, when running the dq process with the dq_type set as
'prisma'.
Args:
dq_spec: data quality specification.
df: dataframe with dq results to write.
processed_keys: boolean indicating if the dataframe contains
the processed keys.
"""
if processed_keys:
table = None
location = dq_spec.processed_keys_location
options = {"mergeSchema": "true"}
else:
table = dq_spec.result_sink_db_table
location = dq_spec.result_sink_location
options = {"mergeSchema": "true"} if dq_spec.result_sink_explode else {}
if table or location:
WriterFactory.get_writer(
spec=OutputSpec(
spec_id="dq_result_sink",
input_id="dq_result",
db_table=table,
location=location,
partitions=(
dq_spec.result_sink_partitions
if dq_spec.result_sink_partitions
else []
),
write_type=WriteType.APPEND.value,
data_format=dq_spec.result_sink_format,
options=(
options
if dq_spec.result_sink_options is None
else {**dq_spec.result_sink_options, **options}
),
),
df=df,
data=None,
).write()
@staticmethod
def split_into_chunks(lst: list, chunk_size: int) -> list:
"""Split a list into chunks of a specified size.
Args:
lst: The list to be split.
chunk_size: Number of records in each chunk.
Returns:
A list of lists, where each inner list is a chunk of the original list.
"""
if chunk_size <= 0:
raise ValueError("Chunk size must be a positive integer.")
chunk_list = []
for i in range(0, len(lst), chunk_size):
chunk_list.append(lst[i : i + chunk_size])
return chunk_list
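# ---------------------------------------------------------------------------
# Illustrative usage sketch: a minimal, hypothetical invocation of
# run_dq_process with a validator-type DQSpec. Assumptions: the enclosing
# class is DQFactory (as in the upstream lakehouse_engine), DQSpec,
# DQFunctionSpec and DQDefaults live in lakehouse_engine.core.definitions,
# and the DQSpec fields below mirror the attributes referenced throughout
# this module; the exact constructor signature may differ.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from lakehouse_engine.core.definitions import DQDefaults, DQFunctionSpec, DQSpec
    from lakehouse_engine.core.exec_env import ExecEnv

    ExecEnv.get_or_create(app_name="dq_factory_sketch")
    sample_df = ExecEnv.SESSION.createDataFrame(
        [(1, "a"), (2, None)], "id int, value string"
    )
    sketch_spec = DQSpec(
        spec_id="dq_sketch",
        input_id="sample_input",
        dq_type="validator",
        store_backend=DQDefaults.FILE_SYSTEM_STORE.value,
        local_fs_root_dir="/tmp/dq_sketch_gx",
        dq_functions=[
            DQFunctionSpec(
                function="expect_column_values_to_not_be_null",
                args={"column": "id"},
            )
        ],
        fail_on_error=False,
    )
    # Runs the GX checkpoint flow described in run_dq_process and returns the
    # (optionally tagged) input dataframe.
    tagged_df = DQFactory.run_dq_process(sketch_spec, sample_df)
    tagged_df.show(truncate=False)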
================================================
FILE: lakehouse_engine/dq_processors/exceptions.py
================================================
"""Package defining all the DQ custom exceptions."""
class DQValidationsFailedException(Exception):
"""Exception for when the data quality validations fail."""
pass
class DQCheckpointsResultsException(Exception):
"""Exception for when the checkpoint results parsing fail."""
pass
class DQSpecMalformedException(Exception):
"""Exception for when the DQSpec is malformed."""
pass
class DQDuplicateRuleIdException(Exception):
"""Exception for when a duplicated rule id is found."""
pass
================================================
FILE: lakehouse_engine/dq_processors/validator.py
================================================
"""Module containing the definition of a data quality validator."""
from typing import Any, List
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.data_context import EphemeralDataContext
from pyspark.sql import DataFrame
from pyspark.sql.functions import (
col,
collect_set,
concat,
explode,
first,
lit,
struct,
when,
)
from lakehouse_engine.core.definitions import DQDefaults, DQFunctionSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Validator(object):
"""Class containing the data quality validator."""
_LOGGER = LoggingHandler(__name__).get_logger()
@classmethod
def get_dq_validator(
cls,
context: EphemeralDataContext,
batch_request: RuntimeBatchRequest,
expectation_suite_name: str,
dq_functions: List[DQFunctionSpec],
critical_functions: List[DQFunctionSpec],
) -> Any:
"""Get a validator according to the specification.
We use getattr to dynamically execute any expectation available.
getattr(validator, function) is similar to validator.function(). With this
approach, we can execute any expectation supported.
Args:
context: the BaseDataContext containing the configurations for the data
source and store backend.
batch_request: run time batch request to be able to query underlying data.
expectation_suite_name: name of the expectation suite.
dq_functions: a list of DQFunctionSpec to consider in the expectation suite.
critical_functions: list of critical expectations in the expectation suite.
Returns:
The validator with the expectation suite stored.
"""
validator = context.get_validator(
batch_request=batch_request, expectation_suite_name=expectation_suite_name
)
if dq_functions:
for dq_function in dq_functions:
getattr(validator, dq_function.function)(
**dq_function.args if dq_function.args else {}
)
if critical_functions:
for critical_function in critical_functions:
meta_args = cls._add_critical_function_tag(critical_function.args)
getattr(validator, critical_function.function)(**meta_args)
return validator.save_expectation_suite(discard_failed_expectations=False)
@classmethod
def tag_source_with_dq(
cls, source_pk: List[str], source_df: DataFrame, results_df: DataFrame
) -> DataFrame:
"""Tags the source dataframe with a new column having the DQ results.
Args:
source_pk: the primary key of the source data.
source_df: the source dataframe to be tagged with DQ results.
results_df: dq results dataframe.
Returns: a dataframe tagged with the DQ results.
"""
run_success = results_df.select("success").first()[0]
run_name = results_df.select("run_name").first()[0]
raised_exceptions = (
True
if results_df.filter("exception_info.raised_exception == True").count() > 0
else False
)
failures_df = (
results_df.filter(
"expectation_success == False and size(unexpected_index_list) > 0"
)
if "unexpected_index_list" in results_df.schema.simpleString()
else results_df.filter("expectation_success == False")
)
if failures_df.isEmpty() is not True:
source_df = cls._get_row_tagged_fail_df(
failures_df, raised_exceptions, source_df, source_pk
)
return cls._join_complementary_data(
run_name, run_success, raised_exceptions, source_df
)
@classmethod
def _add_critical_function_tag(cls, args: dict) -> dict:
if "meta" in args.keys():
meta = args["meta"]
if isinstance(meta["notes"], str):
meta["notes"] = meta["notes"] + " **Critical function**."
else:
meta["notes"]["content"] = (
meta["notes"]["content"] + " **Critical function**."
)
args["meta"] = meta
return args
else:
args["meta"] = {
"notes": {
"format": "markdown",
"content": "**Critical function**.",
}
}
return args
@staticmethod
def _get_row_tagged_fail_df(
failures_df: DataFrame,
raised_exceptions: bool,
source_df: DataFrame,
source_pk: List[str],
) -> DataFrame:
"""Get the source_df DataFrame tagged with the row level failures.
Args:
failures_df: dataframe having all failed expectations from the DQ execution.
raised_exceptions: whether there was at least one expectation raising
exceptions (True) or not (False).
source_df: the source dataframe being tagged with DQ results.
source_pk: the primary key of the source data.
Returns: the source_df tagged with the row level failures.
"""
if "unexpected_index_list" in failures_df.schema.simpleString():
row_failures_df = (
failures_df.alias("a")
.withColumn("exploded_list", explode(col("unexpected_index_list")))
.selectExpr("a.*", "exploded_list.*")
.groupBy(*source_pk)
.agg(
struct(
first(col("run_name")).alias("run_name"),
first(col("success")).alias("run_success"),
lit(raised_exceptions).alias("raised_exceptions"),
first(col("expectation_success")).alias("run_row_success"),
collect_set(
struct(
col("expectation_type"),
col("kwargs"),
)
).alias("dq_failure_details"),
).alias("dq_validations")
)
)
if all(item in row_failures_df.columns for item in source_pk):
join_cond = [
col(f"a.{key}").eqNullSafe(col(f"b.{key}")) for key in source_pk
]
columns = [
col_name
for col_name in source_df.columns
if col_name != "dq_validations"
]
# Since we are creating multiple rows per run, if the dq_validations
# column already exists, we need to add the new dq_validations to
# the existing dq_validations.
existing_validations = "a.dq_validations"
existing_validations_details = "a.dq_validations.dq_failure_details"
new_validations = "b.dq_validations"
new_validations_details = "b.dq_validations.dq_failure_details"
if "dq_validations" in source_df.columns:
source_df = (
source_df.alias("a")
.join(row_failures_df.alias("b"), join_cond, "left")
.select(
*[f"a.{col}" for col in columns],
when(
col(new_validations).isNotNull()
& col(existing_validations_details).isNotNull(),
col(new_validations).withField(
"dq_failure_details",
concat(
col(existing_validations_details),
col(new_validations_details),
),
),
)
.when(
col(new_validations).isNotNull()
& col(new_validations_details).isNotNull(),
col(new_validations),
)
.otherwise(col(existing_validations))
.alias("dq_validations"),
)
)
else:
source_df = (
source_df.alias("a")
.join(row_failures_df.alias("b"), join_cond, "left")
.select("a.*", new_validations)
)
return source_df
@staticmethod
def _join_complementary_data(
run_name: str, run_success: bool, raised_exceptions: bool, source_df: DataFrame
) -> DataFrame:
"""Join the source_df DataFrame with complementary data.
The source_df was already tagged/joined with the row level DQ failures, in case
there were any. However, there might be cases for which we don't have any
failure (everything succeeded) or cases in which only non-row-level failures
happened (e.g. table level expectations or column level aggregations), and, for
those we need to join the source_df with complementary data.
Args:
run_name: the name of the DQ execution in great expectations.
run_success: whether the overall execution of the DQ process succeeded (True)
or not (False).
raised_exceptions: whether there was at least one expectation raising
exceptions (True) or not (False).
source_df: the source dataframe being tagged with DQ results.
Returns: the source_df tagged with complementary data.
"""
complementary_data = [
{
"dq_validations": {
"run_name": run_name,
"run_success": run_success,
"raised_exceptions": raised_exceptions,
"run_row_success": True,
}
}
]
complementary_df = ExecEnv.SESSION.createDataFrame(
complementary_data, schema=DQDefaults.DQ_VALIDATIONS_SCHEMA.value
)
return (
source_df.crossJoin(
complementary_df.withColumnRenamed(
"dq_validations", "tmp_dq_validations"
)
)
.withColumn(
"dq_validations",
(
when(
col("dq_validations").isNotNull(), col("dq_validations")
).otherwise(col("tmp_dq_validations"))
if "dq_validations" in source_df.columns
else col("tmp_dq_validations")
),
)
.drop("tmp_dq_validations")
)
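# ---------------------------------------------------------------------------
# Illustrative sketch of the dynamic dispatch used by get_dq_validator above:
# getattr(obj, "name")(**kwargs) is equivalent to obj.name(**kwargs). The toy
# class and expectation name below are hypothetical and independent of
# great_expectations.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _SketchValidator:
        def expect_column_to_exist(self, column: str) -> str:
            return f"checked that column '{column}' exists"

    sketch = _SketchValidator()
    # Dispatch the expectation by its string name, as done for each
    # DQFunctionSpec.function in get_dq_validator.
    print(getattr(sketch, "expect_column_to_exist")(column="order_id"))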
================================================
FILE: lakehouse_engine/engine.py
================================================
"""Contract of the lakehouse engine with all the available functions to be executed."""
from typing import List, Optional, OrderedDict
from lakehouse_engine.algorithms.data_loader import DataLoader
from lakehouse_engine.algorithms.gab import GAB
from lakehouse_engine.algorithms.reconciliator import Reconciliator
from lakehouse_engine.algorithms.sensors.heartbeat import Heartbeat
from lakehouse_engine.algorithms.sensors.sensor import Sensor, SensorStatus
from lakehouse_engine.core.definitions import (
CollectEngineUsage,
SAPLogchain,
TerminatorSpec,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.file_manager import FileManagerFactory
from lakehouse_engine.core.sensor_manager import SensorUpstreamManager
from lakehouse_engine.core.table_manager import TableManager
from lakehouse_engine.terminators.notifier_factory import NotifierFactory
from lakehouse_engine.terminators.sensor_terminator import SensorTerminator
from lakehouse_engine.utils.acon_utils import (
validate_and_resolve_acon,
validate_manager_list,
)
from lakehouse_engine.utils.configs.config_utils import ConfigUtils
from lakehouse_engine.utils.engine_usage_stats import EngineUsageStats
def load_data(
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> Optional[OrderedDict]:
"""Load data using the DataLoader algorithm.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks or other
apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
try:
acon = ConfigUtils.get_acon(acon_path, acon)
ExecEnv.get_or_create(app_name="data_loader", config=acon.get("exec_env", None))
acon = validate_and_resolve_acon(acon, "in_motion")
finally:
EngineUsageStats.store_engine_usage(
acon, load_data.__name__, collect_engine_usage, spark_confs
)
return DataLoader(acon).execute()
def execute_reconciliation(
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> None:
"""Execute the Reconciliator algorithm.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks or other
apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
try:
acon = ConfigUtils.get_acon(acon_path, acon)
ExecEnv.get_or_create(
app_name="reconciliator", config=acon.get("exec_env", None)
)
acon = validate_and_resolve_acon(acon)
finally:
EngineUsageStats.store_engine_usage(
acon, execute_reconciliation.__name__, collect_engine_usage, spark_confs
)
Reconciliator(acon).execute()
def execute_dq_validation(
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> None:
"""Execute the DQValidator algorithm.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks or other
apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
from lakehouse_engine.algorithms.dq_validator import DQValidator
try:
acon = ConfigUtils.get_acon(acon_path, acon)
ExecEnv.get_or_create(
app_name="dq_validator", config=acon.get("exec_env", None)
)
acon = validate_and_resolve_acon(acon, "at_rest")
finally:
EngineUsageStats.store_engine_usage(
acon, execute_dq_validation.__name__, collect_engine_usage, spark_confs
)
DQValidator(acon).execute()
def manage_table(
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> None:
"""Manipulate tables/views using Table Manager algorithm.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks
or other apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
acon = ConfigUtils.get_acon(acon_path, acon)
ExecEnv.get_or_create(app_name="manage_table", config=acon.get("exec_env", None))
EngineUsageStats.store_engine_usage(
acon, manage_table.__name__, collect_engine_usage, spark_confs
)
TableManager(acon).get_function()
def execute_manager(
acon: dict,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> None:
"""Execute the Lakehouse Engine Manager.
This function allows users to execute multiple managers in a single
call by providing a list of acons.
Args:
acon: list of acons to be executed by the manager.
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
ExecEnv.get_or_create(app_name="lakehouse_engine_manager")
acon_list = validate_manager_list(acon)
for acon in acon_list:
EngineUsageStats.store_engine_usage(
acon, execute_manager.__name__, collect_engine_usage, spark_confs
)
if acon["manager"] == "file":
FileManagerFactory.execute_function(configs=acon)
elif acon["manager"] == "table":
TableManager(acon).get_function()
else:
raise ValueError(f"Manager {acon['manager']} not recognized.")
def manage_files(
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> None:
"""Manipulate s3 files using File Manager algorithm.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks
or other apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
acon = ConfigUtils.get_acon(acon_path, acon)
ExecEnv.get_or_create(app_name="manage_files", config=acon.get("exec_env", None))
EngineUsageStats.store_engine_usage(
acon, manage_files.__name__, collect_engine_usage, spark_confs
)
FileManagerFactory.execute_function(configs=acon)
def execute_sensor(
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> bool:
"""Execute a sensor based on a Sensor Algorithm Configuration.
A sensor is useful to check if an upstream system has new data.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks
or other apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
acon = ConfigUtils.get_acon(acon_path, acon)
ExecEnv.get_or_create(app_name="execute_sensor", config=acon.get("exec_env", None))
EngineUsageStats.store_engine_usage(
acon, execute_sensor.__name__, collect_engine_usage, spark_confs
)
return Sensor(acon).execute()
def execute_sensor_heartbeat(
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> None:
"""Execute a sensor based on a Heartbeat Algorithm Configuration.
The heartbeat mechanism monitors whether an upstream system has new data.
The heartbeat job runs continuously within a defined data product or
according to a user-defined schedule.
This job operates based on the Control table, where source-related entries can be
fed by users using the Heartbeat Data Feeder job.
Each source (such as SAP, delta_table, Kafka, Local Manual Upload, etc.) can have
tasks added in parallel within the Heartbeat Job.
Based on the source heartbeat ACON and control table entries,
Heartbeat will send a final sensor acon to the existing sensor modules,
which check if a new event is available for the control table record.
The sensor then returns the NEW_EVENT_AVAILABLE status to the Heartbeat modules,
which update the control table.
Following this, the related Databricks jobs are triggered through the
Databricks Job API, ensuring that all dependencies are met.
This process allows the Heartbeat sensor to centralize the entire workflow
and to efficiently manage and track it through the control table,
with minimal user intervention.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks
or other apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
acon = ConfigUtils.get_acon(acon_path, acon)
ExecEnv.get_or_create(
app_name="execute_heartbeat", config=acon.get("exec_env", None)
)
EngineUsageStats.store_engine_usage(
acon, execute_sensor_heartbeat.__name__, collect_engine_usage, spark_confs
)
return Heartbeat(acon).execute()
def trigger_heartbeat_sensor_jobs(
acon: dict,
) -> None:
"""Trigger the jobs via Databricks job API.
Args:
acon: Heartbeat ACON containing data product configs and options.
"""
ExecEnv.get_or_create(app_name="trigger_heartbeat_sensor_jobs")
Heartbeat(acon).heartbeat_sensor_trigger_jobs()
def execute_heartbeat_sensor_data_feed(
heartbeat_sensor_data_feed_path: str,
heartbeat_sensor_control_table: str,
) -> None:
"""Control table Data feeder.
It reads the CSV file stored in the `data` folder and
performs UPSERT and DELETE operations on the control table.
Args:
heartbeat_sensor_data_feed_path: path where CSV file is stored.
heartbeat_sensor_control_table: CONTROL table of Heartbeat sensor.
"""
ExecEnv.get_or_create(app_name="execute_heartbeat_sensor_data_feed")
Heartbeat.heartbeat_sensor_control_table_data_feed(
heartbeat_sensor_data_feed_path, heartbeat_sensor_control_table
)
def update_heartbeat_sensor_status(
heartbeat_sensor_control_table: str,
sensor_table: str,
job_id: str,
) -> None:
"""UPDATE heartbeat sensor status.
Update heartbeat sensor control table with COMPLETE status and
job_end_timestamp for the triggered job.
Update sensor control table with PROCESSED_NEW_DATA status and
status_change_timestamp for the triggered job.
Args:
heartbeat_sensor_control_table: Heartbeat sensor control table name.
sensor_table: lakehouse engine sensor table name.
job_id: job_id of the running job. It refers to trigger_job_id in Control table.
"""
ExecEnv.get_or_create(app_name="update_heartbeat_sensor_status")
Heartbeat.update_heartbeat_sensor_completion_status(
heartbeat_sensor_control_table, sensor_table, job_id
)
def update_sensor_status(
sensor_id: str,
control_db_table_name: str,
status: str = SensorStatus.PROCESSED_NEW_DATA.value,
assets: List[str] = None,
) -> None:
"""Update internal sensor status.
Update the sensor status in the control table.
It should be used to tell the system
that the sensor has processed all new data that was previously identified,
hence shifting the sensor status.
Usually used to move from `SensorStatus.ACQUIRED_NEW_DATA` to
`SensorStatus.PROCESSED_NEW_DATA`,
but there might be scenarios - still to identify -
where we can update the sensor status from/to different statuses.
Args:
sensor_id: sensor id.
control_db_table_name: `db.table` to store sensor checkpoints.
status: status of the sensor.
assets: a list of assets that are considered as available to
consume downstream after this sensor has status
PROCESSED_NEW_DATA.
"""
ExecEnv.get_or_create(app_name="update_sensor_status")
SensorTerminator.update_sensor_status(
sensor_id=sensor_id,
control_db_table_name=control_db_table_name,
status=status,
assets=assets,
)
def generate_sensor_query(
sensor_id: str,
filter_exp: str = None,
control_db_table_name: str = None,
upstream_key: str = None,
upstream_value: str = None,
upstream_table_name: str = None,
) -> str:
"""Generates a preprocess query to be used in a sensor configuration.
Args:
sensor_id: sensor id.
filter_exp: expression to filter incoming new data.
You can use the placeholder ?default_upstream_key and
?default_upstream_value, so that it can be replaced by the
respective values in the control_db_table_name for this specific
sensor_id.
control_db_table_name: `db.table` to retrieve the last status change
timestamp. This is only relevant for the jdbc sensor.
upstream_key: the key of custom sensor information to control how to
identify new data from the upstream (e.g., a time column in the
upstream).
upstream_value: the upstream value
to identify new data from the upstream (e.g., the value of a time
present in the upstream).
upstream_table_name: value for the custom sensor
to query new data from the upstream.
If none is provided, we will set the default value,
our `sensor_new_data` view.
Returns:
The query string.
"""
ExecEnv.get_or_create(app_name="generate_sensor_preprocess_query")
if filter_exp:
return SensorUpstreamManager.generate_filter_exp_query(
sensor_id=sensor_id,
filter_exp=filter_exp,
control_db_table_name=control_db_table_name,
upstream_key=upstream_key,
upstream_value=upstream_value,
upstream_table_name=upstream_table_name,
)
else:
return SensorUpstreamManager.generate_sensor_table_preprocess_query(
sensor_id=sensor_id
)
def generate_sensor_sap_logchain_query(
chain_id: str,
dbtable: str = SAPLogchain.DBTABLE.value,
status: str = SAPLogchain.GREEN_STATUS.value,
engine_table_name: str = SAPLogchain.ENGINE_TABLE.value,
) -> str:
"""Generates a sensor query based in the SAP Logchain table.
Args:
chain_id: chain id to query the status on SAP.
dbtable: `db.table` to retrieve the data to
check if the sap chain is already finished.
status: status value that indicates the SAP chain has finished
successfully (by default, the green status).
engine_table_name: table name exposed with the SAP LOGCHAIN data.
This table will be used in the jdbc query.
Returns:
The query string.
"""
ExecEnv.get_or_create(app_name="generate_sensor_sap_logchain_query")
return SensorUpstreamManager.generate_sensor_sap_logchain_query(
chain_id=chain_id,
dbtable=dbtable,
status=status,
engine_table_name=engine_table_name,
)
def send_notification(args: dict) -> None:
"""Send a notification using a notifier.
Args:
args: arguments for the notifier.
"""
notifier = NotifierFactory.get_notifier(
spec=TerminatorSpec(function="notify", args=args)
)
notifier.create_notification()
notifier.send_notification()
def execute_gab(
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
collect_engine_usage: str = CollectEngineUsage.PROD_ONLY.value,
spark_confs: dict = None,
) -> None:
"""Execute the gold asset builder based on a GAB Algorithm Configuration.
GAB is useful to build your gold assets with predefined functions for recurrent
periods.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks
or other apps).
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when collecting
the engine usage.
"""
acon = ConfigUtils.get_acon(acon_path, acon)
ExecEnv.get_or_create(app_name="execute_gab", config=acon.get("exec_env", None))
EngineUsageStats.store_engine_usage(
acon, execute_gab.__name__, collect_engine_usage, spark_confs
)
GAB(acon).execute()
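# ---------------------------------------------------------------------------
# Illustrative usage sketch of the sensor helpers defined above. The sensor
# id, table names and filter expression are hypothetical; the placeholders
# ?default_upstream_key / ?default_upstream_value are resolved from the
# sensor control table, as described in generate_sensor_query.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    preprocess_query = generate_sensor_query(
        sensor_id="sales_orders_sensor",
        filter_exp="?default_upstream_key > '?default_upstream_value'",
        control_db_table_name="my_db.lakehouse_engine_sensors",
        upstream_key="load_date",
        upstream_table_name="my_db.upstream_sales_orders",
    )
    print(preprocess_query)

    # Once the new data has been consumed downstream, shift the sensor status
    # (by default to PROCESSED_NEW_DATA).
    update_sensor_status(
        sensor_id="sales_orders_sensor",
        control_db_table_name="my_db.lakehouse_engine_sensors",
    )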
================================================
FILE: lakehouse_engine/io/__init__.py
================================================
"""Input and Output package responsible for the behaviour of reading and writing."""
================================================
FILE: lakehouse_engine/io/exceptions.py
================================================
"""Package defining all the io custom exceptions."""
class IncrementalFilterInputNotFoundException(Exception):
"""Exception for when the input of an incremental filter is not found.
This may occur when tables are being loaded incrementally, taking the increment
definition out of a specific table, but that table does not exist yet, most
likely because it has not been loaded for the first time.
"""
pass
class WrongIOFormatException(Exception):
"""Exception for when a user provides a wrong I/O format."""
pass
class NotSupportedException(RuntimeError):
"""Exception for when a user provides a not supported operation."""
pass
class InputNotFoundException(Exception):
"""Exception for when a user does not provide a mandatory input."""
pass
class EndpointNotFoundException(Exception):
"""Exception for when the endpoint is not found by the Graph API."""
pass
class LocalPathNotFoundException(Exception):
"""Exception for when a local path is not found."""
pass
class WriteToLocalException(Exception):
"""Exception for when an error occurs when trying to write to the local path."""
pass
class SharePointAPIError(Exception):
"""Custom exception class to handle errors Sharepoint API requests."""
pass
class InvalidSharepointPathException(Exception):
"""Raised when folder path conflicts with file name.
Happens if both `folder_relative_path` and `file_name` are set, but the folder path
looks like a file path (last segment has a dot).
"""
pass
================================================
FILE: lakehouse_engine/io/reader.py
================================================
"""Defines abstract reader behaviour."""
from abc import ABC, abstractmethod
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputSpec
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Reader(ABC):
"""Abstract Reader class."""
def __init__(self, input_spec: InputSpec):
"""Construct Reader instances.
Args:
input_spec: input specification for reading data.
"""
self._logger = LoggingHandler(self.__class__.__name__).get_logger()
self._input_spec = input_spec
@abstractmethod
def read(self) -> DataFrame:
"""Abstract read method.
Returns:
A dataframe read according to the input specification.
"""
raise NotImplementedError
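# ---------------------------------------------------------------------------
# Illustrative sketch: a concrete reader only has to accept an InputSpec and
# implement read(). The toy subclass below is hypothetical and returns a
# fixed in-memory dataframe; it ignores its input spec, so None is passed
# purely for illustration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from lakehouse_engine.core.exec_env import ExecEnv

    class StaticReader(Reader):
        """Toy reader returning a fixed dataframe."""

        def read(self) -> DataFrame:
            return ExecEnv.SESSION.createDataFrame(
                [(1, "a"), (2, "b")], "id int, value string"
            )

    ExecEnv.get_or_create(app_name="static_reader_sketch")
    StaticReader(input_spec=None).read().show()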
================================================
FILE: lakehouse_engine/io/reader_factory.py
================================================
"""Module for reader factory."""
from abc import ABC
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import FILE_INPUT_FORMATS, InputFormat, InputSpec
from lakehouse_engine.io.readers.dataframe_reader import DataFrameReader
from lakehouse_engine.io.readers.file_reader import FileReader
from lakehouse_engine.io.readers.jdbc_reader import JDBCReader
from lakehouse_engine.io.readers.kafka_reader import KafkaReader
from lakehouse_engine.io.readers.query_reader import QueryReader
from lakehouse_engine.io.readers.sap_b4_reader import SAPB4Reader
from lakehouse_engine.io.readers.sap_bw_reader import SAPBWReader
from lakehouse_engine.io.readers.sharepoint_reader import SharepointReader
from lakehouse_engine.io.readers.table_reader import TableReader
class ReaderFactory(ABC): # noqa: B024
"""Class for reader factory."""
@classmethod
def get_data(cls, spec: InputSpec) -> DataFrame:
"""Get data according to the input specification following a factory pattern.
Args:
spec: input specification to get the data.
Returns:
A dataframe containing the data.
"""
if spec.db_table:
read_df = TableReader(input_spec=spec).read()
elif spec.data_format == InputFormat.JDBC.value:
read_df = JDBCReader(input_spec=spec).read()
elif spec.data_format in FILE_INPUT_FORMATS:
read_df = FileReader(input_spec=spec).read()
elif spec.data_format == InputFormat.KAFKA.value:
read_df = KafkaReader(input_spec=spec).read()
elif spec.data_format == InputFormat.SQL.value:
read_df = QueryReader(input_spec=spec).read()
elif spec.data_format == InputFormat.SAP_BW.value:
read_df = SAPBWReader(input_spec=spec).read()
elif spec.data_format == InputFormat.SAP_B4.value:
read_df = SAPB4Reader(input_spec=spec).read()
elif spec.data_format == InputFormat.DATAFRAME.value:
read_df = DataFrameReader(input_spec=spec).read()
elif spec.data_format == InputFormat.SFTP.value:
from lakehouse_engine.io.readers.sftp_reader import SFTPReader
read_df = SFTPReader(input_spec=spec).read()
elif spec.data_format == InputFormat.SHAREPOINT.value:
return SharepointReader(input_spec=spec).read()
else:
raise NotImplementedError(
f"The requested input spec format {spec.data_format} is not supported."
)
if spec.temp_view:
read_df.createOrReplaceTempView(spec.temp_view)
return read_df
================================================
FILE: lakehouse_engine/io/readers/__init__.py
================================================
"""Readers package to define reading behaviour."""
================================================
FILE: lakehouse_engine/io/readers/dataframe_reader.py
================================================
"""Module to define behaviour to read from dataframes."""
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputSpec
from lakehouse_engine.io.reader import Reader
class DataFrameReader(Reader):
"""Class to read data from a dataframe."""
def __init__(self, input_spec: InputSpec):
"""Construct DataFrameReader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
def read(self) -> DataFrame:
"""Read data from a dataframe.
Returns:
A dataframe containing the data from a dataframe previously
computed.
"""
return self._input_spec.df_name
================================================
FILE: lakehouse_engine/io/readers/file_reader.py
================================================
"""Module to define behaviour to read from files."""
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import FILE_INPUT_FORMATS, InputSpec, ReadType
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader import Reader
from lakehouse_engine.utils.schema_utils import SchemaUtils
class FileReader(Reader):
"""Class to read from files."""
def __init__(self, input_spec: InputSpec):
"""Construct FileReader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
def read(self) -> DataFrame:
"""Read file data.
Returns:
A dataframe containing the data from the files.
"""
if (
self._input_spec.read_type == ReadType.BATCH.value
and self._input_spec.data_format in FILE_INPUT_FORMATS
):
df = ExecEnv.SESSION.read.load(
path=self._input_spec.location,
format=self._input_spec.data_format,
schema=SchemaUtils.from_input_spec(self._input_spec),
**self._input_spec.options if self._input_spec.options else {},
)
if self._input_spec.with_filepath:
# _metadata contains hidden columns
df = df.selectExpr(
"*", "_metadata.file_path as lhe_extraction_filepath"
)
return df
elif (
self._input_spec.read_type == ReadType.STREAMING.value
and self._input_spec.data_format in FILE_INPUT_FORMATS
):
df = ExecEnv.SESSION.readStream.load(
path=self._input_spec.location,
format=self._input_spec.data_format,
schema=SchemaUtils.from_input_spec(self._input_spec),
**self._input_spec.options if self._input_spec.options else {},
)
if self._input_spec.with_filepath:
# _metadata contains hidden columns
df = df.selectExpr(
"*", "_metadata.file_path as lhe_extraction_filepath"
)
return df
else:
raise NotImplementedError(
"The requested read type and format combination is not supported."
)
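# ---------------------------------------------------------------------------
# Illustrative usage sketch: a hypothetical batch file read with the lineage
# column enabled via with_filepath. The location is made up, csv is assumed
# to be among FILE_INPUT_FORMATS, and the InputSpec fields below mirror the
# attributes accessed in FileReader.read; the exact constructor signature
# lives in lakehouse_engine.core.definitions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from lakehouse_engine.core.definitions import InputSpec

    ExecEnv.get_or_create(app_name="file_reader_sketch")
    sketch_spec = InputSpec(
        spec_id="orders_csv",
        read_type=ReadType.BATCH.value,
        data_format="csv",
        location="s3://my-bucket/landing/orders/",
        options={"header": "true", "inferSchema": "true"},
        with_filepath=True,
    )
    # Each row carries a lhe_extraction_filepath column with its source file.
    FileReader(input_spec=sketch_spec).read().show(truncate=False)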
================================================
FILE: lakehouse_engine/io/readers/jdbc_reader.py
================================================
"""Module to define behaviour to read from JDBC sources."""
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputFormat, InputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader import Reader
from lakehouse_engine.transformers.exceptions import WrongArgumentsException
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
JDBCExtraction,
JDBCExtractionUtils,
)
class JDBCReader(Reader):
"""Class to read from JDBC source."""
def __init__(self, input_spec: InputSpec):
"""Construct JDBCReader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
def read(self) -> DataFrame:
"""Read data from JDBC source.
Returns:
A dataframe containing the data from the JDBC source.
"""
if (
self._input_spec.options is not None
and self._input_spec.options.get("predicates", None) is not None
):
raise WrongArgumentsException("Predicates can only be used with jdbc_args.")
options = self._input_spec.options if self._input_spec.options else {}
if self._input_spec.calculate_upper_bound:
jdbc_util = JDBCExtractionUtils(
JDBCExtraction(
user=options["user"],
password=options["password"],
url=options["url"],
dbtable=options["dbtable"],
extraction_type=options.get(
"extraction_type", JDBCExtraction.extraction_type
),
partition_column=options["partitionColumn"],
calc_upper_bound_schema=self._input_spec.calc_upper_bound_schema,
default_upper_bound=options.get(
"default_upper_bound", JDBCExtraction.default_upper_bound
),
)
) # type: ignore
options["upperBound"] = jdbc_util.get_spark_jdbc_optimal_upper_bound()
if self._input_spec.jdbc_args:
return ExecEnv.SESSION.read.options(**options).jdbc(
**self._input_spec.jdbc_args
)
else:
return (
ExecEnv.SESSION.read.format(InputFormat.JDBC.value)
.options(**options)
.load()
)
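# ---------------------------------------------------------------------------
# Illustrative usage sketch: a hypothetical JDBC input spec using parallel
# reads. With calculate_upper_bound set, the upperBound option is derived via
# JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound, as in read() above.
# Connection details are placeholders, and the exact InputSpec constructor
# signature lives in lakehouse_engine.core.definitions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from lakehouse_engine.core.exec_env import ExecEnv

    ExecEnv.get_or_create(app_name="jdbc_reader_sketch")
    sketch_spec = InputSpec(
        spec_id="orders_jdbc",
        data_format=InputFormat.JDBC.value,
        calculate_upper_bound=True,
        options={
            "url": "jdbc:postgresql://db-host:5432/shop",
            "user": "reader",
            "password": "change-me",
            "dbtable": "public.orders",
            "partitionColumn": "order_id",
            "lowerBound": "1",
            "numPartitions": "4",
        },
    )
    JDBCReader(input_spec=sketch_spec).read().show()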
================================================
FILE: lakehouse_engine/io/readers/kafka_reader.py
================================================
"""Module to define behaviour to read from Kafka."""
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputFormat, InputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader import Reader
class KafkaReader(Reader):
"""Class to read from Kafka."""
def __init__(self, input_spec: InputSpec):
"""Construct KafkaReader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
def read(self) -> DataFrame:
"""Read Kafka data.
Returns:
A dataframe containing the data from Kafka.
"""
df = ExecEnv.SESSION.readStream.load(
format=InputFormat.KAFKA.value,
**self._input_spec.options if self._input_spec.options else {},
)
return df
================================================
FILE: lakehouse_engine/io/readers/query_reader.py
================================================
"""Module to define behaviour to read from a query."""
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader import Reader
class QueryReader(Reader):
"""Class to read data from a query."""
def __init__(self, input_spec: InputSpec):
"""Construct QueryReader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
def read(self) -> DataFrame:
"""Read data from a query.
Returns:
A dataframe containing the data from the query.
"""
return ExecEnv.SESSION.sql(self._input_spec.query)
================================================
FILE: lakehouse_engine/io/readers/sap_b4_reader.py
================================================
"""Module to define behaviour to read from SAP B4 sources."""
from logging import Logger
from typing import Tuple
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader import Reader
from lakehouse_engine.utils.extraction.sap_b4_extraction_utils import (
ADSOTypes,
SAPB4Extraction,
SAPB4ExtractionUtils,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
class SAPB4Reader(Reader):
"""Class to read from SAP B4 source."""
_LOGGER: Logger = LoggingHandler(__name__).get_logger()
def __init__(self, input_spec: InputSpec):
"""Construct SAPB4Reader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
self.jdbc_utils = self._get_jdbc_utils()
def read(self) -> DataFrame:
"""Read data from SAP B4 source.
Returns:
A dataframe containing the data from the SAP B4 source.
"""
options_args, jdbc_args = self._get_options()
return ExecEnv.SESSION.read.options(**options_args).jdbc(**jdbc_args)
def _get_jdbc_utils(self) -> SAPB4ExtractionUtils:
jdbc_extraction = SAPB4Extraction(
user=self._input_spec.options["user"],
password=self._input_spec.options["password"],
url=self._input_spec.options["url"],
dbtable=self._input_spec.options["dbtable"],
adso_type=self._input_spec.options["adso_type"],
request_status_tbl=self._input_spec.options.get(
"request_status_tbl", SAPB4Extraction.request_status_tbl
),
changelog_table=self._input_spec.options.get(
"changelog_table",
(
self._input_spec.options["dbtable"]
if self._input_spec.options["adso_type"] == ADSOTypes.AQ.value
else self._input_spec.options["changelog_table"]
),
),
data_target=SAPB4ExtractionUtils.get_data_target(self._input_spec.options),
act_req_join_condition=self._input_spec.options.get(
"act_req_join_condition", SAPB4Extraction.act_req_join_condition
),
latest_timestamp_data_location=self._input_spec.options.get(
"latest_timestamp_data_location",
SAPB4Extraction.latest_timestamp_data_location,
),
latest_timestamp_input_col=self._input_spec.options.get(
"latest_timestamp_input_col",
SAPB4Extraction.latest_timestamp_input_col,
),
latest_timestamp_data_format=self._input_spec.options.get(
"latest_timestamp_data_format",
SAPB4Extraction.latest_timestamp_data_format,
),
extraction_type=self._input_spec.options.get(
"extraction_type", SAPB4Extraction.extraction_type
),
driver=self._input_spec.options.get("driver", SAPB4Extraction.driver),
num_partitions=self._input_spec.options.get(
"numPartitions", SAPB4Extraction.num_partitions
),
partition_column=self._input_spec.options.get(
"partitionColumn", SAPB4Extraction.partition_column
),
lower_bound=self._input_spec.options.get(
"lowerBound", SAPB4Extraction.lower_bound
),
upper_bound=self._input_spec.options.get(
"upperBound", SAPB4Extraction.upper_bound
),
default_upper_bound=self._input_spec.options.get(
"default_upper_bound", SAPB4Extraction.default_upper_bound
),
fetch_size=self._input_spec.options.get(
"fetchSize", SAPB4Extraction.fetch_size
),
compress=self._input_spec.options.get("compress", SAPB4Extraction.compress),
custom_schema=self._input_spec.options.get(
"customSchema", SAPB4Extraction.custom_schema
),
extraction_timestamp=self._input_spec.options.get(
"extraction_timestamp",
SAPB4Extraction.extraction_timestamp,
),
min_timestamp=self._input_spec.options.get(
"min_timestamp", SAPB4Extraction.min_timestamp
),
max_timestamp=self._input_spec.options.get(
"max_timestamp", SAPB4Extraction.max_timestamp
),
default_max_timestamp=self._input_spec.options.get(
"default_max_timestamp", SAPB4Extraction.default_max_timestamp
),
default_min_timestamp=self._input_spec.options.get(
"default_min_timestamp", SAPB4Extraction.default_min_timestamp
),
max_timestamp_custom_schema=self._input_spec.options.get(
"max_timestamp_custom_schema",
SAPB4Extraction.max_timestamp_custom_schema,
),
generate_predicates=self._input_spec.generate_predicates,
predicates=self._input_spec.options.get(
"predicates", SAPB4Extraction.predicates
),
predicates_add_null=self._input_spec.predicates_add_null,
extra_cols_req_status_tbl=self._input_spec.options.get(
"extra_cols_req_status_tbl", SAPB4Extraction.extra_cols_req_status_tbl
),
calc_upper_bound_schema=self._input_spec.calc_upper_bound_schema,
include_changelog_tech_cols=self._input_spec.options.get(
"include_changelog_tech_cols",
(
False
if self._input_spec.options["adso_type"] == ADSOTypes.AQ.value
else True
),
),
)
return SAPB4ExtractionUtils(jdbc_extraction)
def _get_options(self) -> Tuple[dict, dict]:
"""Get Spark Options using JDBC utilities.
Returns:
A tuple with the options dict and the jdbc args dict
to be passed to Spark.
"""
self._LOGGER.info(
f"Initial options passed to the SAP B4 Reader: {self._input_spec.options}"
)
options_args, jdbc_args = self.jdbc_utils.get_spark_jdbc_options()
if self._input_spec.generate_predicates or self._input_spec.options.get(
"predicates", None
):
options_args.update(
self.jdbc_utils.get_additional_spark_options(
self._input_spec,
options_args,
["partitionColumn", "numPartitions", "lowerBound", "upperBound"],
)
)
else:
if self._input_spec.calculate_upper_bound:
options_args["upperBound"] = (
self.jdbc_utils.get_spark_jdbc_optimal_upper_bound()
)
options_args.update(
self.jdbc_utils.get_additional_spark_options(
self._input_spec, options_args
)
)
self._LOGGER.info(
f"Final options to fill SAP B4 Reader Options: {options_args}"
)
self._LOGGER.info(f"Final jdbc args to fill SAP B4 Reader JDBC: {jdbc_args}")
return options_args, jdbc_args
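# Illustrative sketch only (not part of the engine): a hypothetical restatement of the
# branching in _get_options above. When predicates are generated or explicitly provided,
# Spark partitions the extraction by predicate, so the bounds-based partition options
# are left out; otherwise an optimal "upperBound" may be calculated first. The helper
# name is an assumption for illustration.
def _example_options_to_exclude(generate_predicates: bool, has_predicates: bool) -> list:
    if generate_predicates or has_predicates:
        return ["partitionColumn", "numPartitions", "lowerBound", "upperBound"]
    return []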
================================================
FILE: lakehouse_engine/io/readers/sap_bw_reader.py
================================================
"""Module to define behaviour to read from SAP BW sources."""
from logging import Logger
from typing import Tuple
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader import Reader
from lakehouse_engine.utils.extraction.sap_bw_extraction_utils import (
SAPBWExtraction,
SAPBWExtractionUtils,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
class SAPBWReader(Reader):
"""Class to read from SAP BW source."""
_LOGGER: Logger = LoggingHandler(__name__).get_logger()
def __init__(self, input_spec: InputSpec):
"""Construct SAPBWReader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
self.jdbc_utils = self._get_jdbc_utils()
def read(self) -> DataFrame:
"""Read data from SAP BW source.
Returns:
A dataframe containing the data from the SAP BW source.
"""
options_args, jdbc_args = self._get_options()
return ExecEnv.SESSION.read.options(**options_args).jdbc(**jdbc_args)
def _get_jdbc_utils(self) -> SAPBWExtractionUtils:
jdbc_extraction = SAPBWExtraction(
user=self._input_spec.options["user"],
password=self._input_spec.options["password"],
url=self._input_spec.options["url"],
dbtable=self._input_spec.options["dbtable"],
latest_timestamp_data_location=self._input_spec.options.get(
"latest_timestamp_data_location",
SAPBWExtraction.latest_timestamp_data_location,
),
latest_timestamp_input_col=self._input_spec.options.get(
"latest_timestamp_input_col", SAPBWExtraction.latest_timestamp_input_col
),
latest_timestamp_data_format=self._input_spec.options.get(
"latest_timestamp_data_format",
SAPBWExtraction.latest_timestamp_data_format,
),
extraction_type=self._input_spec.options.get(
"extraction_type", SAPBWExtraction.extraction_type
),
act_request_table=self._input_spec.options.get(
"act_request_table", SAPBWExtraction.act_request_table
),
request_col_name=self._input_spec.options.get(
"request_col_name", SAPBWExtraction.request_col_name
),
act_req_join_condition=self._input_spec.options.get(
"act_req_join_condition", SAPBWExtraction.act_req_join_condition
),
driver=self._input_spec.options.get("driver", SAPBWExtraction.driver),
changelog_table=self._input_spec.options.get(
"changelog_table", SAPBWExtraction.changelog_table
),
num_partitions=self._input_spec.options.get(
"numPartitions", SAPBWExtraction.num_partitions
),
partition_column=self._input_spec.options.get(
"partitionColumn", SAPBWExtraction.partition_column
),
lower_bound=self._input_spec.options.get(
"lowerBound", SAPBWExtraction.lower_bound
),
upper_bound=self._input_spec.options.get(
"upperBound", SAPBWExtraction.upper_bound
),
default_upper_bound=self._input_spec.options.get(
"default_upper_bound", SAPBWExtraction.default_upper_bound
),
fetch_size=self._input_spec.options.get(
"fetchSize", SAPBWExtraction.fetch_size
),
compress=self._input_spec.options.get("compress", SAPBWExtraction.compress),
custom_schema=self._input_spec.options.get(
"customSchema", SAPBWExtraction.custom_schema
),
extraction_timestamp=self._input_spec.options.get(
"extraction_timestamp",
SAPBWExtraction.extraction_timestamp,
),
odsobject=self._input_spec.options.get(
"odsobject",
SAPBWExtractionUtils.get_odsobject(self._input_spec.options),
),
min_timestamp=self._input_spec.options.get(
"min_timestamp", SAPBWExtraction.min_timestamp
),
max_timestamp=self._input_spec.options.get(
"max_timestamp", SAPBWExtraction.max_timestamp
),
default_max_timestamp=self._input_spec.options.get(
"default_max_timestamp", SAPBWExtraction.default_max_timestamp
),
default_min_timestamp=self._input_spec.options.get(
"default_min_timestamp", SAPBWExtraction.default_min_timestamp
),
max_timestamp_custom_schema=self._input_spec.options.get(
"max_timestamp_custom_schema",
SAPBWExtraction.max_timestamp_custom_schema,
),
include_changelog_tech_cols=self._input_spec.options.get(
"include_changelog_tech_cols",
SAPBWExtraction.include_changelog_tech_cols,
),
generate_predicates=self._input_spec.generate_predicates,
predicates=self._input_spec.options.get(
"predicates", SAPBWExtraction.predicates
),
predicates_add_null=self._input_spec.predicates_add_null,
extra_cols_act_request=self._input_spec.options.get(
"extra_cols_act_request", SAPBWExtraction.extra_cols_act_request
),
get_timestamp_from_act_request=self._input_spec.options.get(
"get_timestamp_from_act_request",
SAPBWExtraction.get_timestamp_from_act_request,
),
calc_upper_bound_schema=self._input_spec.calc_upper_bound_schema,
sap_bw_schema=self._input_spec.options.get(
"sap_bw_schema", SAPBWExtraction.sap_bw_schema
),
ods_prefix=self._input_spec.options.get(
"ods_prefix", SAPBWExtraction.ods_prefix
),
logsys=self._input_spec.options.get("logsys", SAPBWExtraction.logsys),
)
return SAPBWExtractionUtils(jdbc_extraction)
def _get_options(self) -> Tuple[dict, dict]:
"""Get Spark Options using JDBC utilities.
Returns:
A tuple with the options dict and the jdbc args dict
to be passed to Spark.
"""
self._LOGGER.info(
f"Initial options passed to the SAP BW Reader: {self._input_spec.options}"
)
options_args, jdbc_args = self.jdbc_utils.get_spark_jdbc_options()
if self._input_spec.generate_predicates or self._input_spec.options.get(
"predicates", None
):
options_args.update(
self.jdbc_utils.get_additional_spark_options(
self._input_spec,
options_args,
["partitionColumn", "numPartitions", "lowerBound", "upperBound"],
)
)
else:
if self._input_spec.calculate_upper_bound:
options_args["upperBound"] = (
self.jdbc_utils.get_spark_jdbc_optimal_upper_bound()
)
options_args.update(
self.jdbc_utils.get_additional_spark_options(
self._input_spec, options_args
)
)
self._LOGGER.info(
f"Final options to fill SAP BW Reader Options: {options_args}"
)
self._LOGGER.info(f"Final jdbc args to fill SAP BW Reader JDBC: {jdbc_args}")
return options_args, jdbc_args
================================================
FILE: lakehouse_engine/io/readers/sftp_reader.py
================================================
"""Module to define behaviour to read from SFTP."""
import gzip
from datetime import datetime
from io import TextIOWrapper
from logging import Logger
from typing import List
from zipfile import ZipFile
import pandas as pd
from pandas import DataFrame as PandasDataFrame
from pandas.errors import EmptyDataError
from paramiko.sftp_file import SFTPFile
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputSpec, ReadType
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader import Reader
from lakehouse_engine.utils.extraction.sftp_extraction_utils import SFTPExtractionUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
class SFTPReader(Reader):
"""Class to read from SFTP."""
_logger: Logger = LoggingHandler(__name__).get_logger()
def __init__(self, input_spec: InputSpec):
"""Construct SFTPReader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
def read(self) -> DataFrame:
"""Read SFTP data.
Returns:
A dataframe containing the data from SFTP.
"""
if self._input_spec.read_type == ReadType.BATCH.value:
options_args = self._input_spec.options if self._input_spec.options else {}
sftp_files_format = SFTPExtractionUtils.validate_format(
self._input_spec.sftp_files_format.lower()
)
location = SFTPExtractionUtils.validate_location(self._input_spec.location)
sftp, transport = SFTPExtractionUtils.get_sftp_client(options_args)
files_list = SFTPExtractionUtils.get_files_list(
sftp, location, options_args
)
dfs: List[PandasDataFrame] = []
try:
for filename in files_list:
with sftp.open(filename, "r") as sftp_file:
try:
pdf = self._read_files(
filename,
sftp_file,
options_args.get("args", {}),
sftp_files_format,
)
if options_args.get("file_metadata", None):
pdf["filename"] = filename
pdf["modification_time"] = datetime.fromtimestamp(
sftp.stat(filename).st_mtime
)
self._append_files(pdf, dfs)
except EmptyDataError:
self._logger.info(f"{filename} - Empty or malformed file.")
if dfs:
df = ExecEnv.SESSION.createDataFrame(pd.concat(dfs))
else:
raise ValueError(
"No files were found with the specified parameters."
)
finally:
sftp.close()
transport.close()
else:
raise NotImplementedError(
"The requested read type supports only BATCH mode."
)
return df
@classmethod
def _append_files(cls, pdf: PandasDataFrame, dfs: List) -> List:
"""Append to the list dataframes with data.
Args:
pdf: a Pandas dataframe containing data from files.
dfs: a list of Pandas dataframes.
Returns:
A list of not empty Pandas dataframes.
"""
if not pdf.empty:
dfs.append(pdf)
return dfs
@classmethod
def _read_files(
cls, filename: str, sftp_file: SFTPFile, option_args: dict, files_format: str
) -> PandasDataFrame:
"""Open and decompress files to be extracted from SFTP.
For zip files, all data is read as string to avoid data type
inference issues during the iteration.
Empty dataframes are NOT processed; their file names are logged instead.
Args:
filename: the filename to be read.
sftp_file: SFTPFile object representing the open file.
option_args: options from the acon.
files_format: a string containing the file extension.
Returns:
A pandas dataframe with data from the file.
"""
reader = getattr(pd, f"read_{files_format}")
if filename.endswith(".gz"):
with gzip.GzipFile(fileobj=sftp_file, mode="rb") as gz_file:
pdf = reader(
TextIOWrapper(gz_file), # type: ignore
**option_args,
)
elif filename.endswith(".zip"):
with ZipFile(sftp_file, "r") as zf: # type: ignore
dfs = [
reader(TextIOWrapper(zf.open(f)), **option_args).fillna("")
for f in zf.namelist()
]
if not pd.concat(dfs, ignore_index=True).empty:
pdf = pd.concat(dfs, ignore_index=True).astype(str)
else:
pdf = pd.DataFrame()
cls._logger.info(f"{filename} - Empty or malformed file.")
else:
pdf = reader(
sftp_file,
**option_args,
)
return pdf
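# Illustrative sketch only (not part of the engine): the gzip branch of _read_files can
# be reproduced locally with the same building blocks. The path below is a placeholder
# assumption (any gzip-compressed CSV works), and the snippet only runs when this module
# is executed directly.
if __name__ == "__main__":
    with open("/tmp/sample.csv.gz", "rb") as _raw:
        with gzip.GzipFile(fileobj=_raw, mode="rb") as _gz:
            print(pd.read_csv(TextIOWrapper(_gz)).head())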
================================================
FILE: lakehouse_engine/io/readers/sharepoint_reader.py
================================================
"""Module to define the behaviour to read from Sharepoint."""
import csv
import fnmatch
import time
from functools import reduce
from pathlib import Path
from typing import Optional
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType
from lakehouse_engine.core.definitions import InputSpec, SharepointFile
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.exceptions import (
InvalidSharepointPathException,
NotSupportedException,
)
from lakehouse_engine.io.reader import Reader
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.sharepoint_utils import SharepointUtils
_LOGGER = LoggingHandler(__name__).get_logger()
class SharepointReader(Reader):
"""Reader implementation for Sharepoint files."""
def __init__(self, input_spec: InputSpec):
"""Construct SharepointReader instance.
Args:
input_spec: InputSpec with Sharepoint parameters.
"""
super().__init__(input_spec)
self.opts = self._input_spec.sharepoint_opts
self.sharepoint_utils = self._get_sharepoint_utils()
if self.opts.file_name and self.opts.folder_relative_path:
folder_name = Path(self.opts.folder_relative_path).name
if "." in folder_name:
raise InvalidSharepointPathException(
f"Invalid path setup: `folder_relative_path` "
f"('{self.opts.folder_relative_path}') appears to include a file, "
f"but `file_name` ('{self.opts.file_name}') was also provided. "
f"Provide either a folder+file_name, or a full file path not both."
)
_LOGGER.warning(
"Using `file_name` with a folder path. "
"This will read only one file. "
"To read all files in the folder, set `file_name` to None."
)
self.file_path = f"{self.opts.folder_relative_path}/{self.opts.file_name}"
elif (
self.opts.folder_relative_path
and "." in Path(self.opts.folder_relative_path).name
):
self.file_path = self.opts.folder_relative_path # full path with extension
else:
self.file_path = self.opts.folder_relative_path
if self.opts.file_name and self.opts.file_pattern:
_LOGGER.warning(
"`file_name` is provided. `file_pattern` will be ignored and only the "
"specified file will be read."
)
self.pattern = self.opts.file_pattern # may be None
# Compute archive base folder from final self.file_path
archive_base_folder = None
if self.file_path:
p = Path(self.file_path)
archive_base_folder = str(p.parent) if p.suffix else str(p)
# Set archive folders
self.success_folder = (
f"{archive_base_folder}/{self.opts.archive_success_subfolder}"
if (archive_base_folder and self.opts.archive_success_subfolder)
else None
)
self.error_folder = (
f"{archive_base_folder}/{self.opts.archive_error_subfolder}"
if (archive_base_folder and self.opts.archive_error_subfolder)
else None
)
def read(self) -> DataFrame:
"""Read a Sharepoint file using a format-specific reader.
This method delegates to a reader resolved by file extension or the
declared `file_type` (e.g., SharepointCsvReader or SharepointExcelReader).
Returns:
Spark DataFrame.
Raises:
InputNotFoundException: Missing required Sharepoint options.
NotSupportedException: Streaming requested or reader unsupported.
"""
self._input_spec.sharepoint_opts.validate_for_reader()
if self._input_spec.read_type == "streaming":
raise NotSupportedException(
"Sharepoint reader doesn't support streaming input."
)
return SharepointReaderFactory.get_reader(self._input_spec).read()
def _get_sharepoint_utils(self) -> SharepointUtils:
"""Build a SharepointUtils instance from input specs.
Returns:
SharepointUtils.
"""
return SharepointUtils(
client_id=self._input_spec.sharepoint_opts.client_id,
tenant_id=self._input_spec.sharepoint_opts.tenant_id,
local_path=self._input_spec.sharepoint_opts.local_path,
api_version=self._input_spec.sharepoint_opts.api_version,
site_name=self._input_spec.sharepoint_opts.site_name,
drive_name=self._input_spec.sharepoint_opts.drive_name,
file_name=self._input_spec.sharepoint_opts.file_name,
folder_relative_path=self._input_spec.sharepoint_opts.folder_relative_path,
chunk_size=self._input_spec.sharepoint_opts.chunk_size,
local_options=self._input_spec.sharepoint_opts.local_options,
secret=self._input_spec.sharepoint_opts.secret,
conflict_behaviour=self._input_spec.sharepoint_opts.conflict_behaviour,
file_pattern=self._input_spec.sharepoint_opts.file_pattern,
file_type=self._input_spec.sharepoint_opts.file_type,
)
class SharepointCsvReader(SharepointReader):
"""Read CSV files from Sharepoint and return Spark DataFrame.
Supports reading a single file or combining multiple files in a folder.
Ensures schema consistency and archives processed files.
"""
def read(self, file_path: str = None, pattern: str = None) -> DataFrame:
"""Read CSV data from Sharepoint.
Args:
file_path: Full file or folder path (overrides options if provided).
pattern: Optional glob-style pattern filter for folder mode.
Returns:
Spark DataFrame.
Raises:
ValueError: Invalid/missing path or path not found.
"""
file_path = file_path or self.file_path
pattern = pattern or self.pattern
if not file_path:
raise ValueError(
"""`file_name` or `folder_relative_path` must be provided via
sharepoint_opts."""
)
# Case 1: file_path includes a file (e.g., folder/file.csv or full path)
if "." in Path(file_path).name:
sp_file = self.sharepoint_utils.get_file_metadata(file_path)
_LOGGER.info(f"Detected single-file read mode for '{file_path}'.")
return self._load_and_archive_file(sp_file)
# Case 2: it's a folder — use optional pattern
if not self.sharepoint_utils.check_if_endpoint_exists(file_path):
raise ValueError(f"Folder '{file_path}' does not exist in Sharepoint.")
_LOGGER.info(
f"Detected folder read mode for '{file_path}' "
+ (
f"with pattern '{pattern}'."
if pattern
else "with no pattern (all files)."
)
)
return self.read_csv_folder(file_path, pattern)
def _load_and_archive_file(self, sp_file: SharepointFile) -> DataFrame:
"""Download a Sharepoint CSV, stage it locally, load with Spark, and archive it.
Handles:
- Writing the CSV to a temporary local path.
- Reading it as a Spark DataFrame.
- Archiving it to the configured success/error subfolders when enabled
(defaults: "done"/"error").
Args:
sp_file: File metadata and content.
Returns:
Spark DataFrame.
Raises:
ValueError: Empty content.
Exception: Staging or read failure.
"""
if self.file_path:
base_folder = (
str(Path(self.file_path).parent)
if "." in Path(self.file_path).name
else str(Path(self.file_path))
)
else:
base_folder = sp_file._folder if getattr(sp_file, "_folder", None) else None
success_subfolder = self.opts.archive_success_subfolder or "done"
error_subfolder = self.opts.archive_error_subfolder or "error"
success_folder = f"{base_folder}/{success_subfolder}" if base_folder else None
error_folder = f"{base_folder}/{error_subfolder}" if base_folder else None
archive_target = error_folder # default to error unless full read succeeds
try:
# IMPORTANT: empty check inside try so finally always runs
if not sp_file.content:
raise ValueError(
f"File '{getattr(sp_file, 'file_path', None) or self.file_path}' "
"is empty or could not be downloaded."
)
with self.sharepoint_utils.staging_area() as tmp_dir_raw:
tmp_dir: Path = Path(tmp_dir_raw)
sp_file, df = self._load_csv_to_spark(sp_file, tmp_dir)
archive_target = success_folder # only mark success after full read
_LOGGER.info(
f"Successfully read '{sp_file.file_path}' into Spark DataFrame."
)
df = df.cache()
df.count() # Force materialization
return df
except Exception as e:
_LOGGER.error(f"Error processing '{sp_file.file_name}': {e}")
raise
finally:
self.sharepoint_utils.archive_sharepoint_file(
sp_file=sp_file,
to_path=archive_target,
move_enabled=self.opts.archive_enabled,
)
def _get_csv_files_in_folder(
self, folder_path: str, pattern: str = None
) -> list[SharepointFile]:
"""List CSV files in a Sharepoint folder, optionally filtered by pattern.
Args:
folder_path: Sharepoint folder path.
pattern: Optional glob-style pattern.
Returns:
List of SharepointFile.
"""
items = self.sharepoint_utils.list_items_in_path(folder_path)
files = []
if pattern:
_LOGGER.info(
f"""Filtering Sharepoint files in '{folder_path}' using glob-style
pattern: '{pattern}'.
Ensure your pattern uses wildcards (e.g., '*.csv', 'sales_*.csv').
"""
)
for item in items:
file = SharepointFile(
file_name=item["name"],
time_created=item.get("createdDateTime", ""),
time_modified=item.get("lastModifiedDateTime", ""),
_folder=folder_path,
)
if not file.is_csv:
continue
if pattern:
if not fnmatch.fnmatch(file.file_name, pattern):
continue
files.append(file)
return sorted(files, key=lambda f: f.file_name)
def _load_csv_to_spark(
self, sp_file: SharepointFile, tmp_dir: Path
) -> tuple[SharepointFile, DataFrame]:
"""Load a staged CSV into Spark and return file + DataFrame.
Args:
sp_file: Sharepoint file metadata.
tmp_dir: Local staging directory.
Returns:
(SharepointFile, Spark DataFrame).
Raises:
ValueError: Empty or undownloadable file.
"""
sp_file = self.sharepoint_utils.get_file_metadata(sp_file.file_path)
local_file = self.sharepoint_utils.save_to_staging_area(sp_file)
spark_options = self.resolve_spark_csv_options(sp_file.content)
try:
_LOGGER.info(f"Starting to read file: {sp_file.file_name}")
start_time = time.time()
df = (
ExecEnv.SESSION.read.format("csv")
.options(**spark_options)
.load(str(local_file))
.cache()
)
_LOGGER.info(
f"""Finished reading file: {sp_file.file_name} in
{round(time.time() - start_time, 2)} seconds"""
)
df.count() # force materialization
return sp_file, df
except Exception as e:
_LOGGER.error(
f"Failed to read local copy of Sharepoint file: {local_file}",
exc_info=True,
)
raise ValueError(
f"Failed to read Sharepoint file: '{sp_file.file_path}'."
) from e
def read_csv_folder(self, folder_path: str, pattern: str = None) -> DataFrame:
"""Read and combine CSVs from a Sharepoint folder.
If a pattern is provided, only files whose names match the glob-style pattern
will be read.
Only archives files to the configured success subfolder if the full read
and union succeeds.
Files causing schema mismatches or other read issues are moved to the
configured error subfolder (when enabled).
Args:
folder_path: Sharepoint folder path.
pattern: Optional glob-style pattern filter for filenames.
Returns:
Combined Spark DataFrame.
Raises:
ValueError: No valid files or schema mismatches.
"""
files = self._get_csv_files_in_folder(folder_path, pattern)
if not files:
raise ValueError(f"No CSV files found in folder: {folder_path}")
valid_files, dfs = [], []
base_schema = None
with self.sharepoint_utils.staging_area() as tmp_dir_raw:
tmp_dir: Path = Path(tmp_dir_raw)
for file in files:
try:
file_with_content, df = self._validate_and_read_file(
file, tmp_dir, base_schema
)
base_schema = base_schema or df.schema
dfs.append(df)
valid_files.append(file_with_content)
except Exception as e:
self._handle_file_error(file, folder_path, e)
raise
if not dfs:
raise ValueError("No valid CSV files could be loaded from folder.")
combined = reduce(lambda a, b: a.unionByName(b), dfs).cache()
combined.count()
for sp_file in valid_files:
self.sharepoint_utils.archive_sharepoint_file(
sp_file,
to_path=(
f"{folder_path}/{self.opts.archive_success_subfolder}"
if self.opts.archive_success_subfolder
else None
),
move_enabled=self.opts.archive_enabled,
)
return combined
def _validate_and_read_file(
self,
file: SharepointFile,
tmp_dir: Path,
base_schema: Optional[StructType],
) -> tuple[SharepointFile, DataFrame]:
"""Validate schema and read CSV file into a Spark DataFrame.
Args:
file: Sharepoint file to read.
tmp_dir: Temporary staging directory.
base_schema: Schema to validate against.
Returns:
(validated SharepointFile, DataFrame).
Raises:
ValueError: Schema mismatch.
"""
file_with_content, df = self._load_csv_to_spark(file, tmp_dir)
if base_schema and df.schema != base_schema:
_LOGGER.error(
f"""Schema mismatch in '{file.file_name}'. Expected: {base_schema},
Found: {df.schema}"""
)
self.sharepoint_utils.archive_sharepoint_file(
sp_file=file_with_content,
to_path=self.error_folder,
move_enabled=self.opts.archive_enabled,
)
raise ValueError(f"Schema mismatch in '{file.file_name}'")
return file_with_content, df
def _handle_file_error(
self,
file: SharepointFile,
folder_path: str,
error: Exception,
) -> None:
"""Handle file read or processing errors by logging and archiving.
Logs the error, prevents duplicate archiving, and moves the file
to the error subfolder when enabled. Falls back gracefully if
archiving fails.
Args:
file: Problematic SharepointFile.
folder_path: Folder path for fallback archiving.
error: Exception encountered.
"""
_LOGGER.error(f"Error processing '{file.file_name}': {error}")
if not getattr(file, "_already_archived", False):
file.skip_rename = True
try:
self.sharepoint_utils.archive_sharepoint_file(
sp_file=file,
to_path=self.error_folder,
move_enabled=self.opts.archive_enabled,
)
file._already_archived = True
except Exception as archive_error:
_LOGGER.warning(f"Secondary archiving failed: {archive_error}")
else:
_LOGGER.info(
f"Skipping second archive for '{file.file_name}' (already archived)"
)
def detect_delimiter(
self,
file_content: bytes,
provided_delimiter: Optional[str] = None,
expected_columns: Optional[list] = None,
) -> str:
"""Detect the appropriate delimiter for a CSV file.
If a delimiter is explicitly provided by the user, it will be used directly
(sniffing is bypassed).
Otherwise, attempts to auto-detect the delimiter using csv.Sniffer based on the
first line or expected columns.
Args:
file_content: Raw CSV bytes.
provided_delimiter: Explicit delimiter to use.
expected_columns: Optional expected header names.
Returns:
Final delimiter.
Raises:
ValueError: Unable to determine delimiter.
"""
if provided_delimiter:
_LOGGER.info(f"User-specified delimiter '{provided_delimiter}' selected.")
return provided_delimiter
try:
text = file_content.decode("utf-8")
dialect = csv.Sniffer().sniff(text, delimiters=";,|\t")
detected_delimiter = dialect.delimiter
_LOGGER.info(
f"No user-specified delimiter. Auto-detected: '{detected_delimiter}'"
)
first_line = text.splitlines()[0].strip()
actual_column_count = len(first_line.split(detected_delimiter))
if expected_columns:
expected_count = len(expected_columns)
if actual_column_count != expected_count:
_LOGGER.warning(
f"""Detected delimiter '{detected_delimiter}' resulted in
{actual_column_count} columns,
but {expected_count} were expected. Consider specifying
the delimiter explicitly."""
)
elif actual_column_count <= 1:
_LOGGER.warning(
f"""Detected delimiter '{detected_delimiter}' resulted in only
{actual_column_count} column.
Consider specifying the delimiter explicitly in
'sharepoint_opts.local_options'."""
)
return detected_delimiter
except Exception as e:
_LOGGER.warning(
f"Failed to auto-detect delimiter. Defaulting to comma. Reason: {e}"
)
return ","
def resolve_spark_csv_options(self, file_content: bytes) -> dict:
"""Resolve Spark CSV read options by validating or detecting delimiter.
Args:
file_content: Raw file bytes.
Returns:
Dict of Spark CSV options (includes delimiter).
"""
local_options = self._input_spec.sharepoint_opts.local_options or {}
if "sep" in local_options:
user_delimiter = local_options["sep"]
elif "delimiter" in local_options:
user_delimiter = local_options["delimiter"]
else:
user_delimiter = None
expected_columns = local_options.get("expected_columns")
final_delimiter = self.detect_delimiter(
file_content=file_content,
provided_delimiter=user_delimiter,
expected_columns=expected_columns,
)
# Warn if expected column names do not match the header when using the selected
# delimiter
if expected_columns:
try:
header_line = file_content.decode("utf-8").splitlines()[0].strip()
actual_columns = [c.strip() for c in header_line.split(final_delimiter)]
expected_normalized = [str(c).strip().lower() for c in expected_columns]
actual_normalized = [c.strip().lower() for c in actual_columns]
if actual_normalized != expected_normalized:
_LOGGER.warning(
"Expected columns don't match CSV header using delimiter '%s'. "
"Expected: %s vs. Actual: %s. The read will proceed; "
"consider specifying the correct delimiter or "
"updating expected_columns.",
final_delimiter,
expected_columns,
actual_columns,
)
except Exception as e:
_LOGGER.warning(
"Failed to validate expected_columns against CSV header. "
"The read will proceed. Reason: %s",
e,
)
# Safety fallback if detector returned nothing for some reason
final_delimiter = final_delimiter or ","
spark_options = dict(local_options)
spark_options["sep"] = final_delimiter
# Remove "delimiter" to avoid ambiguity as spark uses "sep"
spark_options.pop("delimiter", None)
return spark_options
class SharepointExcelReader(SharepointReader):
"""Read Excel files from Sharepoint (not yet implemented)."""
def read(self) -> DataFrame:
"""Read Excel files from Sharepoint.
This method is not yet implemented and currently raises an error.
Intended for future support of reading .xlsx files from Sharepoint folders or files.
Raises:
NotImplementedError: Always, since Excel reading is not implemented.
"""
raise NotImplementedError("Excel reading is not yet implemented.")
class SharepointReaderFactory:
"""Select the correct Sharepoint reader based on file type, file name, folder path.
Default to using the file path from SharepointUtils instance.
"""
@staticmethod
def get_reader(input_spec: InputSpec) -> SharepointReader:
"""Select the appropriate Sharepoint reader based on input specification.
Resolution order:
1. Use file extension from `file_name` if provided.
2. If `folder_relative_path` includes a file with extension, use that.
3. If neither applies, use `file_type`.
Args:
input_spec: InputSpec with Sharepoint options.
Returns:
Reader instance for the resolved file type.
Raises:
ValueError: If file format is unsupported or cannot be determined.
"""
opts = input_spec.sharepoint_opts
# 1. If reading a specific file, use file_name
if opts.file_name:
ext = Path(opts.file_name).suffix.lower()
# 2. If folder_relative_path includes extension, treat it as full path
elif opts.folder_relative_path and "." in Path(opts.folder_relative_path).name:
ext = Path(opts.folder_relative_path).suffix.lower()
# 3. Otherwise, rely on file_type
elif opts.file_type:
ext = f".{opts.file_type.lower()}"
else:
raise ValueError(
"""Cannot determine file format. Please provide `file_name`,
a full file path in `folder_relative_path`, or explicitly set
`file_type`."""
)
readers = {
".csv": SharepointCsvReader,
".xlsx": SharepointExcelReader,
}
try:
_LOGGER.info(f"Detected {ext} read mode.")
return readers[ext](input_spec)
except KeyError:
raise ValueError(f"Unsupported file format: {ext}")
================================================
FILE: lakehouse_engine/io/readers/table_reader.py
================================================
"""Module to define behaviour to read from tables."""
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import InputSpec, ReadType
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader import Reader
class TableReader(Reader):
"""Class to read data from a table."""
def __init__(self, input_spec: InputSpec):
"""Construct TableReader instances.
Args:
input_spec: input specification.
"""
super().__init__(input_spec)
def read(self) -> DataFrame:
"""Read data from a table.
Returns:
A dataframe containing the data from the table.
"""
if self._input_spec.read_type == ReadType.BATCH.value:
return ExecEnv.SESSION.read.options(
**self._input_spec.options if self._input_spec.options else {}
).table(self._input_spec.db_table)
elif self._input_spec.read_type == ReadType.STREAMING.value:
return ExecEnv.SESSION.readStream.options(
**self._input_spec.options if self._input_spec.options else {}
).table(self._input_spec.db_table)
else:
self._logger.error("The requested read type is not supported.")
raise NotImplementedError
================================================
FILE: lakehouse_engine/io/writer.py
================================================
"""Defines abstract writer behaviour."""
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, OrderedDict
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit
from lakehouse_engine.core.definitions import DQSpec, OutputSpec
from lakehouse_engine.transformers.transformer_factory import TransformerFactory
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Writer(ABC):
"""Abstract Writer class."""
def __init__(
self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict = None
):
"""Construct Writer instances.
Args:
output_spec: output specification to write data.
df: dataframe to write.
data: list of all dfs generated on previous steps before writer.
"""
self._logger = LoggingHandler(self.__class__.__name__).get_logger()
self._output_spec = output_spec
self._df = df
self._data = data
@abstractmethod
def write(self) -> Optional[OrderedDict]:
"""Abstract write method."""
raise NotImplementedError
@staticmethod
def write_transformed_micro_batch(**kwargs: Any) -> Callable:
"""Define how to write a streaming micro batch after transforming it.
This function must define an inner function that manipulates a streaming batch,
and then return that function. Look for concrete implementations of this
function for more clarity.
Args:
kwargs: any keyword arguments.
Returns:
A function to be executed in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
logger = LoggingHandler(__name__).get_logger()
logger.warning("Skipping transform micro batch... nothing to do.")
return inner
@classmethod
def get_transformed_micro_batch(
cls,
output_spec: OutputSpec,
batch_df: DataFrame,
batch_id: int,
data: OrderedDict,
) -> DataFrame:
"""Get the result of the transformations applied to a micro batch dataframe.
Args:
output_spec: output specification associated with the writer.
batch_df: batch dataframe (given from streaming foreachBatch).
batch_id: id of the batch (given from streaming foreachBatch).
data: list of all dfs generated on previous steps before writer
to be available on micro batch transforms.
Returns:
The transformed dataframe.
"""
transformed_df = batch_df
if output_spec.with_batch_id:
transformed_df = transformed_df.withColumn("lhe_batch_id", lit(batch_id))
for transformer in output_spec.streaming_micro_batch_transformers:
transformed_df = transformed_df.transform(
TransformerFactory.get_transformer(transformer, data)
)
return transformed_df
@classmethod
def get_streaming_trigger(cls, output_spec: OutputSpec) -> Dict:
"""Define which streaming trigger will be used.
Args:
output_spec: output specification.
Returns:
A dict containing streaming trigger.
"""
trigger: Dict[str, Any] = {}
if output_spec.streaming_available_now:
trigger["availableNow"] = output_spec.streaming_available_now
elif output_spec.streaming_once:
trigger["once"] = output_spec.streaming_once
elif output_spec.streaming_processing_time:
trigger["processingTime"] = output_spec.streaming_processing_time
elif output_spec.streaming_continuous:
trigger["continuous"] = output_spec.streaming_continuous
else:
raise NotImplementedError(
"The requested output spec streaming trigger is not supported."
)
return trigger
@staticmethod
def run_micro_batch_dq_process(df: DataFrame, dq_spec: List[DQSpec]) -> DataFrame:
"""Run the data quality process in a streaming micro batch dataframe.
Iterates over the specs and performs the checks or analysis depending on the
data quality specification provided in the configuration.
Args:
df: the dataframe on which to run the dq process.
dq_spec: list of data quality specifications.
Returns: the validated dataframe.
"""
from lakehouse_engine.dq_processors.dq_factory import DQFactory
validated_df = df
for spec in dq_spec:
validated_df = DQFactory.run_dq_process(spec, df)
return validated_df
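# Illustrative sketch only (not part of the engine): the dict returned by
# get_streaming_trigger is expanded straight into Spark's writeStream.trigger(), so its
# keys map one-to-one to the standard Spark trigger keywords. The values below are
# example assumptions.
_EXAMPLE_TRIGGERS = [
    {"availableNow": True},            # from output_spec.streaming_available_now
    {"once": True},                    # from output_spec.streaming_once
    {"processingTime": "30 seconds"},  # from output_spec.streaming_processing_time
    {"continuous": "1 second"},        # from output_spec.streaming_continuous
]
# e.g. df.writeStream.trigger(**{"processingTime": "30 seconds"})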
================================================
FILE: lakehouse_engine/io/writer_factory.py
================================================
"""Module for writer factory."""
from abc import ABC
from typing import OrderedDict
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import (
FILE_OUTPUT_FORMATS,
OutputFormat,
OutputSpec,
WriteType,
)
from lakehouse_engine.io.writer import Writer
from lakehouse_engine.io.writers.console_writer import ConsoleWriter
from lakehouse_engine.io.writers.dataframe_writer import DataFrameWriter
from lakehouse_engine.io.writers.delta_merge_writer import DeltaMergeWriter
from lakehouse_engine.io.writers.file_writer import FileWriter
from lakehouse_engine.io.writers.jdbc_writer import JDBCWriter
from lakehouse_engine.io.writers.kafka_writer import KafkaWriter
from lakehouse_engine.io.writers.rest_api_writer import RestApiWriter
from lakehouse_engine.io.writers.sharepoint_writer import SharepointWriter
from lakehouse_engine.io.writers.table_writer import TableWriter
class WriterFactory(ABC): # noqa: B024
"""Class for writer factory."""
AVAILABLE_WRITERS = {
OutputFormat.TABLE.value: TableWriter,
OutputFormat.DELTAFILES.value: DeltaMergeWriter,
OutputFormat.JDBC.value: JDBCWriter,
OutputFormat.FILE.value: FileWriter,
OutputFormat.KAFKA.value: KafkaWriter,
OutputFormat.CONSOLE.value: ConsoleWriter,
OutputFormat.DATAFRAME.value: DataFrameWriter,
OutputFormat.REST_API.value: RestApiWriter,
OutputFormat.SHAREPOINT.value: SharepointWriter,
}
@classmethod
def _get_writer_name(cls, spec: OutputSpec) -> str:
"""Get the writer name according to the output specification.
Args:
spec: output specification to write data.
Returns:
The name of the writer that will be created to write the data.
"""
if spec.db_table and spec.write_type != WriteType.MERGE.value:
writer_name = OutputFormat.TABLE.value
elif (
spec.data_format == OutputFormat.DELTAFILES.value or spec.db_table
) and spec.write_type == WriteType.MERGE.value:
writer_name = OutputFormat.DELTAFILES.value
elif spec.data_format in FILE_OUTPUT_FORMATS:
writer_name = OutputFormat.FILE.value
else:
writer_name = spec.data_format
return writer_name
@classmethod
def get_writer(cls, spec: OutputSpec, df: DataFrame, data: OrderedDict) -> Writer:
"""Get a writer according to the output specification using a factory pattern.
Args:
spec: output specification to write data.
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
Returns:
Writer: writer that will write the data.
"""
writer_name = cls._get_writer_name(spec)
writer = cls.AVAILABLE_WRITERS.get(writer_name)
if writer:
return writer(output_spec=spec, df=df, data=data) # type: ignore
else:
raise NotImplementedError(
f"The requested output spec format {spec.data_format} is not supported."
)
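# Illustrative sketch only (not part of the engine): a hypothetical restatement of the
# resolution performed in _get_writer_name, using plain arguments instead of an
# OutputSpec. The enum values and FILE_OUTPUT_FORMATS are the ones imported above;
# nothing new is added.
def _example_writer_name(db_table: str, data_format: str, write_type: str) -> str:
    if db_table and write_type != WriteType.MERGE.value:
        return OutputFormat.TABLE.value
    if (data_format == OutputFormat.DELTAFILES.value or db_table) and (
        write_type == WriteType.MERGE.value
    ):
        return OutputFormat.DELTAFILES.value
    if data_format in FILE_OUTPUT_FORMATS:
        return OutputFormat.FILE.value
    return data_format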
================================================
FILE: lakehouse_engine/io/writers/__init__.py
================================================
"""Package containing the writers responsible for writing data."""
================================================
FILE: lakehouse_engine/io/writers/console_writer.py
================================================
"""Module to define behaviour to write to console."""
from typing import Callable, OrderedDict
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import OutputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.writer import Writer
from lakehouse_engine.utils.logging_handler import LoggingHandler
class ConsoleWriter(Writer):
"""Class to write data to console."""
_logger = LoggingHandler(__name__).get_logger()
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct ConsoleWriter instances.
Args:
output_spec: output specification
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
def write(self) -> None:
"""Write data to console."""
self._output_spec.options = (
self._output_spec.options if self._output_spec.options else {}
)
if not self._df.isStreaming:
self._logger.info("Dataframe preview:")
self._show_df(self._df, self._output_spec)
else:
self._logger.info("Stream Dataframe preview:")
self._write_to_console_in_streaming_mode(
self._df, self._output_spec, self._data
)
@staticmethod
def _show_df(df: DataFrame, output_spec: OutputSpec) -> None:
"""Given a dataframe it applies Spark's show function to show it.
Args:
df: dataframe to be shown.
output_spec: output specification.
"""
df.show(
n=output_spec.options.get("limit", 20),
truncate=output_spec.options.get("truncate", True),
vertical=output_spec.options.get("vertical", False),
)
@staticmethod
def _show_streaming_df(output_spec: OutputSpec) -> Callable:
"""Define how to show a streaming df.
Args:
output_spec: output specification.
Returns:
A function to show df in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
ConsoleWriter._logger.info(f"Showing DF for batch {batch_id}")
ConsoleWriter._show_df(batch_df, output_spec)
return inner
@staticmethod
def _write_to_console_in_streaming_mode(
df: DataFrame, output_spec: OutputSpec, data: OrderedDict
) -> None:
"""Write to console in streaming mode.
Args:
df: dataframe to write.
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
"""
df_writer = df.writeStream.trigger(**Writer.get_streaming_trigger(output_spec))
if (
output_spec.streaming_micro_batch_transformers
or output_spec.streaming_micro_batch_dq_processors
):
stream_df = df_writer.foreachBatch(
ConsoleWriter._write_transformed_micro_batch(output_spec, data)
).start()
else:
stream_df = df_writer.foreachBatch(
ConsoleWriter._show_streaming_df(output_spec)
).start()
if output_spec.streaming_await_termination:
stream_df.awaitTermination(output_spec.streaming_await_termination_timeout)
@staticmethod
def _write_transformed_micro_batch( # type: ignore
output_spec: OutputSpec, data: OrderedDict
) -> Callable:
"""Define how to write a streaming micro batch after transforming it.
Args:
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
Returns:
A function to be executed in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
transformed_df = Writer.get_transformed_micro_batch(
output_spec, batch_df, batch_id, data
)
if output_spec.streaming_micro_batch_dq_processors:
transformed_df = Writer.run_micro_batch_dq_process(
transformed_df, output_spec.streaming_micro_batch_dq_processors
)
ConsoleWriter._show_df(transformed_df, output_spec)
return inner
================================================
FILE: lakehouse_engine/io/writers/dataframe_writer.py
================================================
"""Module to define behaviour to write to dataframe."""
import uuid
from typing import Callable, Optional, OrderedDict
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType
from lakehouse_engine.core.definitions import OutputFormat, OutputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.exceptions import NotSupportedException
from lakehouse_engine.io.writer import Writer
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.spark_utils import SparkUtils
class DataFrameWriter(Writer):
"""Class to write data to dataframe."""
_logger = LoggingHandler(__name__).get_logger()
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct DataFrameWriter instances.
Args:
output_spec: output specification.
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
self.view_prefix = "global_temp" if not ExecEnv.IS_SERVERLESS else ""
def write(self) -> Optional[OrderedDict]:
"""Write data to dataframe."""
self._output_spec.options = (
self._output_spec.options if self._output_spec.options else {}
)
written_dfs: OrderedDict = OrderedDict({})
if (
self._output_spec.streaming_processing_time
or self._output_spec.streaming_continuous
):
raise NotSupportedException(
f"DataFrame writer doesn't support "
f"processing time or continuous streaming "
f"for step ${self._output_spec.spec_id}."
)
if self._df.isStreaming:
output_df = self._write_to_dataframe_in_streaming_mode(
self._df, self._output_spec, self._data
)
else:
output_df = self._df
written_dfs[self._output_spec.spec_id] = output_df
return written_dfs
def _get_prefixed_view_name(self, stream_df_view_name: str) -> str:
"""Return the fully qualified view name with prefix if needed."""
return ".".join(filter(None, [self.view_prefix, stream_df_view_name]))
def _create_temp_view(self, df: DataFrame, stream_df_view_name: str) -> None:
"""Given a dataframe create a temp view to be available for consumption.
Args:
df: dataframe to be made available through the temp view.
stream_df_view_name: stream df view name.
"""
prefixed_view_name = self._get_prefixed_view_name(stream_df_view_name)
if self._table_exists(stream_df_view_name):
self._logger.info("Temp view already exists")
existing_data = ExecEnv.SESSION.table(f"{prefixed_view_name}")
df = existing_data.union(df)
SparkUtils.create_temp_view(df, stream_df_view_name)
def _write_streaming_df(self, stream_df_view_name: str) -> Callable:
"""Define how to create a df from streaming df.
Args:
stream_df_view_name: stream df view name.
Returns:
A function to create the temp view in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
self._create_temp_view(batch_df, stream_df_view_name)
return inner
def _write_to_dataframe_in_streaming_mode(
self, df: DataFrame, output_spec: OutputSpec, data: OrderedDict
) -> DataFrame:
"""Write to DataFrame in streaming mode.
Args:
df: dataframe to write.
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
"""
app_id = str(uuid.uuid4())
stream_df_view_name = f"`{app_id}_{output_spec.spec_id}`"
self._logger.info("Drop temp view if exists")
prefixed_view_name = self._get_prefixed_view_name(stream_df_view_name)
if self._table_exists(stream_df_view_name):
# Cleaning Temp view to not maintain state and impact
# consecutive acon runs
ExecEnv.SESSION.sql(f"DROP VIEW {prefixed_view_name}")
df_writer = df.writeStream.trigger(**Writer.get_streaming_trigger(output_spec))
if (
output_spec.streaming_micro_batch_transformers
or output_spec.streaming_micro_batch_dq_processors
):
stream_df = (
df_writer.options(**output_spec.options if output_spec.options else {})
.format(OutputFormat.NOOP.value)
.foreachBatch(
self._write_transformed_micro_batch(
output_spec, data, stream_df_view_name
)
)
.start()
)
else:
stream_df = (
df_writer.options(**output_spec.options if output_spec.options else {})
.format(OutputFormat.NOOP.value)
.foreachBatch(self._write_streaming_df(stream_df_view_name))
.start()
)
if output_spec.streaming_await_termination:
stream_df.awaitTermination(output_spec.streaming_await_termination_timeout)
self._logger.info("Reading stream data as df if exists")
if self._table_exists(stream_df_view_name):
stream_data_as_df = ExecEnv.SESSION.table(f"{prefixed_view_name}")
else:
self._logger.info(
f"DataFrame writer couldn't find any data to return "
f"for streaming, check if you are using checkpoint "
f"for step {output_spec.spec_id}."
)
stream_data_as_df = ExecEnv.SESSION.createDataFrame(
data=[], schema=StructType([])
)
return stream_data_as_df
def _table_exists(self, table_name: str) -> bool:
"""Check if the table or view exists in the session catalog.
Args:
table_name: table/view name to check if exists in the session.
"""
if not ExecEnv.IS_SERVERLESS:
tables = ExecEnv.SESSION.sql(f"SHOW TABLES IN {self.view_prefix}")
else:
tables = ExecEnv.SESSION.sql("SHOW TABLES")
return (
len(tables.filter(f"tableName = '{table_name.strip('`')}'").collect()) > 0
)
def _write_transformed_micro_batch(
self, output_spec: OutputSpec, data: OrderedDict, stream_as_df_view: str
) -> Callable:
"""Define how to write a streaming micro batch after transforming it.
Args:
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
stream_as_df_view: stream df view name.
Returns:
A function to be executed in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
transformed_df = Writer.get_transformed_micro_batch(
output_spec, batch_df, batch_id, data
)
if output_spec.streaming_micro_batch_dq_processors:
transformed_df = Writer.run_micro_batch_dq_process(
transformed_df, output_spec.streaming_micro_batch_dq_processors
)
self._create_temp_view(transformed_df, stream_as_df_view)
return inner
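# Illustrative sketch only (not part of the engine): the streaming path above exposes
# the accumulated micro-batches through a temp view named "<uuid>_<spec_id>". On a
# classic cluster the view lives under "global_temp"; on serverless sessions the prefix
# is empty. The uuid and spec id below are placeholder assumptions.
_EXAMPLE_STREAM_VIEW = "`3b1f0c9e-1c2d-4e5f-8a9b-0c1d2e3f4a5b_sales_bronze`"
# Read back on a classic cluster:
#   ExecEnv.SESSION.table(f"global_temp.{_EXAMPLE_STREAM_VIEW}")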
================================================
FILE: lakehouse_engine/io/writers/delta_merge_writer.py
================================================
"""Module to define the behaviour of delta merges."""
from typing import Callable, Optional, OrderedDict
from delta.tables import DeltaMergeBuilder, DeltaTable
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import OutputFormat, OutputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.exceptions import WrongIOFormatException
from lakehouse_engine.io.writer import Writer
class DeltaMergeWriter(Writer):
"""Class to merge data using delta lake."""
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct DeltaMergeWriter instances.
Args:
output_spec: output specification containing merge options and
relevant information.
df: the dataframe containing the new data to be merged.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
def write(self) -> None:
"""Merge new data with current data."""
delta_table = self._get_delta_table(self._output_spec)
if self._df.isStreaming:
stream_df = (
self._df.writeStream.options(
**self._output_spec.options if self._output_spec.options else {}
)
.foreachBatch(
self._write_transformed_micro_batch(
self._output_spec, self._data, delta_table
)
)
.trigger(**Writer.get_streaming_trigger(self._output_spec))
.start()
)
if self._output_spec.streaming_await_termination:
stream_df.awaitTermination(
self._output_spec.streaming_await_termination_timeout
)
else:
DeltaMergeWriter._merge(delta_table, self._output_spec, self._df)
@staticmethod
def _get_delta_table(output_spec: OutputSpec) -> DeltaTable:
"""Get the delta table given an output specification w/ table name or location.
Args:
output_spec: output specification.
Returns:
DeltaTable: the delta table instance.
"""
if output_spec.db_table:
delta_table = DeltaTable.forName(ExecEnv.SESSION, output_spec.db_table)
elif output_spec.data_format == OutputFormat.DELTAFILES.value:
delta_table = DeltaTable.forPath(ExecEnv.SESSION, output_spec.location)
else:
raise WrongIOFormatException(
f"{output_spec.data_format} is not compatible with Delta Merge "
f"Writer."
)
return delta_table
@staticmethod
def _insert(
delta_merge: DeltaMergeBuilder,
insert_predicate: Optional[str],
insert_column_set: Optional[dict],
) -> DeltaMergeBuilder:
"""Get the builder of merge data with insert predicate and column set.
Args:
delta_merge: builder of the merge data.
insert_predicate: condition of the insert.
insert_column_set: rules for setting the values of
columns that need to be inserted.
Returns:
DeltaMergeBuilder: builder of the merge data with insert.
"""
if insert_predicate:
if insert_column_set:
delta_merge = delta_merge.whenNotMatchedInsert(
condition=insert_predicate,
values=insert_column_set,
)
else:
delta_merge = delta_merge.whenNotMatchedInsertAll(
condition=insert_predicate
)
else:
if insert_column_set:
delta_merge = delta_merge.whenNotMatchedInsert(values=insert_column_set)
else:
delta_merge = delta_merge.whenNotMatchedInsertAll()
return delta_merge
@staticmethod
def _merge(delta_table: DeltaTable, output_spec: OutputSpec, df: DataFrame) -> None:
"""Perform a delta lake merge according to several merge options.
Args:
delta_table: delta table to which to merge data.
output_spec: output specification containing the merge options.
df: dataframe with the new data to be merged into the delta table.
"""
delta_merge = delta_table.alias("current").merge(
df.alias("new"), output_spec.merge_opts.merge_predicate
)
if not output_spec.merge_opts.insert_only:
if output_spec.merge_opts.delete_predicate:
delta_merge = delta_merge.whenMatchedDelete(
output_spec.merge_opts.delete_predicate
)
delta_merge = DeltaMergeWriter._update(
delta_merge,
output_spec.merge_opts.update_predicate,
output_spec.merge_opts.update_column_set,
)
delta_merge = DeltaMergeWriter._insert(
delta_merge,
output_spec.merge_opts.insert_predicate,
output_spec.merge_opts.insert_column_set,
)
delta_merge.execute()
@staticmethod
def _update(
delta_merge: DeltaMergeBuilder,
update_predicate: Optional[str],
update_column_set: Optional[dict],
) -> DeltaMergeBuilder:
"""Get the builder of merge data with update predicate and column set.
Args:
delta_merge: builder of the merge data.
update_predicate: condition of the update.
update_column_set: rules for setting the values of
columns that need to be updated.
Returns:
DeltaMergeBuilder: builder of the merge data with update.
"""
if update_predicate:
if update_column_set:
delta_merge = delta_merge.whenMatchedUpdate(
condition=update_predicate,
set=update_column_set,
)
else:
delta_merge = delta_merge.whenMatchedUpdateAll(
condition=update_predicate
)
else:
if update_column_set:
delta_merge = delta_merge.whenMatchedUpdate(set=update_column_set)
else:
delta_merge = delta_merge.whenMatchedUpdateAll()
return delta_merge
@staticmethod
def _write_transformed_micro_batch( # type: ignore
output_spec: OutputSpec,
data: OrderedDict,
delta_table: Optional[DeltaTable] = None,
) -> Callable:
"""Perform the merge in streaming mode by specifying a transform function.
This function returns a function that will be invoked in the foreachBatch in
streaming mode, performing a delta lake merge while streaming the micro batches.
Args:
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
delta_table: delta table to merge the streaming data with.
Returns:
Function to call in .foreachBatch streaming function.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
transformed_df = Writer.get_transformed_micro_batch(
output_spec, batch_df, batch_id, data
)
if output_spec.streaming_micro_batch_dq_processors:
transformed_df = Writer.run_micro_batch_dq_process(
transformed_df, output_spec.streaming_micro_batch_dq_processors
)
DeltaMergeWriter._merge(delta_table, output_spec, transformed_df)
return inner
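# Illustrative sketch (not part of the original module): the kind of merge this
# writer assembles, written directly against the delta-spark API. The table name
# "db.customers", the `updates_df` dataframe and the "id" key are assumptions.
def _example_manual_delta_merge(spark, updates_df: DataFrame) -> None:
    # DeltaTable is already imported by this module.
    delta_table = DeltaTable.forName(spark, "db.customers")
    (
        delta_table.alias("current")
        .merge(updates_df.alias("new"), "current.id = new.id")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )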
================================================
FILE: lakehouse_engine/io/writers/file_writer.py
================================================
"""Module to define behaviour to write to files."""
from typing import Callable, OrderedDict
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import OutputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.writer import Writer
class FileWriter(Writer):
"""Class to write data to files."""
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct FileWriter instances.
Args:
output_spec: output specification
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
def write(self) -> None:
"""Write data to files."""
if not self._df.isStreaming:
self._write_to_files_in_batch_mode(self._df, self._output_spec)
else:
self._write_to_files_in_streaming_mode(
self._df, self._output_spec, self._data
)
@staticmethod
def _write_to_files_in_batch_mode(df: DataFrame, output_spec: OutputSpec) -> None:
"""Write to files in batch mode.
Args:
df: dataframe to write.
output_spec: output specification.
"""
df.write.format(output_spec.data_format).partitionBy(
output_spec.partitions
).options(**output_spec.options if output_spec.options else {}).mode(
output_spec.write_type
).save(
output_spec.location
)
@staticmethod
def _write_to_files_in_streaming_mode(
df: DataFrame, output_spec: OutputSpec, data: OrderedDict
) -> None:
"""Write to files in streaming mode.
Args:
df: dataframe to write.
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
"""
df_writer = df.writeStream.trigger(**Writer.get_streaming_trigger(output_spec))
if (
output_spec.streaming_micro_batch_transformers
or output_spec.streaming_micro_batch_dq_processors
):
stream_df = (
df_writer.options(**output_spec.options if output_spec.options else {})
.foreachBatch(
FileWriter._write_transformed_micro_batch(output_spec, data)
)
.start()
)
else:
stream_df = (
df_writer.format(output_spec.data_format)
.partitionBy(output_spec.partitions)
.options(**output_spec.options if output_spec.options else {})
.outputMode(output_spec.write_type)
.start(output_spec.location)
)
if output_spec.streaming_await_termination:
stream_df.awaitTermination(output_spec.streaming_await_termination_timeout)
@staticmethod
def _write_transformed_micro_batch( # type: ignore
output_spec: OutputSpec, data: OrderedDict
) -> Callable:
"""Define how to write a streaming micro batch after transforming it.
Args:
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
Returns:
A function to be executed in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
transformed_df = Writer.get_transformed_micro_batch(
output_spec, batch_df, batch_id, data
)
if output_spec.streaming_micro_batch_dq_processors:
transformed_df = Writer.run_micro_batch_dq_process(
transformed_df, output_spec.streaming_micro_batch_dq_processors
)
FileWriter._write_to_files_in_batch_mode(transformed_df, output_spec)
return inner
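# Illustrative sketch (not part of the original module): the plain Spark batch
# write performed by _write_to_files_in_batch_mode. The "delta" format, the
# "s3://bucket/sales" location and the "date" partition column are assumptions.
def _example_batch_file_write(df: DataFrame) -> None:
    (
        df.write.format("delta")
        .partitionBy("date")
        .options(mergeSchema="true")
        .mode("append")
        .save("s3://bucket/sales")
    )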
================================================
FILE: lakehouse_engine/io/writers/jdbc_writer.py
================================================
"""Module that defines the behaviour to write to JDBC targets."""
from typing import Callable, OrderedDict
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import OutputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.writer import Writer
class JDBCWriter(Writer):
"""Class to write to JDBC targets."""
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct JDBCWriter instances.
Args:
output_spec: output specification.
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
def write(self) -> None:
"""Write data into JDBC target."""
if not self._df.isStreaming:
self._write_to_jdbc_in_batch_mode(self._df, self._output_spec)
else:
stream_df = (
self._df.writeStream.trigger(
**Writer.get_streaming_trigger(self._output_spec)
)
.options(
**self._output_spec.options if self._output_spec.options else {}
)
.foreachBatch(
self._write_transformed_micro_batch(self._output_spec, self._data)
)
.start()
)
if self._output_spec.streaming_await_termination:
stream_df.awaitTermination(
self._output_spec.streaming_await_termination_timeout
)
@staticmethod
def _write_to_jdbc_in_batch_mode(df: DataFrame, output_spec: OutputSpec) -> None:
"""Write to jdbc in batch mode.
Args:
df: dataframe to write.
output_spec: output specification.
"""
df.write.format(output_spec.data_format).partitionBy(
output_spec.partitions
).options(**output_spec.options if output_spec.options else {}).mode(
output_spec.write_type
).save(
output_spec.location
)
@staticmethod
def _write_transformed_micro_batch( # type: ignore
output_spec: OutputSpec, data: OrderedDict
) -> Callable:
"""Define how to write a streaming micro batch after transforming it.
Args:
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
Returns:
A function to be executed in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
transformed_df = Writer.get_transformed_micro_batch(
output_spec, batch_df, batch_id, data
)
if output_spec.streaming_micro_batch_dq_processors:
transformed_df = Writer.run_micro_batch_dq_process(
transformed_df, output_spec.streaming_micro_batch_dq_processors
)
JDBCWriter._write_to_jdbc_in_batch_mode(transformed_df, output_spec)
return inner
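# Illustrative sketch (not part of the original module): the batch JDBC write
# performed by _write_to_jdbc_in_batch_mode. The connection url, credentials,
# table name and driver below are assumptions.
def _example_batch_jdbc_write(df: DataFrame) -> None:
    (
        df.write.format("jdbc")
        .options(
            url="jdbc:postgresql://host:5432/analytics",
            dbtable="public.sales",
            user="writer",
            password="secret",
            driver="org.postgresql.Driver",
        )
        .mode("append")
        .save()
    )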
================================================
FILE: lakehouse_engine/io/writers/kafka_writer.py
================================================
"""Module that defines the behaviour to write to Kafka."""
from typing import Callable, OrderedDict
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import OutputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.writer import Writer
class KafkaWriter(Writer):
"""Class to write to a Kafka target."""
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct KafkaWriter instances.
Args:
output_spec: output specification.
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
def write(self) -> None:
"""Write data to Kafka."""
if not self._df.isStreaming:
self._write_to_kafka_in_batch_mode(self._df, self._output_spec)
else:
self._write_to_kafka_in_streaming_mode(
self._df, self._output_spec, self._data
)
@staticmethod
def _write_to_kafka_in_batch_mode(df: DataFrame, output_spec: OutputSpec) -> None:
"""Write to Kafka in batch mode.
Args:
df: dataframe to write.
output_spec: output specification.
"""
df.write.format(output_spec.data_format).options(
**output_spec.options if output_spec.options else {}
).mode(output_spec.write_type).save()
@staticmethod
def _write_to_kafka_in_streaming_mode(
df: DataFrame, output_spec: OutputSpec, data: OrderedDict
) -> None:
"""Write to kafka in streaming mode.
Args:
df: dataframe to write.
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
"""
df_writer = df.writeStream.trigger(**Writer.get_streaming_trigger(output_spec))
if (
output_spec.streaming_micro_batch_transformers
or output_spec.streaming_micro_batch_dq_processors
):
stream_df = (
df_writer.options(**output_spec.options if output_spec.options else {})
.foreachBatch(
KafkaWriter._write_transformed_micro_batch(output_spec, data)
)
.start()
)
else:
stream_df = (
df_writer.format(output_spec.data_format)
.options(**output_spec.options if output_spec.options else {})
.start()
)
if output_spec.streaming_await_termination:
stream_df.awaitTermination(output_spec.streaming_await_termination_timeout)
@staticmethod
def _write_transformed_micro_batch( # type: ignore
output_spec: OutputSpec, data: OrderedDict
) -> Callable:
"""Define how to write a streaming micro batch after transforming it.
Args:
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
Returns:
A function to be executed in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
transformed_df = Writer.get_transformed_micro_batch(
output_spec, batch_df, batch_id, data
)
if output_spec.streaming_micro_batch_dq_processors:
transformed_df = Writer.run_micro_batch_dq_process(
transformed_df, output_spec.streaming_micro_batch_dq_processors
)
KafkaWriter._write_to_kafka_in_batch_mode(transformed_df, output_spec)
return inner
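# Illustrative sketch (not part of the original module): the batch Kafka write
# performed by _write_to_kafka_in_batch_mode. The broker address, topic name and
# the "id" key column are assumptions; Kafka expects "key"/"value" columns.
def _example_batch_kafka_write(df: DataFrame) -> None:
    from pyspark.sql.functions import col, struct, to_json
    kafka_df = df.select(
        col("id").cast("string").alias("key"),
        to_json(struct(*df.columns)).alias("value"),
    )
    (
        kafka_df.write.format("kafka")
        .options(**{"kafka.bootstrap.servers": "broker:9092", "topic": "sales_events"})
        .save()
    )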
================================================
FILE: lakehouse_engine/io/writers/rest_api_writer.py
================================================
"""Module to define behaviour to write to REST APIs."""
import json
from typing import Any, Callable, OrderedDict
from pyspark.sql import DataFrame, Row
from lakehouse_engine.core.definitions import OutputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.writer import Writer
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.rest_api import (
RESTApiException,
RestMethods,
RestStatusCodes,
execute_api_request,
)
class RestApiWriter(Writer):
"""Class to write data to a REST API."""
_logger = LoggingHandler(__name__).get_logger()
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct RestApiWriter instances.
Args:
output_spec: output specification.
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
def write(self) -> None:
"""Write data to REST API."""
if not self._df.isStreaming:
self._write_to_rest_api_in_batch_mode(self._df, self._output_spec)
else:
self._write_to_rest_api_in_streaming_mode(
self._df, self._output_spec, self._data
)
@staticmethod
def _get_func_to_send_payload_to_rest_api(output_spec: OutputSpec) -> Callable:
"""Define and return a function to send the payload to the REST api.
Args:
output_spec: Output Specification containing configurations to
communicate with the REST api. Within the output_spec, the user
can specify several options:
- rest_api_header: http headers.
- rest_api_basic_auth: basic http authentication details
(e.g., {"username": "x", "password": "y"}).
- rest_api_url: url of the api.
- rest_api_method: REST method (e.g., POST or PUT).
- rest_api_sleep_seconds: sleep seconds to avoid throttling.
- rest_api_is_file_payload: whether the payload to be sent to the
api is in the format of a file using multipart encoding
upload. If this is true, then the payload will always be
sent using the "files" parameter in Python's requests
library.
- rest_api_file_payload_name: when rest_api_is_file_payload
is true, this option can be used to define the file
identifier in Python's requests library.
- rest_api_extra_json_payload: when rest_api_is_file_payload is
False, it can be used to provide additional JSON variables to
add to the original payload. This is useful to complement
the original payload with some extra input to better
configure the final payload to send to the REST api. An
example can be to add a constant configuration value to
the payload data.
Returns:
Function to be called inside Spark dataframe.foreach.
"""
headers = output_spec.options.get("rest_api_header", None)
basic_auth_dict = output_spec.options.get("rest_api_basic_auth", None)
url = output_spec.options["rest_api_url"]
method = output_spec.options.get("rest_api_method", RestMethods.POST.value)
sleep_seconds = output_spec.options.get("rest_api_sleep_seconds", 0)
is_file_payload = output_spec.options.get("rest_api_is_file_payload", False)
file_payload_name = output_spec.options.get(
"rest_api_file_payload_name", "file"
)
extra_json_payload = output_spec.options.get(
"rest_api_extra_json_payload", None
)
success_status_codes = output_spec.options.get(
"rest_api_success_status_codes", RestStatusCodes.OK_STATUS_CODES.value
)
def send_payload_to_rest_api(row: Row) -> Any:
"""Send payload to the REST API.
The payload needs to be prepared as a JSON string column in a dataframe.
E.g., {"a": "a value", "b": "b value"}.
Args:
row: a row in a dataframe.
"""
if "payload" not in row:
raise ValueError("Input DataFrame must contain 'payload' column.")
str_payload = row.payload
payload = None
if not is_file_payload:
payload = json.loads(str_payload)
else:
payload = {file_payload_name: str_payload}
if extra_json_payload:
payload.update(extra_json_payload)
RestApiWriter._logger.debug(f"Original payload: {str_payload}")
RestApiWriter._logger.debug(f"Final payload: {payload}")
response = execute_api_request(
method=method,
url=url,
headers=headers,
basic_auth_dict=basic_auth_dict,
json=payload if not is_file_payload else None,
files=payload if is_file_payload else None,
sleep_seconds=sleep_seconds,
)
RestApiWriter._logger.debug(
f"Response: {response.status_code} - {response.text}"
)
if response.status_code not in success_status_codes:
raise RESTApiException(
f"API response status code {response.status_code} is not in"
f" {success_status_codes}. Got {response.text}"
)
return send_payload_to_rest_api
@staticmethod
def _write_to_rest_api_in_batch_mode(
df: DataFrame, output_spec: OutputSpec
) -> None:
"""Write to REST API in Spark batch mode.
This function uses the dataframe.foreach function to generate a payload
for each row of the dataframe and send it to the REST API endpoint.
Warning! Make sure your execution environment supports RDD API operations,
as there are environments where RDD operations may not be supported. Because
df.foreach() is a shorthand for df.rdd.foreach(), this can cause issues
in such environments.
Args:
df: dataframe to write.
output_spec: output specification.
"""
df.foreach(RestApiWriter._get_func_to_send_payload_to_rest_api(output_spec))
@staticmethod
def _write_to_rest_api_in_streaming_mode(
df: DataFrame, output_spec: OutputSpec, data: OrderedDict
) -> None:
"""Write to REST API in streaming mode.
Args:
df: dataframe to write.
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
"""
df_writer = df.writeStream.trigger(**Writer.get_streaming_trigger(output_spec))
stream_df = (
df_writer.options(**output_spec.options if output_spec.options else {})
.foreachBatch(
RestApiWriter._write_transformed_micro_batch(output_spec, data)
)
.start()
)
if output_spec.streaming_await_termination:
stream_df.awaitTermination(output_spec.streaming_await_termination_timeout)
@staticmethod
def _write_transformed_micro_batch( # type: ignore
output_spec: OutputSpec, data: OrderedDict
) -> Callable:
"""Define how to write a streaming micro batch after transforming it.
Args:
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
Returns:
A function to be executed in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
transformed_df = Writer.get_transformed_micro_batch(
output_spec, batch_df, batch_id, data
)
if output_spec.streaming_micro_batch_dq_processors:
transformed_df = Writer.run_micro_batch_dq_process(
transformed_df, output_spec.streaming_micro_batch_dq_processors
)
RestApiWriter._write_to_rest_api_in_batch_mode(transformed_df, output_spec)
return inner
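# Illustrative sketch (not part of the original module): preparing the single
# "payload" JSON-string column this writer expects, together with the kind of
# options it reads from the output specification. The url, header and column
# names are assumptions; the option keys mirror the ones read in this module.
def _example_rest_api_payload_and_options(df: DataFrame) -> tuple:
    from pyspark.sql.functions import struct, to_json
    payload_df = df.select(to_json(struct(*df.columns)).alias("payload"))
    options = {
        "rest_api_url": "https://api.example.com/v1/records",
        "rest_api_method": RestMethods.POST.value,
        "rest_api_header": {"Content-Type": "application/json"},
        "rest_api_sleep_seconds": 1,
        "rest_api_success_status_codes": RestStatusCodes.OK_STATUS_CODES.value,
    }
    return payload_df, options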
================================================
FILE: lakehouse_engine/io/writers/sharepoint_writer.py
================================================
"""Module to define the behaviour to write to Sharepoint."""
import os
from typing import OrderedDict
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import OutputSpec
from lakehouse_engine.io.exceptions import (
EndpointNotFoundException,
NotSupportedException,
WriteToLocalException,
)
from lakehouse_engine.io.writer import Writer
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.sharepoint_utils import SharepointUtils
class SharepointWriter(Writer):
"""Class to write data to Sharepoint.
This writer is designed specifically for uploading a single file
to Sharepoint. It first writes the data locally before uploading
it to the specified Sharepoint location. Since it handles only
a single file at a time, any logic for writing multiple files
must be implemented on the notebook side.
"""
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct FileWriter instances.
Args:
output_spec: output specification
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
self.sharepoint_utils = self._get_sharepoint_utils()
self._logger = LoggingHandler(__name__).get_logger()
def write(self) -> None:
"""Upload data to Sharepoint."""
if self._df.isStreaming:
raise NotSupportedException("Sharepoint writer doesn't support streaming!")
self._output_spec.sharepoint_opts.validate_for_writer()
if not self.sharepoint_utils.check_if_endpoint_exists(
folder_root_path=self._output_spec.sharepoint_opts.folder_relative_path
):
raise EndpointNotFoundException("The provided endpoint does not exist!")
self._write_to_sharepoint_in_batch_mode(self._df)
def _get_sharepoint_utils(self) -> SharepointUtils:
sharepoint_utils = SharepointUtils(
client_id=self._output_spec.sharepoint_opts.client_id,
tenant_id=self._output_spec.sharepoint_opts.tenant_id,
local_path=self._output_spec.sharepoint_opts.local_path,
api_version=self._output_spec.sharepoint_opts.api_version,
site_name=self._output_spec.sharepoint_opts.site_name,
drive_name=self._output_spec.sharepoint_opts.drive_name,
file_name=self._output_spec.sharepoint_opts.file_name,
folder_relative_path=self._output_spec.sharepoint_opts.folder_relative_path,
chunk_size=self._output_spec.sharepoint_opts.chunk_size,
local_options=self._output_spec.sharepoint_opts.local_options,
secret=self._output_spec.sharepoint_opts.secret,
conflict_behaviour=self._output_spec.sharepoint_opts.conflict_behaviour,
)
return sharepoint_utils
def _write_to_sharepoint_in_batch_mode(self, df: DataFrame) -> None:
"""Write to Sharepoint in batch mode.
This method first writes the provided DataFrame to a local file using the
SharePointUtils `write_to_local_path` method. If the local file is successfully
written, it then uploads the file to Sharepoint using the `write_to_sharepoint`
method, logging the process and outcome.
Args:
df: The DataFrame to write to a local file and subsequently
upload to Sharepoint.
"""
local_path = self._output_spec.sharepoint_opts.local_path
file_name = self._output_spec.sharepoint_opts.file_name
self._logger.info(f"Starting to write the data to the local path: {local_path}")
try:
self.sharepoint_utils.write_to_local_path(df)
except IOError as err:
self.sharepoint_utils.delete_local_path()
self._logger.info(f"Deleted the local folder: {local_path}")
raise WriteToLocalException(
f"The data was not written on the local path: {local_path}"
) from err
self._logger.info(f"The data was written to the local path: {local_path}")
file_size = os.path.getsize(local_path)
self._logger.info(
f"Uploading the {file_name} ({file_size} bytes) to Sharepoint."
)
self.sharepoint_utils.write_to_sharepoint()
self._logger.info(f"The {file_name} was uploaded to Sharepoint with success!")
self.sharepoint_utils.delete_local_path()
self._logger.info(f"Deleted the local folder: {local_path}")
================================================
FILE: lakehouse_engine/io/writers/table_writer.py
================================================
"""Module that defines the behaviour to write to tables."""
from typing import Any, Callable, OrderedDict
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import OutputFormat, OutputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.writer import Writer
class TableWriter(Writer):
"""Class to write to a table."""
def __init__(self, output_spec: OutputSpec, df: DataFrame, data: OrderedDict):
"""Construct TableWriter instances.
Args:
output_spec: output specification.
df: dataframe to be written.
data: list of all dfs generated on previous steps before writer.
"""
super().__init__(output_spec, df, data)
def write(self) -> None:
"""Write data to a table.
After the write operation we repair the table (e.g., update partitions).
However, there is a caveat: the repair operation is not reachable when
running in long-running streaming mode. Therefore, we recommend not using
the TableWriter with formats other than delta lake for those scenarios
(as delta lake does not need MSCK REPAIR). So, you can: 1) use the delta
lake format for the table; or 2) use the FileWriter
and run the repair with a certain frequency in a separate task of your
pipeline.
"""
if not self._df.isStreaming:
self._write_to_table_in_batch_mode(self._df, self._output_spec)
else:
df_writer = self._df.writeStream.trigger(
**Writer.get_streaming_trigger(self._output_spec)
)
if (
self._output_spec.streaming_micro_batch_transformers
or self._output_spec.streaming_micro_batch_dq_processors
):
stream_df = (
df_writer.options(
**self._output_spec.options if self._output_spec.options else {}
)
.foreachBatch(
self._write_transformed_micro_batch(
self._output_spec, self._data
)
)
.start()
)
if self._output_spec.streaming_await_termination:
stream_df.awaitTermination(
self._output_spec.streaming_await_termination_timeout
)
else:
self._write_to_table_in_streaming_mode(df_writer, self._output_spec)
if (
self._output_spec.data_format != OutputFormat.DELTAFILES.value
and self._output_spec.partitions
):
ExecEnv.SESSION.sql(f"MSCK REPAIR TABLE {self._output_spec.db_table}")
@staticmethod
def _write_to_table_in_batch_mode(df: DataFrame, output_spec: OutputSpec) -> None:
"""Write to a metastore table in batch mode.
Args:
df: dataframe to write.
output_spec: output specification.
"""
df_writer = df.write.format(output_spec.data_format)
if output_spec.partitions:
df_writer = df_writer.partitionBy(output_spec.partitions)
if output_spec.location:
df_writer = df_writer.options(
path=output_spec.location,
**output_spec.options if output_spec.options else {},
)
else:
df_writer = df_writer.options(
**output_spec.options if output_spec.options else {}
)
df_writer.mode(output_spec.write_type).saveAsTable(output_spec.db_table)
@staticmethod
def _write_to_table_in_streaming_mode(
df_writer: Any, output_spec: OutputSpec
) -> None:
"""Write to a metastore table in streaming mode.
Args:
df_writer: dataframe writer.
output_spec: output specification.
"""
df_writer = df_writer.outputMode(output_spec.write_type).format(
output_spec.data_format
)
if output_spec.partitions:
df_writer = df_writer.partitionBy(output_spec.partitions)
if output_spec.location:
df_writer = df_writer.options(
path=output_spec.location,
**output_spec.options if output_spec.options else {},
)
else:
df_writer = df_writer.options(
**output_spec.options if output_spec.options else {}
)
if output_spec.streaming_await_termination:
df_writer.toTable(output_spec.db_table).awaitTermination(
output_spec.streaming_await_termination_timeout
)
else:
df_writer.toTable(output_spec.db_table)
@staticmethod
def _write_transformed_micro_batch( # type: ignore
output_spec: OutputSpec, data: OrderedDict
) -> Callable:
"""Define how to write a streaming micro batch after transforming it.
Args:
output_spec: output specification.
data: list of all dfs generated on previous steps before writer.
Returns:
A function to be executed in the foreachBatch spark write method.
"""
def inner(batch_df: DataFrame, batch_id: int) -> None:
ExecEnv.get_for_each_batch_session(batch_df)
transformed_df = Writer.get_transformed_micro_batch(
output_spec, batch_df, batch_id, data
)
if output_spec.streaming_micro_batch_dq_processors:
transformed_df = Writer.run_micro_batch_dq_process(
transformed_df, output_spec.streaming_micro_batch_dq_processors
)
TableWriter._write_to_table_in_batch_mode(transformed_df, output_spec)
return inner
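# Illustrative sketch (not part of the original module): the batch table write
# performed by _write_to_table_in_batch_mode. The table name, external location
# and partition column below are assumptions.
def _example_batch_table_write(df: DataFrame) -> None:
    (
        df.write.format("delta")
        .partitionBy("date")
        .options(path="s3://bucket/sales")
        .mode("overwrite")
        .saveAsTable("analytics.sales")
    )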
================================================
FILE: lakehouse_engine/terminators/__init__.py
================================================
"""Package to define algorithm terminators (e.g., vacuum, optimize, compute stats)."""
================================================
FILE: lakehouse_engine/terminators/cdf_processor.py
================================================
"""Defines change data feed processor behaviour."""
from datetime import datetime, timedelta
from typing import OrderedDict
from delta.tables import DeltaTable
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, date_format
from lakehouse_engine.core.definitions import (
InputSpec,
OutputFormat,
OutputSpec,
ReadType,
TerminatorSpec,
WriteType,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.reader_factory import ReaderFactory
from lakehouse_engine.io.writer_factory import WriterFactory
from lakehouse_engine.utils.logging_handler import LoggingHandler
class CDFProcessor(object):
"""Change data feed processor class."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def expose_cdf(cls, spec: TerminatorSpec) -> None:
"""Expose CDF to external location.
Args:
spec: terminator specification.
"""
cls._logger.info("Reading CDF from input table...")
df_cdf = ReaderFactory.get_data(cls._get_table_cdf_input_specs(spec))
new_df_cdf = df_cdf.withColumn(
"_commit_timestamp",
date_format(col("_commit_timestamp"), "yyyyMMddHHmmss"),
)
cls._logger.info("Writing CDF to external table...")
cls._write_cdf_to_external(
spec,
new_df_cdf.repartition(
spec.args.get(
"materialized_cdf_num_partitions", col("_commit_timestamp")
)
),
)
# used to delete old data on CDF table (don't remove parquet).
if spec.args.get("clean_cdf", True):
cls._logger.info("Cleaning CDF table...")
cls.delete_old_data(spec)
# used to delete old parquet files.
if spec.args.get("vacuum_cdf", False):
cls._logger.info("Vacuuming CDF table...")
cls.vacuum_cdf_data(spec)
@staticmethod
def _write_cdf_to_external(
spec: TerminatorSpec, df: DataFrame, data: OrderedDict = None
) -> None:
"""Write cdf results dataframe.
Args:
spec: terminator specification.
df: dataframe with cdf results to write.
data: list of all dfs generated on previous steps before writer.
"""
WriterFactory.get_writer(
spec=OutputSpec(
spec_id="materialized_cdf",
input_id="input_table",
location=spec.args["materialized_cdf_location"],
write_type=WriteType.APPEND.value,
data_format=spec.args.get("data_format", OutputFormat.DELTAFILES.value),
options=spec.args["materialized_cdf_options"],
partitions=["_commit_timestamp"],
),
df=df,
data=data,
).write()
@staticmethod
def _get_table_cdf_input_specs(spec: TerminatorSpec) -> InputSpec:
"""Get the input specifications from a terminator spec.
Args:
spec: terminator specifications.
Returns:
List of input specifications.
"""
options = {
"readChangeFeed": "true",
**spec.args.get("db_table_options", {}),
}
input_specs = InputSpec(
spec_id="input_table",
db_table=spec.args["db_table"],
read_type=ReadType.STREAMING.value,
data_format=OutputFormat.DELTAFILES.value,
options=options,
)
return input_specs
@classmethod
def delete_old_data(cls, spec: TerminatorSpec) -> None:
"""Delete old data from cdf delta table.
Args:
spec: terminator specifications.
"""
today_datetime = datetime.today()
limit_date = today_datetime + timedelta(
days=spec.args.get("days_to_keep", 30) * -1
)
limit_timestamp = limit_date.strftime("%Y%m%d%H%M%S")
cdf_delta_table = DeltaTable.forPath(
ExecEnv.SESSION, spec.args["materialized_cdf_location"]
)
cdf_delta_table.delete(col("_commit_timestamp") < limit_timestamp)
@classmethod
def vacuum_cdf_data(cls, spec: TerminatorSpec) -> None:
"""Vacuum old data from cdf delta table.
Args:
spec: terminator specifications.
"""
cdf_delta_table = DeltaTable.forPath(
ExecEnv.SESSION, spec.args["materialized_cdf_location"]
)
cdf_delta_table.vacuum(spec.args.get("vacuum_hours", 168))
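# Illustrative sketch (not part of the original module): a terminator
# specification that could drive expose_cdf. The table name, location and
# retention values are assumptions; the keys mirror the ones read in this module.
def _example_expose_cdf_spec() -> TerminatorSpec:
    return TerminatorSpec(
        function="expose_cdf",
        args={
            "db_table": "analytics.sales",
            "materialized_cdf_location": "s3://bucket/cdf/sales",
            "materialized_cdf_options": {"mergeSchema": "true"},
            "clean_cdf": True,
            "days_to_keep": 30,
            "vacuum_cdf": False,
        },
    )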
================================================
FILE: lakehouse_engine/terminators/dataset_optimizer.py
================================================
"""Module with dataset optimizer terminator."""
from typing import List, Optional
from pyspark.sql.utils import AnalysisException, ParseException
from lakehouse_engine.core.table_manager import TableManager
from lakehouse_engine.transformers.exceptions import WrongArgumentsException
from lakehouse_engine.utils.logging_handler import LoggingHandler
class DatasetOptimizer(object):
"""Class with dataset optimizer terminator."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def optimize_dataset(
cls,
db_table: Optional[str] = None,
location: Optional[str] = None,
compute_table_stats: bool = True,
vacuum: bool = True,
vacuum_hours: int = 720,
optimize: bool = True,
optimize_where: Optional[str] = None,
optimize_zorder_col_list: Optional[List[str]] = None,
debug: bool = False,
) -> None:
"""Optimize a dataset based on a set of pre-conceived optimizations.
Most of the time the dataset is a table, but it can also be a purely file-based dataset.
Args:
db_table: `database_name.table_name`.
location: dataset/table filesystem location.
compute_table_stats: to compute table statistics or not.
vacuum: (delta lake tables only) whether to vacuum the delta lake
table or not.
vacuum_hours: (delta lake tables only) number of hours to consider
in vacuum operation.
optimize: (delta lake tables only) whether to optimize the table or
not. Custom optimize parameters can be supplied through ExecEnv (Spark)
configs
optimize_where: expression to use in the optimize function.
optimize_zorder_col_list: (delta lake tables only) list of
columns to consider in the zorder optimization process. Custom optimize
parameters can be supplied through ExecEnv (Spark) configs.
debug: flag indicating if we are just debugging this for local
tests and therefore pass through all the exceptions to perform some
assertions in local tests.
"""
if optimize:
if debug:
try:
cls._optimize(
db_table, location, optimize_where, optimize_zorder_col_list
)
except ParseException:
pass
else:
cls._optimize(
db_table, location, optimize_where, optimize_zorder_col_list
)
if vacuum:
cls._vacuum(db_table, location, vacuum_hours)
if compute_table_stats:
if debug:
try:
cls._compute_table_stats(db_table)
except AnalysisException:
pass
else:
cls._compute_table_stats(db_table)
@classmethod
def _compute_table_stats(cls, db_table: str) -> None:
"""Compute table statistics.
Args:
db_table: `database_name.table_name` string.
"""
if not db_table:
raise WrongArgumentsException("A table needs to be provided.")
config = {"function": "compute_table_statistics", "table_or_view": db_table}
cls._logger.info(f"Computing table statistics for {db_table}...")
TableManager(config).compute_table_statistics()
@classmethod
def _vacuum(cls, db_table: str, location: str, hours: int) -> None:
"""Vacuum a delta table.
Args:
db_table: `database_name.table_name` string. Takes precedence over location.
location: location of the delta table.
hours: number of hours to consider in vacuum operation.
"""
if not db_table and not location:
raise WrongArgumentsException("A table or location need to be provided.")
table_or_location = db_table if db_table else f"delta.`{location}`"
config = {
"function": "compute_table_statistics",
"table_or_view": table_or_location,
"vacuum_hours": hours,
}
cls._logger.info(f"Vacuuming table {table_or_location}...")
TableManager(config).vacuum()
@classmethod
def _optimize(
cls, db_table: str, location: str, where: str, zorder_cols: List[str]
) -> None:
"""Optimize a delta table.
Args:
db_table: `database_name.table_name` string. Takes precedence over location.
location: location of the delta table.
where: expression to use in the optimize function.
zorder_cols: list of columns to consider in the zorder optimization process.
"""
if not db_table and not location:
raise WrongArgumentsException("A table or location needs to be provided.")
table_or_location = db_table if db_table else f"delta.`{location}`"
config = {
"function": "compute_table_statistics",
"table_or_view": table_or_location,
"optimize_where": where,
"optimize_zorder_col_list": ",".join(zorder_cols if zorder_cols else []),
}
cls._logger.info(f"Optimizing table {table_or_location}...")
TableManager(config).optimize()
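# Illustrative sketch (not part of the original module): invoking the optimizer
# on a delta table. The table name and zorder columns below are assumptions.
def _example_optimize_dataset() -> None:
    DatasetOptimizer.optimize_dataset(
        db_table="analytics.sales",
        compute_table_stats=True,
        vacuum=True,
        vacuum_hours=720,
        optimize=True,
        optimize_zorder_col_list=["customer_id", "order_date"],
    )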
================================================
FILE: lakehouse_engine/terminators/notifier.py
================================================
"""Module with notification terminator."""
from abc import ABC, abstractmethod
from jinja2 import Template
from lakehouse_engine.core.definitions import (
NotificationRuntimeParameters,
TerminatorSpec,
)
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.terminators.notifiers.notification_templates import (
NotificationsTemplates,
)
from lakehouse_engine.utils.databricks_utils import DatabricksUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Notifier(ABC):
"""Abstract Notification class."""
_logger = LoggingHandler(__name__).get_logger()
def __init__(self, notification_spec: TerminatorSpec):
"""Construct Notification instances.
Args:
notification_spec: notification specification.
"""
self.type = notification_spec.args.get("type")
self.notification = notification_spec.args
@abstractmethod
def create_notification(self) -> None:
"""Abstract create notification method."""
raise NotImplementedError
@abstractmethod
def send_notification(self) -> None:
"""Abstract send notification method."""
raise NotImplementedError
def _render_notification_field(self, template_field: str) -> str:
"""Render the notification given args.
Args:
template_field: Message with templates to be replaced.
Returns:
Rendered field
"""
args = {}
field_template = Template(template_field)
if (
NotificationRuntimeParameters.DATABRICKS_JOB_NAME.value in template_field
or NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID.value
in template_field
or NotificationRuntimeParameters.JOB_EXCEPTION.value in template_field
):
workspace_id, job_name = DatabricksUtils.get_databricks_job_information(
ExecEnv.SESSION
)
args["databricks_job_name"] = job_name
args["databricks_workspace_id"] = workspace_id
args["exception"] = self.notification.get("exception")
return field_template.render(args)
@staticmethod
def check_if_notification_is_failure_notification(
spec: TerminatorSpec,
) -> bool:
"""Check if given notification is a failure notification.
Args:
spec: spec to validate if it is a failure notification.
Returns:
A boolean telling if the notification is a failure notification
"""
notification = spec.args
is_notification_failure_notification: bool = False
if "template" in notification.keys():
template: dict = NotificationsTemplates.EMAIL_NOTIFICATIONS_TEMPLATES.get(
notification["template"], {}
)
if template:
is_notification_failure_notification = notification.get(
"on_failure", True
)
else:
raise ValueError(f"""Template {notification["template"]} not found.""")
else:
is_notification_failure_notification = notification.get("on_failure", True)
return is_notification_failure_notification
================================================
FILE: lakehouse_engine/terminators/notifier_factory.py
================================================
"""Module for notifier factory."""
from lakehouse_engine.core.definitions import NotifierType, TerminatorSpec
from lakehouse_engine.terminators.notifier import Notifier
from lakehouse_engine.terminators.notifiers.email_notifier import EmailNotifier
from lakehouse_engine.terminators.notifiers.exceptions import NotifierNotFoundException
class NotifierFactory(object):
"""Class for notification factory."""
NOTIFIER_TYPES = {NotifierType.EMAIL.value: EmailNotifier}
@classmethod
def get_notifier(cls, spec: TerminatorSpec) -> Notifier:
"""Get a notifier according to the terminator specs using a factory.
Args:
spec: terminator specification.
Returns:
Notifier: notifier that will handle notifications.
"""
notifier_name = spec.args.get("type")
notifier = cls.NOTIFIER_TYPES.get(notifier_name)
if notifier:
return notifier(notification_spec=spec)
else:
raise NotifierNotFoundException(
f"The requested notification format {notifier_name} is not supported."
)
@staticmethod
def generate_failure_notification(spec: list, exception: Exception) -> None:
"""Check if it is necessary to send a failure notification and generate it.
Args:
spec: List of termination specs
exception: Exception that caused the failure.
"""
notification_specs = []
for terminator in spec:
if terminator.function == "notify":
notification_specs.append(terminator)
for notification in notification_specs:
failure_notification_spec = notification.args
generate_failure_notification = failure_notification_spec.get(
"generate_failure_notification", False
)
if generate_failure_notification or (
Notifier.check_if_notification_is_failure_notification(notification)
):
failure_notification_spec["exception"] = str(exception)
if generate_failure_notification:
failure_notification_spec["template"] = (
f"""failure_notification_{failure_notification_spec["type"]}"""
)
failure_spec = TerminatorSpec(
function="notification", args=failure_notification_spec
)
notifier = NotifierFactory.get_notifier(failure_spec)
notifier.create_notification()
notifier.send_notification()
================================================
FILE: lakehouse_engine/terminators/notifiers/__init__.py
================================================
"""Notifications module."""
================================================
FILE: lakehouse_engine/terminators/notifiers/email_notifier.py
================================================
"""Module with email notifier."""
import asyncio
import smtplib
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from posixpath import basename
from typing import Any
from lakehouse_engine.core.definitions import TerminatorSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.terminators.notifier import Notifier
from lakehouse_engine.terminators.notifiers.exceptions import (
NotifierConfigException,
NotifierTemplateNotFoundException,
)
from lakehouse_engine.terminators.notifiers.notification_templates import (
NotificationsTemplates,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
class EmailNotifier(Notifier):
"""Base Notification class."""
_logger = LoggingHandler(__name__).get_logger()
def __init__(self, notification_spec: TerminatorSpec):
"""Construct Email Notification instance.
Args:
notification_spec: notification specification.
"""
super().__init__(notification_spec)
def create_notification(self) -> None:
"""Creates the notification to be sent."""
if "template" in self.notification.keys():
template: dict = NotificationsTemplates.EMAIL_NOTIFICATIONS_TEMPLATES.get(
self.notification["template"], {}
)
if template:
self.notification["message"] = self._render_notification_field(
template["message"]
)
self.notification["subject"] = self._render_notification_field(
template["subject"]
)
self.notification["mimetype"] = template["mimetype"]
else:
raise NotifierTemplateNotFoundException(
f"""Template {self.notification["template"]} does not exist"""
)
elif "message" in self.notification.keys():
self.notification["message"] = self._render_notification_field(
self.notification["message"]
)
self.notification["subject"] = self._render_notification_field(
self.notification["subject"]
)
else:
raise NotifierConfigException("Malformed Notification Definition")
def send_notification(self) -> None:
"""Sends the notification by using a series of methods."""
self._validate_email_notification()
server = self.notification["server"]
notification_office_email_servers = ["smtp.office365.com"]
if (
ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers is not None
and server in ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers
):
raise NotifierConfigException(
f"Trying to use disallowed smtp server: '{server}'.\n"
f"Disallowed smtp servers: "
f"{str(ExecEnv.ENGINE_CONFIG.notif_disallowed_email_servers)}"
)
elif server in notification_office_email_servers:
self._authenticate_and_send_office365()
else:
self._authenticate_and_send_simple_smtp()
def _authenticate_and_send_office365(self) -> None:
"""Authenticates and sends an email notification using Graph API."""
from azure.identity.aio import ClientSecretCredential
from msgraph import GraphServiceClient
self._logger.info("Attempting authentication using Graph API.")
request_body = self._create_graph_api_email_body()
self._logger.info(f"Sending notification email with body: {request_body}")
credential = ClientSecretCredential(
tenant_id=self.notification["tenant_id"],
client_id=self.notification["user"],
client_secret=self.notification["password"],
)
client = GraphServiceClient(credentials=credential)
import nest_asyncio
nest_asyncio.apply()
asyncio.get_event_loop().run_until_complete(
client.users.by_user_id(self.notification["from"]).send_mail.post(
body=request_body
)
)
self._logger.info("Notification email sent successfully.")
def _authenticate_and_send_simple_smtp(self) -> None:
"""Authenticates and sends an email notification using simple authentication."""
with smtplib.SMTP(
self.notification["server"], self.notification["port"]
) as smtp:
try:
smtp.starttls()
smtp.login(
self.notification.get("user", ""),
self.notification.get("password", ""),
)
except smtplib.SMTPException as e:
self._logger.exception(
f"Exception while authenticating to smtp: {str(e)}"
)
self._logger.exception(
"Attempting to send the notification without authentication"
)
mesg = MIMEMultipart()
mesg["From"] = self.notification["from"]
to = self.notification.get("to", [])
cc = self.notification.get("cc", [])
bcc = self.notification.get("bcc", [])
mesg["To"] = ", ".join(to)
mesg["CC"] = ", ".join(cc)
mesg["BCC"] = ", ".join(bcc)
mesg["Subject"] = self.notification["subject"]
mesg["Importance"] = self._get_importance(
self.notification.get("importance", "normal")
)
match self.notification.get("mimetype", "plain"):
case "html" | "text/html":
mimetype = "html"
case "text" | "text/plain" | "plain" | "text/text":
mimetype = "text"
case _:
self._logger.warning(
f"""Unknown mimetype '{self.notification["mimetype"]}' """
f"provided. Defaulting to 'plain'."
)
mimetype = "text"
body = MIMEText(self.notification["message"], mimetype)
mesg.attach(body)
for f in self.notification.get("attachments", []):
with open(f, "rb") as fil:
part = MIMEApplication(fil.read(), Name=basename(f))
part["Content-Disposition"] = 'attachment; filename="%s"' % basename(f)
mesg.attach(part)
try:
smtp.sendmail(
self.notification["from"], to + cc + bcc, mesg.as_string()
)
self._logger.info("Email sent successfully.")
except smtplib.SMTPException as e:
self._logger.exception(f"Exception while sending email: {str(e)}")
def _validate_email_notification(self) -> None:
"""Validates the email notification."""
if not self.notification.get("from"):
raise NotifierConfigException(
"Email notification must contain 'from' field."
)
if not self.notification.get("server"):
raise NotifierConfigException(
"Email notification must contain 'server' field."
)
if not self.notification.get("port"):
raise NotifierConfigException(
"Email notification must contain 'port' field."
)
if (
not self.notification.get("to")
and not self.notification.get("cc")
and not self.notification.get("bcc")
):
raise NotifierConfigException(
"No recipients provided. Please provide at least one recipient."
)
def _get_importance(self, importance: str) -> Any:
"""Get the importance of the email notification.
Args:
importance: Importance level of the email.
Returns:
Importance level for the email notification.
"""
from msgraph.generated.models.importance import Importance
match importance:
case "critical" | "high":
return Importance.High
case "normal":
return Importance.Normal
case "low":
return Importance.Low
case _:
self._logger.warning(
f"""Unknown importance '{importance}' provided. """
f"Defaulting to 'normal'."
)
return Importance.Normal
def _create_graph_api_email_body(self) -> Any:
"""Create the email body for the Graph API.
Returns:
Email body for the Graph API.
"""
from msgraph.generated.models.body_type import BodyType
from msgraph.generated.models.file_attachment import FileAttachment
from msgraph.generated.models.item_body import ItemBody
from msgraph.generated.models.message import Message
from msgraph.generated.users.item.send_mail.send_mail_post_request_body import (
SendMailPostRequestBody,
)
request_body = SendMailPostRequestBody()
message = Message()
message.subject = self.notification["subject"]
message_body = ItemBody()
message_body.content = self.notification["message"]
match self.notification.get("mimetype", "plain"):
case "html" | "text/html":
message_body.content_type = BodyType.Html
case "text" | "text/plain" | "plain" | "text/text":
message_body.content_type = BodyType.Text
case _:
self._logger.warning(
f"""Unknown mimetype '{self.notification["mimetype"]}' """
f"provided. Defaulting to 'text'."
)
message_body.content_type = BodyType.Text
message.body = message_body
attachments = []
for attachment_file in self.notification.get("attachments", []):
attachment_name = attachment_file.split("/")[-1]
with open(attachment_file, "rb") as f:
content = f.read()
attachment = FileAttachment()
attachment.name = attachment_name
attachment.size = len(content)
attachment.content_bytes = content
attachments.append(attachment)
message.attachments = attachments # type: ignore
message.to_recipients = self._set_graph_api_recipients("to")
message.cc_recipients = self._set_graph_api_recipients("cc")
message.bcc_recipients = self._set_graph_api_recipients("bcc")
message.importance = self._get_importance(
self.notification.get("importance", "normal")
)
request_body.message = message
request_body.save_to_sent_items = False
return request_body
def _set_graph_api_recipients(self, recipient_type: str) -> list:
"""Set the recipients for the Graph API.
Args:
recipient_type: Type of recipient (to, cc or bcc).
Returns:
List of recipients for the Graph API.
"""
from msgraph.generated.models.email_address import EmailAddress
from msgraph.generated.models.recipient import Recipient
recipients = []
for email in self.notification.get(recipient_type, []):
recipient = Recipient()
recipient_address = EmailAddress()
recipient_address.address = email
recipient.email_address = recipient_address
recipients.append(recipient)
return recipients
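# Illustrative sketch (not part of the original module): a notification
# specification this notifier can consume via simple SMTP. The server, port,
# addresses and message are assumptions; the keys mirror the ones read above.
def _example_email_notification_spec() -> TerminatorSpec:
    return TerminatorSpec(
        function="notify",
        args={
            "type": "email",
            "server": "smtp.example.com",
            "port": 587,
            "from": "noreply@example.com",
            "to": ["team@example.com"],
            "subject": "Daily load finished",
            "message": "The daily load has finished successfully.",
            "mimetype": "plain",
        },
    )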
================================================
FILE: lakehouse_engine/terminators/notifiers/exceptions.py
================================================
"""Package defining all the Notifier custom exceptions."""
class NotifierNotFoundException(Exception):
"""Exception for when the notifier is not found."""
pass
class NotifierConfigException(Exception):
"""Exception for when the notifier configuration is invalid."""
pass
class NotifierTemplateNotFoundException(Exception):
"""Exception for when the notifier is not found."""
pass
class NotifierTemplateConfigException(Exception):
"""Exception for when the notifier config is incorrect."""
pass
================================================
FILE: lakehouse_engine/terminators/notifiers/notification_templates.py
================================================
"""Email notification templates."""
class NotificationsTemplates(object):
"""Templates for notifications."""
EMAIL_NOTIFICATIONS_TEMPLATES = {
"failure_notification_email": {
"subject": "Service Failure",
"mimetype": "text/text",
"message": """
Job {{ databricks_job_name }} in workspace {{ databricks_workspace_id }} has
failed with the exception: {{ exception }}""",
"on_failure": True,
},
}
================================================
FILE: lakehouse_engine/terminators/sensor_terminator.py
================================================
"""Module with sensor terminator."""
from typing import List
from lakehouse_engine.core.definitions import SensorSpec, SensorStatus
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.core.sensor_manager import SensorControlTableManager
from lakehouse_engine.utils.logging_handler import LoggingHandler
class SensorTerminator(object):
"""Sensor Terminator class."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def update_sensor_status(
cls,
sensor_id: str,
control_db_table_name: str,
status: str = SensorStatus.PROCESSED_NEW_DATA.value,
assets: List[str] = None,
) -> None:
"""Update internal sensor status.
Update the sensor status in the control table. It should be used to tell the
system that the sensor has processed all the new data that was previously
identified, hence shifting the sensor status.
It is usually used to move from `SensorStatus.ACQUIRED_NEW_DATA` to
`SensorStatus.PROCESSED_NEW_DATA`, but there might be scenarios - still
to identify - where we can update the sensor status from/to different statuses.
Args:
sensor_id: sensor id.
control_db_table_name: `db.table` to store sensor checkpoints.
status: status of the sensor.
assets: a list of assets that are considered as available to
consume downstream after this sensor has status
PROCESSED_NEW_DATA.
"""
if status not in [s.value for s in SensorStatus]:
raise NotImplementedError(f"Status {status} not accepted in sensor.")
ExecEnv.get_or_create(app_name="update_sensor_status")
SensorControlTableManager.update_sensor_status(
sensor_spec=SensorSpec(
sensor_id=sensor_id,
control_db_table_name=control_db_table_name,
assets=assets,
input_spec=None,
preprocess_query=None,
checkpoint_location=None,
),
status=status,
)
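# Illustrative sketch (not part of the original module): marking a sensor as
# processed after downstream consumption. The sensor id, control table and
# asset names below are assumptions.
def _example_update_sensor_status() -> None:
    SensorTerminator.update_sensor_status(
        sensor_id="sales_upstream_sensor",
        control_db_table_name="lakehouse.sensor_control",
        status=SensorStatus.PROCESSED_NEW_DATA.value,
        assets=["sales_bronze"],
    )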
================================================
FILE: lakehouse_engine/terminators/spark_terminator.py
================================================
"""Module with spark terminator."""
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.logging_handler import LoggingHandler
class SparkTerminator(object):
"""Spark Terminator class."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def terminate_spark(cls) -> None:
"""Terminate spark session."""
cls._logger.info("Terminating spark session...")
ExecEnv.SESSION.stop()
================================================
FILE: lakehouse_engine/terminators/terminator_factory.py
================================================
"""Module with the factory pattern to return terminators."""
from typing import Optional
from pyspark.sql import DataFrame
from lakehouse_engine.core.definitions import TerminatorSpec
from lakehouse_engine.terminators.notifier import Notifier
from lakehouse_engine.terminators.notifier_factory import NotifierFactory
from lakehouse_engine.utils.logging_handler import LoggingHandler
class TerminatorFactory(object):
"""TerminatorFactory class following the factory pattern."""
_logger = LoggingHandler(__name__).get_logger()
@staticmethod
def execute_terminator(
spec: TerminatorSpec, df: Optional[DataFrame] = None
) -> None:
"""Execute a terminator following the factory pattern.
Args:
spec: terminator specification.
df: dataframe to be used in the terminator. Needed when a
terminator requires one dataframe as input.
"""
if spec.function == "optimize_dataset":
from lakehouse_engine.terminators.dataset_optimizer import DatasetOptimizer
DatasetOptimizer.optimize_dataset(**spec.args)
elif spec.function == "terminate_spark":
from lakehouse_engine.terminators.spark_terminator import SparkTerminator
SparkTerminator.terminate_spark()
elif spec.function == "expose_cdf":
from lakehouse_engine.terminators.cdf_processor import CDFProcessor
CDFProcessor.expose_cdf(spec)
elif spec.function == "notify":
if not Notifier.check_if_notification_is_failure_notification(spec):
notifier = NotifierFactory.get_notifier(spec)
notifier.create_notification()
notifier.send_notification()
else:
raise NotImplementedError(
f"The requested terminator {spec.function} is not implemented."
)
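# Illustrative sketch (not part of the original module): dispatching a
# terminator through the factory. The spec arguments below are assumptions.
def _example_execute_terminator() -> None:
    spec = TerminatorSpec(
        function="optimize_dataset",
        args={"db_table": "analytics.sales", "vacuum_hours": 720},
    )
    TerminatorFactory.execute_terminator(spec)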
================================================
FILE: lakehouse_engine/transformers/__init__.py
================================================
"""Package to define transformers available in the lakehouse engine."""
================================================
FILE: lakehouse_engine/transformers/aggregators.py
================================================
"""Aggregators module."""
from typing import Callable
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, max # noqa: A004
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Aggregators(object):
"""Class containing all aggregation functions."""
_logger = LoggingHandler(__name__).get_logger()
@staticmethod
def get_max_value(input_col: str, output_col: str = "latest") -> Callable:
"""Get the maximum value of a given column of a dataframe.
Args:
input_col: name of the input column.
output_col: name of the output column (defaults to "latest").
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='get_max_value')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.select(col(input_col)).agg(max(input_col).alias(output_col))
return inner
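# Illustrative sketch (not part of the original module): using the aggregator
# inside a Spark transform chain. The column names below are assumptions.
def _example_get_max_value(df: DataFrame) -> DataFrame:
    return df.transform(
        Aggregators.get_max_value("order_date", output_col="latest_order_date")
    )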
================================================
FILE: lakehouse_engine/transformers/column_creators.py
================================================
"""Column creators transformers module."""
from typing import Any, Callable, Dict
from pyspark.sql import DataFrame, Window
from pyspark.sql.functions import col, lit, monotonically_increasing_id, row_number
from pyspark.sql.types import IntegerType
from lakehouse_engine.transformers.exceptions import (
UnsupportedStreamingTransformerException,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
class ColumnCreators(object):
"""Class containing all functions that can create columns to add value."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def with_row_id(
cls,
output_col: str = "lhe_row_id",
) -> Callable:
"""Create a sequential but not consecutive id.
Args:
output_col: optional name of the output column.
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='with_row_id')}}
"""
def inner(df: DataFrame) -> DataFrame:
if not df.isStreaming:
return df.withColumn(output_col, monotonically_increasing_id())
else:
raise UnsupportedStreamingTransformerException(
"Transformer with_row_id is not supported in streaming mode."
)
return inner
@classmethod
def with_auto_increment_id(
cls, output_col: str = "lhe_row_id", rdd: bool = True
) -> Callable:
"""Create a sequential and consecutive id.
Args:
output_col: optional name of the output column.
rdd: optional parameter to use spark rdd.
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='with_auto_increment_id')}}
"""
def inner(df: DataFrame) -> DataFrame:
if not df.isStreaming:
if len(df.take(1)) == 0:
# if df is empty we have to prevent the algorithm from failing
return df.withColumn(output_col, lit(None).cast(IntegerType()))
elif rdd:
return (
df.rdd.zipWithIndex()
.toDF()
.select(col("_1.*"), col("_2").alias(output_col))
)
else:
w = Window.orderBy(monotonically_increasing_id())
return df.withColumn(output_col, (row_number().over(w)) - 1)
else:
raise UnsupportedStreamingTransformerException(
"Transformer with_auto_increment_id is not supported in "
"streaming mode."
)
return inner
@classmethod
def with_literals(
cls,
literals: Dict[str, Any],
) -> Callable:
"""Create columns given a map of column names and literal values (constants).
Args:
literals: map of column names and literal values (constants).
Returns:
Callable: A function to be executed in the .transform() spark function.
{{get_example(method_name='with_literals')}}
"""
def inner(df: DataFrame) -> DataFrame:
df_with_literals = df
for name, value in literals.items():
df_with_literals = df_with_literals.withColumn(name, lit(value))
return df_with_literals
return inner
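# Illustrative sketch (not part of the original module): chaining the column
# creators in batch mode. The literal values and column names are assumptions.
def _example_column_creators(df: DataFrame) -> DataFrame:
    return df.transform(ColumnCreators.with_row_id("row_id")).transform(
        ColumnCreators.with_literals({"source_system": "sap", "is_active": True})
    )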
================================================
FILE: lakehouse_engine/transformers/column_reshapers.py
================================================
"""Module with column reshaping transformers."""
from collections import OrderedDict
from typing import Any, Callable, Dict, List, Optional
import pyspark.sql.types as spark_types
from pyspark.sql import DataFrame
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.functions import (
col,
explode_outer,
expr,
from_json,
map_entries,
struct,
to_json,
)
from lakehouse_engine.transformers.exceptions import WrongArgumentsException
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.schema_utils import SchemaUtils
class ColumnReshapers(object):
"""Class containing column reshaping transformers."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def cast(cls, cols: Dict[str, str]) -> Callable:
"""Cast specific columns into the designated type.
Args:
cols: dict with columns and respective target types.
Target types need to have the exact name of spark types:
https://spark.apache.org/docs/latest/sql-ref-datatypes.html
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='cast')}}
"""
def inner(df: DataFrame) -> DataFrame:
cast_df = df
for c, t in cols.items():
cast_df = cast_df.withColumn(c, col(c).cast(getattr(spark_types, t)()))
return cast_df
return inner
@classmethod
def column_selector(cls, cols: OrderedDict) -> Callable:
"""Select specific columns with specific output aliases.
Args:
cols: dict with columns to select and respective aliases.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='column_selector')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.select(*[col(c).alias(a) for c, a in cols.items()])
return inner
@classmethod
def flatten_schema(
cls,
max_level: int = None,
shorten_names: bool = False,
alias: bool = True,
num_chars: int = 7,
ignore_cols: List = None,
) -> Callable:
"""Flatten the schema of the dataframe.
Args:
max_level: level until which you want to flatten the schema.
Default: None.
shorten_names: whether to shorten the names of the prefixes
of the fields being flattened or not. Default: False.
alias: whether to define alias for the columns being flattened
or not. Default: True.
num_chars: number of characters to consider when shortening
the names of the fields. Default: 7.
ignore_cols: columns which you don't want to flatten.
Default: None.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='flatten_schema')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.select(
SchemaUtils.schema_flattener(
schema=df.schema,
max_level=max_level,
shorten_names=shorten_names,
alias=alias,
num_chars=num_chars,
ignore_cols=ignore_cols,
)
)
return inner
@classmethod
def explode_columns(
cls,
explode_arrays: bool = False,
array_cols_to_explode: List[str] = None,
explode_maps: bool = False,
map_cols_to_explode: List[str] = None,
) -> Callable:
"""Explode columns with types like ArrayType and MapType.
        Afterwards, the flatten_schema transformation can be applied, for example
        to flatten an exploded map (as we do for a StructType) or to flatten a
        StructType inside an array.
        We recommend always specifying the columns you want to explode instead of
        exploding all columns.
Args:
explode_arrays: whether you want to explode array columns (True)
or not (False). Default: False.
array_cols_to_explode: array columns which you want to explode.
If you don't specify it will get all array columns and explode them.
Default: None.
explode_maps: whether you want to explode map columns (True)
or not (False). Default: False.
map_cols_to_explode: map columns which you want to explode.
If you don't specify it will get all map columns and explode them.
Default: None.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='explode_columns')}}
"""
def inner(df: DataFrame) -> DataFrame:
if explode_arrays or (array_cols_to_explode is not None):
df = cls._explode_arrays(df, array_cols_to_explode)
if explode_maps or (map_cols_to_explode is not None):
df = cls._explode_maps(df, map_cols_to_explode)
return df
return inner
@classmethod
def _get_columns(
cls,
df: DataFrame,
data_type: Any,
) -> List:
"""Get a list of columns from the dataframe of the data types specified.
Args:
df: input dataframe.
data_type: data type specified.
Returns:
List of columns with the datatype specified.
"""
cols = []
for field in df.schema.fields:
if isinstance(field.dataType, data_type):
cols.append(field.name)
return cols
@classmethod
def with_expressions(cls, cols_and_exprs: Dict[str, str]) -> Callable:
"""Execute Spark SQL expressions to create the specified columns.
This function uses the Spark expr function. [Check here](
https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.expr.html).
Args:
cols_and_exprs: dict with columns and respective expressions to compute
(Spark SQL expressions).
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='with_expressions')}}
"""
def inner(df: DataFrame) -> DataFrame:
enriched_df = df
for c, e in cols_and_exprs.items():
enriched_df = enriched_df.withColumn(c, expr(e))
return enriched_df
return inner
@classmethod
def rename(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:
"""Rename specific columns into the designated name.
Args:
cols: dict with columns and respective target names.
escape_col_names: whether to escape column names (e.g. `/BIC/COL1`) or not.
If True it creates a column with the new name and drop the old one.
If False, uses the native withColumnRenamed Spark function.
Default: True.
Returns:
Function to be called in .transform() spark function.
{{get_example(method_name='rename')}}
"""
def inner(df: DataFrame) -> DataFrame:
renamed_df = df
if escape_col_names:
for old_name, new_name in cols.items():
renamed_df = renamed_df.withColumn(new_name, col(old_name))
renamed_df = renamed_df.drop(old_name)
else:
for old_name, new_name in cols.items():
                    renamed_df = renamed_df.withColumnRenamed(old_name, new_name)
return renamed_df
return inner
@classmethod
def from_avro(
cls,
schema: str = None,
key_col: str = "key",
value_col: str = "value",
options: dict = None,
expand_key: bool = False,
expand_value: bool = True,
) -> Callable:
"""Select all attributes from avro.
Args:
schema: the schema string.
key_col: the name of the key column.
value_col: the name of the value column.
options: extra options (e.g., mode: "PERMISSIVE").
expand_key: whether you want to expand the content inside the key
column or not. Default: false.
expand_value: whether you want to expand the content inside the value
column or not. Default: true.
Returns:
Function to be called in .transform() spark function.
{{get_example(method_name='from_avro')}}
"""
def inner(df: DataFrame) -> DataFrame:
cols_to_select = [
column for column in df.columns if column not in [key_col, value_col]
]
return df.select(
*cols_to_select,
key_col,
from_avro(col(value_col), schema, options if options else None).alias(
value_col
),
).select(
*cols_to_select,
f"{key_col}.*" if expand_key else key_col,
f"{value_col}.*" if expand_value else value_col,
)
return inner
@classmethod
def from_avro_with_registry(
cls,
schema_registry: str,
value_schema: str,
value_col: str = "value",
key_schema: str = None,
key_col: str = "key",
expand_key: bool = False,
expand_value: bool = True,
options: dict = None,
) -> Callable:
"""Select all attributes from avro using a schema registry.
Args:
schema_registry: the url to the schema registry.
value_schema: the name of the value schema entry in the schema registry.
value_col: the name of the value column.
key_schema: the name of the key schema entry in the schema
registry. Default: None.
key_col: the name of the key column.
expand_key: whether you want to expand the content inside the key
column or not. Default: false.
expand_value: whether you want to expand the content inside the value
column or not. Default: true.
options: extra options (e.g., mode: "PERMISSIVE").
Returns:
Function to be called in .transform() spark function.
{{get_example(method_name='from_avro_with_registry')}}
"""
def inner(df: DataFrame) -> DataFrame:
cols_to_select = [
column for column in df.columns if column not in [key_col, value_col]
]
return df.select( # type: ignore
*cols_to_select,
(
from_avro(
data=col(key_col),
subject=key_schema,
schemaRegistryAddress=schema_registry, # type: ignore
options=options if options else None,
).alias(key_col)
if key_schema
else key_col
),
from_avro(
data=col(value_col),
subject=value_schema,
schemaRegistryAddress=schema_registry, # type: ignore
options=options if options else None,
).alias(value_col),
).select(
*cols_to_select,
f"{key_col}.*" if expand_key else key_col,
f"{value_col}.*" if expand_value else value_col,
)
return inner
@classmethod
def from_json(
cls,
input_col: str,
schema_path: Optional[str] = None,
schema: Optional[dict] = None,
json_options: Optional[dict] = None,
drop_all_cols: bool = False,
disable_dbfs_retry: bool = False,
) -> Callable:
"""Convert a json string into a json column (struct).
The new json column can be added to the existing columns (default) or it can
replace all the others, being the only one to output. The new column gets the
same name as the original one suffixed with '_json'.
Args:
            input_col: name of the input column containing the json string.
schema_path: path to the StructType schema (spark schema).
schema: dict with the StructType schema (spark schema).
json_options: options to parse the json value.
drop_all_cols: whether to drop all the input columns or not.
Defaults to False.
disable_dbfs_retry: optional flag to disable file storage dbfs.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='from_json')}}
"""
def inner(df: DataFrame) -> DataFrame:
if schema_path:
json_schema = SchemaUtils.from_file(schema_path, disable_dbfs_retry)
elif schema:
json_schema = SchemaUtils.from_dict(schema)
else:
raise WrongArgumentsException(
"A file or dict schema needs to be provided."
)
if drop_all_cols:
df_with_json = df.select(
from_json(
col(input_col).cast("string").alias(f"{input_col}_json"),
json_schema,
json_options if json_options else None,
).alias(f"{input_col}_json")
)
else:
df_with_json = df.select(
"*",
from_json(
col(input_col).cast("string").alias(f"{input_col}_json"),
json_schema,
json_options if json_options else None,
).alias(f"{input_col}_json"),
)
return df_with_json
return inner
@classmethod
def to_json(
cls, in_cols: List[str], out_col: str, json_options: Optional[dict] = None
) -> Callable:
"""Convert dataframe columns into a json value.
Args:
            in_cols: name(s) of the input column(s).
                Example values:
                "*" - all columns;
                "my_col" - one column named "my_col";
                "my_col1, my_col2" - two columns.
out_col: name of the output column.
json_options: options to parse the json value.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='to_json')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.withColumn(
out_col,
to_json(struct(*in_cols), json_options if json_options else None),
)
return inner
@classmethod
def _explode_arrays(cls, df: DataFrame, cols_to_explode: List[str]) -> DataFrame:
"""Explode array columns from dataframe.
Args:
df: the dataframe to apply the explode operation.
cols_to_explode: list of array columns to perform explode.
Returns:
A dataframe with array columns exploded.
"""
if cols_to_explode is None:
cols_to_explode = cls._get_columns(df, spark_types.ArrayType)
for column in cols_to_explode:
df = df.withColumn(column, explode_outer(column))
return df
@classmethod
def _explode_maps(cls, df: DataFrame, cols_to_explode: List[str]) -> DataFrame:
"""Explode map columns from dataframe.
Args:
df: the dataframe to apply the explode operation.
cols_to_explode: list of map columns to perform explode.
Returns:
A dataframe with map columns exploded.
"""
if cols_to_explode is None:
cols_to_explode = cls._get_columns(df, spark_types.MapType)
for column in cols_to_explode:
df = df.withColumn(column, explode_outer(map_entries(col(column))))
return df
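# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# column names and types are placeholders. It chains cast, with_expressions and
# to_json on a small dataframe.
def _example_column_reshapers(spark):
    df = spark.createDataFrame([("1", "2.5")], ["qty", "price"])
    return (
        df.transform(
            ColumnReshapers.cast({"qty": "IntegerType", "price": "DoubleType"})
        )
        .transform(ColumnReshapers.with_expressions({"total": "qty * price"}))
        .transform(
            ColumnReshapers.to_json(in_cols=["qty", "price", "total"], out_col="payload")
        )
    )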
================================================
FILE: lakehouse_engine/transformers/condensers.py
================================================
"""Condensers module."""
from typing import Callable, List, Optional
from pyspark.sql import DataFrame, Window
from pyspark.sql.functions import col, row_number
from lakehouse_engine.transformers.exceptions import (
UnsupportedStreamingTransformerException,
WrongArgumentsException,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Condensers(object):
"""Class containing all the functions to condensate data for later merges."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def condense_record_mode_cdc(
cls,
business_key: List[str],
record_mode_col: str,
valid_record_modes: List[str],
ranking_key_desc: Optional[List[str]] = None,
ranking_key_asc: Optional[List[str]] = None,
) -> Callable:
"""Condense Change Data Capture (CDC) based on record_mode strategy.
This CDC data is particularly seen in some CDC enabled systems. Other systems
may have different CDC strategies.
Args:
business_key: The business key (logical primary key) of the data.
ranking_key_desc: In this type of CDC condensation the data needs to be
in descending order in a certain way, using columns specified in this
parameter.
ranking_key_asc: In this type of CDC condensation the data needs to be
in ascending order in a certain way, using columns specified in
this parameter.
record_mode_col: Name of the record mode input_col.
valid_record_modes: Depending on the context, not all record modes may be
considered for condensation. Use this parameter to skip those.
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='condense_record_mode_cdc')}}
"""
if not ranking_key_desc and not ranking_key_asc:
            raise WrongArgumentsException(
                "The condense_record_mode_cdc transformer requires data to be "
                "either in descending or ascending order, but no arguments for "
                "ordering were provided."
            )
def inner(df: DataFrame) -> DataFrame:
if not df.isStreaming:
partition_window = Window.partitionBy(
[col(c) for c in business_key]
).orderBy(
[
col(c).desc()
for c in (ranking_key_desc if ranking_key_desc else [])
] # type: ignore
+ [
col(c).asc()
for c in (ranking_key_asc if ranking_key_asc else [])
] # type: ignore
)
return (
df.withColumn("ranking", row_number().over(partition_window))
.filter(
col(record_mode_col).isNull()
| col(record_mode_col).isin(valid_record_modes)
)
.filter(col("ranking") == 1)
.drop("ranking")
)
else:
raise UnsupportedStreamingTransformerException(
"Transformer condense_record_mode_cdc is not supported in "
"streaming mode."
)
return inner
@classmethod
def group_and_rank(
cls, group_key: List[str], ranking_key: List[str], descending: bool = True
) -> Callable:
"""Condense data based on a simple group by + take latest mechanism.
Args:
group_key: list of column names to use in the group by.
ranking_key: the data needs to be in descending order using columns
specified in this parameter.
descending: if the ranking considers descending order or not. Defaults to
True.
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='group_and_rank')}}
"""
def inner(df: DataFrame) -> DataFrame:
if not df.isStreaming:
partition_window = Window.partitionBy(
[col(c) for c in group_key]
).orderBy(
[
col(c).desc() if descending else col(c).asc()
for c in (ranking_key if ranking_key else [])
] # type: ignore
)
return (
df.withColumn("ranking", row_number().over(partition_window))
.filter(col("ranking") == 1)
.drop("ranking")
)
else:
raise UnsupportedStreamingTransformerException(
"Transformer group_and_rank is not supported in streaming mode."
)
return inner
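# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# column names and values are placeholders. It keeps the latest record per
# business key with group_and_rank (batch dataframes only).
def _example_group_and_rank(spark):
    df = spark.createDataFrame(
        [("c1", "2024-01-01", 10), ("c1", "2024-01-02", 20), ("c2", "2024-01-01", 5)],
        ["customer", "event_date", "amount"],
    )
    return df.transform(
        Condensers.group_and_rank(group_key=["customer"], ranking_key=["event_date"])
    )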
================================================
FILE: lakehouse_engine/transformers/custom_transformers.py
================================================
"""Custom transformers module."""
from typing import Callable
from pyspark.sql import DataFrame
class CustomTransformers(object):
"""Class representing a CustomTransformers."""
@staticmethod
def custom_transformation(custom_transformer: Callable) -> Callable:
"""Execute a custom transformation provided by the user.
This transformer can be very useful whenever the user cannot use our provided
transformers, or they want to write complex logic in the transform step of the
algorithm.
.. warning:: Attention!
Please bear in mind that the custom_transformer function provided
as argument needs to receive a DataFrame and return a DataFrame,
because it is how Spark's .transform method is able to chain the
transformations.
        Example:
        ```python
        def my_custom_logic(df: DataFrame) -> DataFrame:
            # any pyspark logic that receives and returns a DataFrame
            return df.distinct()
        ```
Args:
custom_transformer: custom transformer function. A python function with all
required pyspark logic provided by the user.
Returns:
            Callable: the same function provided as parameter, in order to be called
later in the TransformerFactory.
{{get_example(method_name='custom_transformation')}}
"""
return custom_transformer
@staticmethod
def sql_transformation(sql: str) -> Callable:
"""Execute a SQL transformation provided by the user.
This transformer can be very useful whenever the user wants to perform
SQL-based transformations that are not natively supported by the
lakehouse engine transformers.
Args:
sql: the SQL query to be executed. This can read from any table or
view from the catalog, or any dataframe registered as a temp
view.
Returns:
Callable: A function to be called in .transform() spark function.
{{get_example(method_name='sql_transformation')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.sparkSession.sql(sql)
return inner
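# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# dataframe contents, the temp view name and the query are placeholders.
def _example_custom_transformers(spark):
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])

    def my_custom_logic(data: DataFrame) -> DataFrame:
        return data.filter("id > 1")

    filtered_df = df.transform(CustomTransformers.custom_transformation(my_custom_logic))
    filtered_df.createOrReplaceTempView("filtered_records")
    return df.transform(
        CustomTransformers.sql_transformation("SELECT id, name FROM filtered_records")
    )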
================================================
FILE: lakehouse_engine/transformers/data_maskers.py
================================================
"""Module with data masking transformers."""
from typing import Callable, List
from pyspark.sql import DataFrame
from pyspark.sql.functions import hash, sha2 # noqa: A004
from lakehouse_engine.transformers.exceptions import WrongArgumentsException
from lakehouse_engine.utils.logging_handler import LoggingHandler
class DataMaskers(object):
"""Class containing data masking transformers."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def hash_masker(
cls,
cols: List[str],
approach: str = "SHA",
num_bits: int = 256,
suffix: str = "_hash",
) -> Callable:
"""Mask specific columns using an hashing approach.
Args:
cols: list of column names to mask.
approach: hashing approach. Defaults to 'SHA'. There's "MURMUR3" as well.
num_bits: number of bits of the SHA approach. Only applies to SHA approach.
suffix: suffix to apply to new column name. Defaults to "_hash".
Note: you can pass an empty suffix to have the original column replaced.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='hash_masker')}}
"""
def inner(df: DataFrame) -> DataFrame:
masked_df = df
for col in cols:
if approach == "MURMUR3":
masked_df = masked_df.withColumn(col + suffix, hash(col))
elif approach == "SHA":
masked_df = masked_df.withColumn(col + suffix, sha2(col, num_bits))
else:
raise WrongArgumentsException("Hashing approach is not supported.")
return masked_df
return inner
@classmethod
def column_dropper(cls, cols: List[str]) -> Callable:
"""Drop specific columns.
Args:
cols: list of column names to drop.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='column_dropper')}}
"""
def inner(df: DataFrame) -> DataFrame:
drop_df = df
for col in cols:
drop_df = drop_df.drop(col)
return drop_df
return inner
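# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# the column name and value are placeholders. It hashes the sensitive column
# with SHA-256 and then drops the clear-text original.
def _example_data_masking(spark):
    df = spark.createDataFrame([("john@example.com",)], ["email"])
    return (
        df.transform(DataMaskers.hash_masker(cols=["email"]))
        .transform(DataMaskers.column_dropper(cols=["email"]))
    )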
================================================
FILE: lakehouse_engine/transformers/date_transformers.py
================================================
"""Module containing date transformers."""
from datetime import datetime
from typing import Callable, List, Optional
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, date_format, lit, to_date, to_timestamp
from lakehouse_engine.utils.logging_handler import LoggingHandler
class DateTransformers(object):
"""Class with set of transformers to transform dates in several forms."""
_logger = LoggingHandler(__name__).get_logger()
@staticmethod
def add_current_date(output_col: str) -> Callable:
"""Add column with current date.
The current date comes from the driver as a constant, not from every executor.
Args:
output_col: name of the output column.
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='add_current_date')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.withColumn(output_col, lit(datetime.now()))
return inner
@staticmethod
def convert_to_date(
cols: List[str], source_format: Optional[str] = None
) -> Callable:
"""Convert multiple string columns with a source format into dates.
Args:
cols: list of names of the string columns to convert.
            source_format: dates source format (e.g., yyyy-MM-dd). [Check here](
https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='convert_to_date')}}
"""
def inner(df: DataFrame) -> DataFrame:
converted_df = df
for c in cols:
converted_df = converted_df.withColumn(
c, to_date(col(c), source_format)
)
return converted_df
return inner
@staticmethod
def convert_to_timestamp(
cols: List[str], source_format: Optional[str] = None
) -> Callable:
"""Convert multiple string columns with a source format into timestamps.
Args:
cols: list of names of the string columns to convert.
source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS).
[Check here](
https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='convert_to_timestamp')}}
"""
def inner(df: DataFrame) -> DataFrame:
converted_df = df
for c in cols:
converted_df = converted_df.withColumn(
c, to_timestamp(col(c), source_format)
)
return converted_df
return inner
@staticmethod
def format_date(cols: List[str], target_format: Optional[str] = None) -> Callable:
"""Convert multiple date/timestamp columns into strings with the target format.
Args:
cols: list of names of the string columns to convert.
            target_format: strings target format (e.g., yyyy-MM-dd). [Check here](
https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='format_date')}}
"""
def inner(df: DataFrame) -> DataFrame:
converted_df = df
for c in cols:
converted_df = converted_df.withColumn(
c, date_format(col(c), target_format)
)
return converted_df
return inner
@staticmethod
def get_date_hierarchy(cols: List[str], formats: Optional[dict] = None) -> Callable:
"""Create day/month/week/quarter/year hierarchy for the provided date columns.
Uses Spark's extract function.
Args:
cols: list of names of the date columns to create the hierarchy.
formats: dict with the correspondence between the hierarchy and the format
to apply. [Check here](
https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).
Example: {
"year": "year",
"month": "month",
"day": "day",
"week": "week",
"quarter": "quarter"
}
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='get_date_hierarchy')}}
"""
if not formats:
formats = {
"year": "year",
"month": "month",
"day": "day",
"week": "week",
"quarter": "quarter",
}
def inner(df: DataFrame) -> DataFrame:
transformer_df = df
for c in cols:
transformer_df = transformer_df.selectExpr(
"*",
f"extract({formats['day']} from {c}) as {c}_day",
f"extract({formats['month']} from {c}) as {c}_month",
f"extract({formats['week']} from {c}) as {c}_week",
f"extract({formats['quarter']} from {c}) as {c}_quarter",
f"extract({formats['year']} from {c}) as {c}_year",
)
return transformer_df
return inner
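# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# the column name and formats are placeholders. It parses string dates and then
# formats them back with a different pattern.
def _example_date_transformers(spark):
    df = spark.createDataFrame([("2024/01/31",)], ["order_date"])
    return (
        df.transform(DateTransformers.convert_to_date(["order_date"], "yyyy/MM/dd"))
        .transform(DateTransformers.format_date(["order_date"], "yyyy-MM-dd"))
    )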
================================================
FILE: lakehouse_engine/transformers/exceptions.py
================================================
"""Module for all the transformers exceptions."""
class WrongArgumentsException(Exception):
"""Exception for when a user provides wrong arguments to a transformer."""
pass
class UnsupportedStreamingTransformerException(Exception):
"""Exception for when a user requests a transformer not supported in streaming."""
pass
================================================
FILE: lakehouse_engine/transformers/filters.py
================================================
"""Module containing the filters transformers."""
from typing import Any, Callable, List, Optional
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from lakehouse_engine.transformers.watermarker import Watermarker
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Filters(object):
"""Class containing the filters transformers."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def incremental_filter(
cls,
input_col: str,
increment_value: Optional[Any] = None,
increment_df: Optional[DataFrame] = None,
increment_col: str = "latest",
greater_or_equal: bool = False,
) -> Callable:
"""Incrementally Filter a certain dataframe given an increment logic.
This logic can either be an increment value or an increment dataframe from
which the get the latest value from. By default, the operator for the
filtering process is greater or equal to cover cases where we receive late
arriving data not cover in a previous load. You can change greater_or_equal
to false to use greater, when you trust the source will never output more data
with the increment after you have load the data (e.g., you will never load
data until the source is still dumping data, which may cause you to get an
incomplete picture of the last arrived data).
Args:
            input_col: input column name.
            increment_value: value used to filter the data, considering the
                provided input_col.
increment_df: a dataframe to get the increment value from.
you either specify this or the increment_value (this takes precedence).
This is a good approach to get the latest value from a given dataframe
that was read and apply that value as filter here. In this way you can
perform incremental loads based on the last value of a given dataframe
(e.g., table or file based). Can be used together with the
get_max_value transformer to accomplish these incremental based loads.
See our append load feature tests to see how to provide an acon for
incremental loads, taking advantage of the scenario explained here.
            increment_col: name of the column from which to get the increment
                value (when using the increment_df approach). This assumes there is
                only one row in the increment_df, which is why it is a good idea to
                use it together with the get_max_value transformer. Defaults to
                "latest" because that is the default output column name provided by
                the get_max_value transformer.
            greater_or_equal: whether the filter should also include rows equal to
                the increment value (useful for incremental loads where you still
                want data matching the increment value, and not only values greater
                than it, e.g., when you already loaded data containing those values
                but the source later produced more rows with the same values).
                Defaults to False.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='incremental_filter')}}
"""
def inner(df: DataFrame) -> DataFrame:
if increment_df:
if greater_or_equal:
return df.filter( # type: ignore
col(input_col) >= increment_df.collect()[0][increment_col]
)
else:
return df.filter( # type: ignore
col(input_col) > increment_df.collect()[0][increment_col]
)
else:
if greater_or_equal:
return df.filter(col(input_col) >= increment_value) # type: ignore
else:
return df.filter(col(input_col) > increment_value) # type: ignore
return inner
@staticmethod
def expression_filter(exp: str) -> Callable:
"""Filter a dataframe based on an expression.
Args:
exp: filter expression.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='expression_filter')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.filter(exp) # type: ignore
return inner
@staticmethod
def column_filter_exp(exp: List[str]) -> Callable:
"""Filter a dataframe's columns based on a list of SQL expressions.
Args:
exp: column filter expressions.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='column_filter_exp')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.selectExpr(*exp) # type: ignore
return inner
@staticmethod
def drop_duplicate_rows(
cols: List[str] = None, watermarker: dict = None
) -> Callable:
"""Drop duplicate rows using spark function dropDuplicates().
        This transformer can be used with or without arguments.
        The provided argument needs to be a list of columns.
        For example: ["Name", "VAT"] will drop duplicate records within
        the "Name" and "VAT" columns.
        If the transformer is used without providing any columns list, or providing
        an empty list such as [], the result will be the same as using
        the distinct() pyspark function. If the watermarker dict is present, it will
        ensure that the drop operation only applies to rows within the watermark
        time window.
Args:
cols: column names.
watermarker: properties to apply watermarker to the transformer.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='drop_duplicate_rows')}}
"""
def inner(df: DataFrame) -> DataFrame:
if watermarker:
df = Watermarker.with_watermark(
watermarker["col"], watermarker["watermarking_time"]
)(df)
if not cols:
return df.dropDuplicates()
else:
return df.dropDuplicates(cols)
return inner
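# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# dataframe contents and column names are placeholders. It pairs get_max_value
# with incremental_filter to keep only rows newer than the latest loaded date.
def _example_incremental_filter(spark):
    from lakehouse_engine.transformers.aggregators import Aggregators

    already_loaded_df = spark.createDataFrame([("2024-01-01",)], ["load_date"])
    new_data_df = spark.createDataFrame(
        [("2024-01-01", 1), ("2024-01-02", 2)], ["load_date", "id"]
    )
    latest_df = already_loaded_df.transform(Aggregators.get_max_value("load_date"))
    return new_data_df.transform(
        Filters.incremental_filter(input_col="load_date", increment_df=latest_df)
    )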
================================================
FILE: lakehouse_engine/transformers/joiners.py
================================================
"""Module with join transformers."""
import uuid
from typing import Callable, List, Optional
from pyspark.sql import DataFrame
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.transformers.watermarker import Watermarker
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.spark_utils import SparkUtils
class Joiners(object):
"""Class containing join transformers."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def join(
cls,
join_with: DataFrame,
join_condition: str,
left_df_alias: str = "a",
right_df_alias: str = "b",
join_type: str = "inner",
broadcast_join: bool = True,
select_cols: Optional[List[str]] = None,
watermarker: Optional[dict] = None,
) -> Callable:
"""Join two dataframes based on specified type and columns.
Some stream to stream joins are only possible if you apply Watermark, so this
method also provides a parameter to enable watermarking specification.
Args:
left_df_alias: alias of the first dataframe.
join_with: right dataframe.
right_df_alias: alias of the second dataframe.
join_condition: condition to join dataframes.
join_type: type of join. Defaults to inner.
Available values: inner, cross, outer, full, full outer,
left, left outer, right, right outer, semi,
left semi, anti, and left anti.
broadcast_join: whether to perform a broadcast join or not.
select_cols: list of columns to select at the end.
watermarker: properties to apply watermarking.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='join')}}
"""
def inner(df: DataFrame) -> DataFrame:
            # The goal here is to avoid problems with
            # simultaneously running processes,
            # so an id is added as a prefix for the alias.
app_id = str(uuid.uuid4())
left = f"`{app_id}_{left_df_alias}`"
right = f"`{app_id}_{right_df_alias}`"
df_join_with = join_with
if watermarker:
left_df_watermarking = watermarker.get(left_df_alias, None)
right_df_watermarking = watermarker.get(right_df_alias, None)
if left_df_watermarking:
df = Watermarker.with_watermark(
left_df_watermarking["col"],
left_df_watermarking["watermarking_time"],
)(df)
if right_df_watermarking:
df_join_with = Watermarker.with_watermark(
right_df_watermarking["col"],
right_df_watermarking["watermarking_time"],
)(df_join_with)
l_prefix = SparkUtils.create_temp_view(df, left, return_prefix=True)
r_prefix = SparkUtils.create_temp_view(
df_join_with, right, return_prefix=True
)
query = f"""
SELECT {f"/*+ BROADCAST({right_df_alias}) */" if broadcast_join else ""}
            {", ".join(select_cols) if select_cols else "*"}
FROM {l_prefix}{left} AS {left_df_alias}
{join_type.upper()}
JOIN {r_prefix}{right} AS {right_df_alias}
ON {join_condition}
""" # nosec: B608
cls._logger.info(f"Execution query: {query}")
return ExecEnv.SESSION.sql(query)
return inner
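# Illustrative usage sketch: assumes the engine's ExecEnv.SESSION has already
# been initialised with a Spark session; the dataframes, aliases and column
# names are placeholders.
def _example_join(orders_df: DataFrame, customers_df: DataFrame) -> DataFrame:
    return orders_df.transform(
        Joiners.join(
            join_with=customers_df,
            join_condition="a.customer_id = b.customer_id",
            join_type="left",
            select_cols=["a.order_id", "b.customer_name"],
        )
    )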
================================================
FILE: lakehouse_engine/transformers/null_handlers.py
================================================
"""Module with null handlers transformers."""
from typing import Callable, List
from pyspark.sql import DataFrame
from lakehouse_engine.utils.logging_handler import LoggingHandler
class NullHandlers(object):
"""Class containing null handler transformers."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def replace_nulls(
cls,
replace_on_nums: bool = True,
default_num_value: int = -999,
replace_on_strings: bool = True,
default_string_value: str = "UNKNOWN",
subset_cols: List[str] = None,
) -> Callable:
"""Replace nulls in a dataframe.
Args:
replace_on_nums: if it is to replace nulls on numeric columns.
Applies to ints, longs and floats.
default_num_value: default integer value to use as replacement.
replace_on_strings: if it is to replace nulls on string columns.
default_string_value: default string value to use as replacement.
subset_cols: list of columns in which to replace nulls. If not
provided, all nulls in all columns will be replaced as specified.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='replace_nulls')}}
"""
def inner(df: DataFrame) -> DataFrame:
if replace_on_nums:
df = df.na.fill(default_num_value, subset_cols)
if replace_on_strings:
df = df.na.fill(default_string_value, subset_cols)
return df
return inner
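# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# the schema and values are placeholders. Nulls are replaced with the module
# defaults (-999 for numeric columns, "UNKNOWN" for string columns).
def _example_replace_nulls(spark):
    df = spark.createDataFrame([(1, None), (None, "b")], ["qty", "name"])
    return df.transform(NullHandlers.replace_nulls())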
================================================
FILE: lakehouse_engine/transformers/optimizers.py
================================================
"""Optimizers module."""
from typing import Callable
from pyspark.sql import DataFrame
from pyspark.storagelevel import StorageLevel
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Optimizers(object):
"""Class containing all the functions that can provide optimizations."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def cache(cls) -> Callable:
"""Caches the current dataframe.
The default storage level used is MEMORY_AND_DISK.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='cache')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.cache()
return inner
@classmethod
def persist(cls, storage_level: str = None) -> Callable:
"""Caches the current dataframe with a specific StorageLevel.
Args:
storage_level: the type of StorageLevel, as default MEMORY_AND_DISK_DESER.
[More options here](
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.StorageLevel.html).
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='persist')}}
"""
def inner(df: DataFrame) -> DataFrame:
            # fall back to the default storage level when none is provided,
            # since getattr cannot take None as an attribute name.
            level = (
                getattr(StorageLevel, storage_level, StorageLevel.MEMORY_AND_DISK_DESER)
                if storage_level
                else StorageLevel.MEMORY_AND_DISK_DESER
            )
return df.persist(level)
return inner
@classmethod
def unpersist(cls, blocking: bool = False) -> Callable:
"""Removes the dataframe from the disk and memory.
Args:
blocking: whether to block until all the data blocks are
removed from disk/memory or run asynchronously.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='unpersist')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.unpersist(blocking)
return inner
================================================
FILE: lakehouse_engine/transformers/regex_transformers.py
================================================
"""Regex transformers module."""
from typing import Callable
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, regexp_extract
from lakehouse_engine.utils.logging_handler import LoggingHandler
class RegexTransformers(object):
"""Class containing all regex functions."""
_logger = LoggingHandler(__name__).get_logger()
@staticmethod
def with_regex_value(
input_col: str,
output_col: str,
regex: str,
drop_input_col: bool = False,
idx: int = 1,
) -> Callable:
"""Get the result of applying a regex to an input column (via regexp_extract).
Args:
input_col: name of the input column.
output_col: name of the output column.
regex: regular expression.
drop_input_col: whether to drop input_col or not.
idx: index to return.
Returns:
A function to be executed in the .transform() spark function.
{{get_example(method_name='with_regex_value')}}
"""
def inner(df: DataFrame) -> DataFrame:
df = df.withColumn(output_col, regexp_extract(col(input_col), regex, idx))
if drop_input_col:
df = df.drop(input_col)
return df
return inner
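# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# the path values are placeholders. It extracts the file extension into a new
# column and keeps the original path column.
def _example_with_regex_value(spark):
    df = spark.createDataFrame([("s3://bucket/data/file.csv",)], ["path"])
    return df.transform(
        RegexTransformers.with_regex_value(
            input_col="path", output_col="extension", regex=r"\.([a-z0-9]+)$", idx=1
        )
    )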
================================================
FILE: lakehouse_engine/transformers/repartitioners.py
================================================
"""Module with repartitioners transformers."""
from typing import Callable, List, Optional
from pyspark.sql import DataFrame
from lakehouse_engine.transformers.exceptions import WrongArgumentsException
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Repartitioners(object):
"""Class containing repartitioners transformers."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def coalesce(cls, num_partitions: int) -> Callable:
"""Coalesce a dataframe into n partitions.
Args:
num_partitions: num of partitions to coalesce.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='coalesce')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.coalesce(num_partitions)
return inner
@classmethod
def repartition(
cls, num_partitions: Optional[int] = None, cols: Optional[List[str]] = None
) -> Callable:
"""Repartition a dataframe into n partitions.
If num_partitions is provided repartitioning happens based on the provided
number, otherwise it happens based on the values of the provided cols (columns).
Args:
num_partitions: num of partitions to repartition.
cols: list of columns to use for repartitioning.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='repartition')}}
"""
def inner(df: DataFrame) -> DataFrame:
            if cols:
                # repartition by columns; only pass num_partitions when provided,
                # since Spark rejects a None partition count.
                return (
                    df.repartition(num_partitions, *cols)
                    if num_partitions
                    else df.repartition(*cols)
                )
elif num_partitions:
return df.repartition(num_partitions)
else:
raise WrongArgumentsException(
"num_partitions or cols should be specified"
)
return inner
================================================
FILE: lakehouse_engine/transformers/transformer_factory.py
================================================
"""Module with the factory pattern to return transformers."""
from typing import Callable, OrderedDict
from lakehouse_engine.core.definitions import TransformerSpec
from lakehouse_engine.transformers.aggregators import Aggregators
from lakehouse_engine.transformers.column_creators import ColumnCreators
from lakehouse_engine.transformers.column_reshapers import ColumnReshapers
from lakehouse_engine.transformers.condensers import Condensers
from lakehouse_engine.transformers.custom_transformers import CustomTransformers
from lakehouse_engine.transformers.data_maskers import DataMaskers
from lakehouse_engine.transformers.date_transformers import DateTransformers
from lakehouse_engine.transformers.filters import Filters
from lakehouse_engine.transformers.joiners import Joiners
from lakehouse_engine.transformers.null_handlers import NullHandlers
from lakehouse_engine.transformers.optimizers import Optimizers
from lakehouse_engine.transformers.regex_transformers import RegexTransformers
from lakehouse_engine.transformers.repartitioners import Repartitioners
from lakehouse_engine.transformers.unions import Unions
from lakehouse_engine.transformers.watermarker import Watermarker
from lakehouse_engine.utils.logging_handler import LoggingHandler
class TransformerFactory(object):
"""TransformerFactory class following the factory pattern."""
_logger = LoggingHandler(__name__).get_logger()
UNSUPPORTED_STREAMING_TRANSFORMERS = [
"condense_record_mode_cdc",
"group_and_rank",
"with_auto_increment_id",
"with_row_id",
]
AVAILABLE_TRANSFORMERS = {
"add_current_date": DateTransformers.add_current_date,
"cache": Optimizers.cache,
"cast": ColumnReshapers.cast,
"coalesce": Repartitioners.coalesce,
"column_dropper": DataMaskers.column_dropper,
"column_filter_exp": Filters.column_filter_exp,
"column_selector": ColumnReshapers.column_selector,
"condense_record_mode_cdc": Condensers.condense_record_mode_cdc,
"convert_to_date": DateTransformers.convert_to_date,
"convert_to_timestamp": DateTransformers.convert_to_timestamp,
"custom_transformation": CustomTransformers.custom_transformation,
"drop_duplicate_rows": Filters.drop_duplicate_rows,
"expression_filter": Filters.expression_filter,
"format_date": DateTransformers.format_date,
"flatten_schema": ColumnReshapers.flatten_schema,
"explode_columns": ColumnReshapers.explode_columns,
"from_avro": ColumnReshapers.from_avro,
"from_avro_with_registry": ColumnReshapers.from_avro_with_registry,
"from_json": ColumnReshapers.from_json,
"get_date_hierarchy": DateTransformers.get_date_hierarchy,
"get_max_value": Aggregators.get_max_value,
"group_and_rank": Condensers.group_and_rank,
"hash_masker": DataMaskers.hash_masker,
"incremental_filter": Filters.incremental_filter,
"join": Joiners.join,
"persist": Optimizers.persist,
"rename": ColumnReshapers.rename,
"repartition": Repartitioners.repartition,
"replace_nulls": NullHandlers.replace_nulls,
"sql_transformation": CustomTransformers.sql_transformation,
"to_json": ColumnReshapers.to_json,
"union": Unions.union,
"union_by_name": Unions.union_by_name,
"with_watermark": Watermarker.with_watermark,
"unpersist": Optimizers.unpersist,
"with_auto_increment_id": ColumnCreators.with_auto_increment_id,
"with_expressions": ColumnReshapers.with_expressions,
"with_literals": ColumnCreators.with_literals,
"with_regex_value": RegexTransformers.with_regex_value,
"with_row_id": ColumnCreators.with_row_id,
}
@staticmethod
def get_transformer(spec: TransformerSpec, data: OrderedDict = None) -> Callable:
"""Get a transformer following the factory pattern.
Args:
spec: transformer specification (individual transformation... not to be
confused with list of all transformations).
data: ordered dict of dataframes to be transformed. Needed when a
transformer requires more than one dataframe as input.
Returns:
Transformer function to be executed in .transform() spark function.
{{get_example(method_name='get_transformer')}}
"""
if spec.function == "incremental_filter":
# incremental_filter optionally expects a DataFrame as input, so find it.
args_copy = TransformerFactory._get_spec_args_copy(spec.args)
if "increment_df" in args_copy:
args_copy["increment_df"] = data[args_copy["increment_df"]]
return TransformerFactory.AVAILABLE_TRANSFORMERS[ # type: ignore
spec.function
](**args_copy)
elif spec.function == "join":
# get the dataframe given the input_id in the input specs of the acon.
args_copy = TransformerFactory._get_spec_args_copy(spec.args)
args_copy["join_with"] = data[args_copy["join_with"]]
return TransformerFactory.AVAILABLE_TRANSFORMERS[ # type: ignore
spec.function
](**args_copy)
elif spec.function == "union" or spec.function == "union_by_name":
# get the list of dataframes given the input_id in the input specs
# of the acon.
args_copy = TransformerFactory._get_spec_args_copy(spec.args)
args_copy["union_with"] = []
for union_with_spec_id in spec.args["union_with"]:
args_copy["union_with"].append(data[union_with_spec_id])
return TransformerFactory.AVAILABLE_TRANSFORMERS[ # type: ignore
spec.function
](**args_copy)
elif spec.function in TransformerFactory.AVAILABLE_TRANSFORMERS:
return TransformerFactory.AVAILABLE_TRANSFORMERS[ # type: ignore
spec.function
](**spec.args)
else:
raise NotImplementedError(
f"The requested transformer {spec.function} is not implemented."
)
@staticmethod
def _get_spec_args_copy(spec_args: dict) -> dict:
"""Returns a shallow copy of `spec_args` to ensure immutability.
Args:
spec_args (dict): A dictionary containing the arguments of a
TransformerSpec.
Returns:
dict: A shallow copy of `spec_args`, preventing modifications to the
original dictionary. This is important in Spark, especially when
retries of failed attempts occur. For example, if during the first
run the `join_with` argument (initially a string) is replaced with a
DataFrame (as done in the `get_transformer` function), then on a retry,
depending on how Spark handles state, the `join_with` argument may no
            longer be a string but a DataFrame, leading to a key error.
"""
return dict(spec_args)
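# Illustrative usage sketch: in normal flows get_transformer receives a
# TransformerSpec parsed from the acon; here the registry is used directly,
# with a placeholder dataframe and column name, assuming an existing
# SparkSession named `spark`.
def _example_transformer_registry(spark):
    df = spark.createDataFrame([("a",), ("a",), ("b",)], ["value"])
    transformer = TransformerFactory.AVAILABLE_TRANSFORMERS["drop_duplicate_rows"](
        cols=["value"]
    )
    return df.transform(transformer)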
================================================
FILE: lakehouse_engine/transformers/unions.py
================================================
"""Module with union transformers."""
from functools import reduce
from typing import Callable, List
from pyspark.sql import DataFrame
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Unions(object):
"""Class containing union transformers."""
_logger = LoggingHandler(__name__).get_logger()
@classmethod
def union(
cls,
union_with: List[DataFrame],
deduplication: bool = True,
) -> Callable:
"""Union dataframes, resolving columns by position (not by name).
Args:
union_with: list of dataframes to union.
deduplication: whether to perform deduplication of elements or not.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='union')}}
"""
def inner(df: DataFrame) -> DataFrame:
union_df = reduce(lambda x, y: x.union(y), [df] + union_with)
return union_df.distinct() if deduplication else union_df
return inner
@classmethod
def union_by_name(
cls,
union_with: List[DataFrame],
deduplication: bool = True,
allow_missing_columns: bool = True,
) -> Callable:
"""Union dataframes, resolving columns by name (not by position).
Args:
union_with: list of dataframes to union.
deduplication: whether to perform deduplication of elements or not.
allow_missing_columns: allow the union of DataFrames with different
schemas.
Returns:
A function to be called in .transform() spark function.
{{get_example(method_name='union_by_name')}}
"""
def inner(df: DataFrame) -> DataFrame:
union_df = reduce(
lambda x, y: x.unionByName(
y, allowMissingColumns=allow_missing_columns
),
[df] + union_with,
)
return union_df.distinct() if deduplication else union_df
return inner
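# Illustrative usage sketch: assumes an existing SparkSession named `spark`;
# schemas and values are placeholders. Columns are resolved by name and the
# missing "name" column in the second dataframe is tolerated.
def _example_union_by_name(spark):
    df_a = spark.createDataFrame([(1, "a")], ["id", "name"])
    df_b = spark.createDataFrame([(2,)], ["id"])
    return df_a.transform(Unions.union_by_name(union_with=[df_b]))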
================================================
FILE: lakehouse_engine/transformers/watermarker.py
================================================
"""Watermarker module."""
from typing import Callable
from pyspark.sql import DataFrame
from lakehouse_engine.utils.logging_handler import LoggingHandler
class Watermarker(object):
"""Class containing all watermarker transformers."""
_logger = LoggingHandler(__name__).get_logger()
@staticmethod
def with_watermark(watermarker_column: str, watermarker_time: str) -> Callable:
"""Get the dataframe with watermarker defined.
Args:
watermarker_column: name of the input column to be considered for
the watermarking. Note: it must be a timestamp.
watermarker_time: time window to define the watermark value.
Returns:
A function to be executed on other transformers.
{{get_example(method_name='with_watermark')}}
"""
def inner(df: DataFrame) -> DataFrame:
return df.withWatermark(watermarker_column, watermarker_time)
return inner
================================================
FILE: lakehouse_engine/utils/__init__.py
================================================
"""Utilities package."""
================================================
FILE: lakehouse_engine/utils/acon_utils.py
================================================
"""Module to perform validations and resolve the acon."""
from lakehouse_engine.core.definitions import (
FILE_MANAGER_OPERATIONS,
TABLE_MANAGER_OPERATIONS,
DQType,
InputFormat,
OutputFormat,
)
from lakehouse_engine.io.exceptions import WrongIOFormatException
from lakehouse_engine.utils.dq_utils import PrismaUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
_LOGGER = LoggingHandler(__name__).get_logger()
def validate_manager_list(acon: dict) -> list:
    """Function to validate an acon with a list of operations.
    Args:
        acon: Acon to be validated.
    Returns:
        The list of validated operations.
    """
error_list: list[str] = []
operations: list[dict] = acon.get("operations", [])
if not operations:
raise RuntimeError("No operations found in the acon.")
for operation in operations:
validate_managers(operation, error_list)
if error_list:
error_list_str = "\n" + "\n".join(error_list)
raise RuntimeError(f"Errors found during validation:{error_list_str}")
return operations
def validate_and_resolve_acon(acon: dict, execution_point: str = "") -> dict:
"""Function to validate and resolve the acon.
Args:
acon: Acon to be validated and resolved.
execution_point: Execution point to resolve the dq functions.
Returns:
Acon after validation and resolution.
"""
# Performing validations
validate_readers(acon)
validate_writers(acon)
validate_managers(acon)
# Resolving the acon
if execution_point:
acon = resolve_dq_functions(acon, execution_point)
_LOGGER.info(f"Read Algorithm Configuration: {str(acon)}")
return acon
def validate_readers(acon: dict) -> None:
"""Function to validate the readers in the acon.
Args:
acon: Acon to be validated.
Raises:
RuntimeError: If the input format is not supported.
"""
if "input_specs" in acon.keys() or "input_spec" in acon.keys():
for spec in acon.get("input_specs", []) or [acon.get("input_spec", {})]:
if (
not InputFormat.exists(spec.get("data_format"))
and "db_table" not in spec.keys()
):
raise WrongIOFormatException(
f"Input format not supported: {spec.get('data_format')}"
)
def validate_writers(acon: dict) -> None:
"""Function to validate the writers in the acon.
Args:
acon: Acon to be validated.
Raises:
RuntimeError: If the output format is not supported.
"""
if "output_specs" in acon.keys() or "output_spec" in acon.keys():
for spec in acon.get("output_specs", []) or [acon.get("output_spec", {})]:
if not OutputFormat.exists(spec.get("data_format")):
raise WrongIOFormatException(
f"Output format not supported: {spec.get('data_format')}"
)
def validate_managers(acon: dict, error_list: list = None) -> None:
"""Function to validate the managers in the acon.
Args:
acon: Acon to be validated.
error_list: List to collect errors.
"""
manager_type = acon.get("manager")
temp_error_list = []
if not manager_type:
return
function_name = acon.get("function")
if not function_name:
error = "Missing 'function' parameter for manager"
temp_error_list.append(error)
    operations_dict = None
    if manager_type == "file":
        operations_dict = FILE_MANAGER_OPERATIONS
    elif manager_type == "table":
        operations_dict = TABLE_MANAGER_OPERATIONS
    else:
        error = f"Manager type not supported: {manager_type}"
        temp_error_list.append(error)
    # only check the function against the operations dict when the manager type
    # is supported, otherwise operations_dict would be undefined.
    if operations_dict is not None and function_name not in operations_dict:
        error = f"Function '{function_name}' not supported for {manager_type} manager"
        temp_error_list.append(error)
    elif operations_dict is not None:
        expected_params = operations_dict[function_name]
        missing_mandatory = validate_mandatory_parameters(acon, expected_params)
        if missing_mandatory:
            error = (
                f"Missing mandatory parameters for {manager_type} "
                f"manager function {function_name}: {missing_mandatory}"
            )
            temp_error_list.append(error)
        type_errors = validate_parameter_types(acon, expected_params)
        if type_errors:
            error = (
                f"Type validation errors for {manager_type} "
                f"manager function {function_name}: {type_errors}"
            )
            temp_error_list.append(error)
if error_list is not None:
error_list.extend(temp_error_list)
else:
if temp_error_list:
error_list_str = "\n".join(temp_error_list)
raise RuntimeError(error_list_str)
def validate_mandatory_parameters(acon: dict, expected_params: dict) -> list:
"""Function to validate mandatory parameters in the acon.
Args:
acon: Acon to be validated.
expected_params: Expected parameters with their mandatory status.
Returns:
List of missing mandatory parameters.
"""
missing_mandatory = []
for param_name, param_info in expected_params.items():
if param_info["mandatory"] and param_name not in acon:
missing_mandatory.append(param_name)
return missing_mandatory
def validate_parameter_types(acon: dict, expected_params: dict) -> list:
"""Function to validate parameter types in the acon.
Args:
acon: Acon to be validated.
expected_params: Expected parameters with their types.
Returns:
List of type validation errors.
"""
type_errors = []
for param_name, param_value in acon.items():
if param_name in expected_params:
expected_type = expected_params[param_name]["type"]
param_type_name = type(param_value).__name__
expected_python_type = {
"str": str,
"bool": bool,
"int": int,
"list": list,
}.get(expected_type)
if expected_python_type and not isinstance(
param_value, expected_python_type
):
type_errors.append(
f"Parameter '{param_name}' expected {expected_type}, "
f"got {param_type_name}"
)
return type_errors
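# Illustrative usage sketch: the manager function name and parameters below are
# hypothetical; the expected_params shape mirrors what this module reads from
# the FILE_MANAGER_OPERATIONS / TABLE_MANAGER_OPERATIONS definitions
# (a "mandatory" flag plus a "type" name per parameter).
def _example_parameter_validation() -> tuple:
    expected_params = {
        "path": {"mandatory": True, "type": "str"},
        "recursive": {"mandatory": False, "type": "bool"},
    }
    acon = {"manager": "file", "function": "delete_objects", "recursive": "yes"}
    missing = validate_mandatory_parameters(acon, expected_params)  # -> ["path"]
    type_errors = validate_parameter_types(acon, expected_params)  # recursive: str, not bool
    return missing, type_errors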
def resolve_dq_functions(acon: dict, execution_point: str) -> dict:
"""Function to resolve the dq functions in the acon.
Args:
acon: Acon to resolve the dq functions.
execution_point: Execution point of the dq_functions.
Returns:
Acon after resolving the dq functions.
"""
if acon.get("dq_spec"):
if acon.get("dq_spec").get("dq_type") == DQType.PRISMA.value:
acon["dq_spec"] = PrismaUtils.build_prisma_dq_spec(
spec=acon.get("dq_spec"), execution_point=execution_point
)
elif acon.get("dq_specs"):
resolved_dq_specs = []
for spec in acon.get("dq_specs", []):
if spec.get("dq_type") == DQType.PRISMA.value:
resolved_dq_specs.append(
PrismaUtils.build_prisma_dq_spec(
spec=spec, execution_point=execution_point
)
)
else:
resolved_dq_specs.append(spec)
acon["dq_specs"] = resolved_dq_specs
return acon
================================================
FILE: lakehouse_engine/utils/configs/__init__.py
================================================
"""Config utilities package."""
================================================
FILE: lakehouse_engine/utils/configs/config_utils.py
================================================
"""Module to read configurations."""
from importlib.metadata import PackageNotFoundError, version
from typing import Any, Optional
import yaml
from importlib_resources import as_file, files
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.storage.file_storage_functions import FileStorageFunctions
class ConfigUtils(object):
"""Config utilities class."""
_LOGGER = LoggingHandler(__name__).get_logger()
SENSITIVE_INFO = [
"kafka.ssl.keystore.password",
"kafka.ssl.truststore.password",
"password",
"secret",
"credential",
"credentials",
"pass",
"key",
]
@classmethod
def get_acon(
cls,
acon_path: Optional[str] = None,
acon: Optional[dict] = None,
disable_dbfs_retry: bool = False,
) -> dict:
"""Get acon based on a filesystem path or on a dict.
Args:
acon_path: path of the acon (algorithm configuration) file.
acon: acon provided directly through python code (e.g., notebooks
or other apps).
disable_dbfs_retry: optional flag to disable file storage dbfs.
Returns:
Dict representation of an acon.
"""
acon = (
acon if acon else ConfigUtils.read_json_acon(acon_path, disable_dbfs_retry)
)
return acon
@staticmethod
def get_config(package: str = "lakehouse_engine.configs") -> Any:
"""Get the lakehouse engine configuration file.
Args:
package: package where the engine default configurations can be found.
Returns:
Configuration dictionary
"""
config_path = files(package) / "engine.yaml"
with as_file(config_path) as config_file:
with open(config_file, "r") as config:
config = yaml.safe_load(config)
return config
@staticmethod
def get_config_from_file(config_file_path: str) -> Any:
"""Get the lakehouse engine configurations using a file path.
Args:
config_file_path: a string with a path for a yaml file
with custom configurations.
Returns:
Configuration dictionary
"""
with open(config_file_path, "r") as config:
config = yaml.safe_load(config)
return config
@classmethod
def get_engine_version(cls) -> str:
"""Get Lakehouse Engine version from the installed packages.
Returns:
String of engine version.
"""
try:
_version = version("lakehouse-engine")
except PackageNotFoundError:
cls._LOGGER.info("Could not identify Lakehouse Engine version.")
_version = ""
return str(_version)
@staticmethod
def read_json_acon(path: str, disable_dbfs_retry: bool = False) -> Any:
"""Read an acon (algorithm configuration) file.
Args:
path: path to the acon file.
disable_dbfs_retry: optional flag to disable file storage dbfs.
Returns:
The acon file content as a dict.
"""
return FileStorageFunctions.read_json(path, disable_dbfs_retry)
@staticmethod
def read_sql(path: str, disable_dbfs_retry: bool = False) -> Any:
"""Read a DDL file in Spark SQL format from a cloud object storage system.
Args:
path: path to the SQL file.
disable_dbfs_retry: optional flag to disable file storage dbfs.
Returns:
Content of the SQL file.
"""
return FileStorageFunctions.read_sql(path, disable_dbfs_retry)
@classmethod
def remove_sensitive_info(cls, dict_to_replace: dict | list) -> dict | list:
"""Remove sensitive info from a dictionary.
Args:
dict_to_replace: dict where we want to remove sensitive info.
Returns:
dict without sensitive information.
"""
if isinstance(dict_to_replace, list):
return [cls.remove_sensitive_info(k) for k in dict_to_replace]
elif isinstance(dict_to_replace, dict):
return {
k: "******" if k in cls.SENSITIVE_INFO else cls.remove_sensitive_info(v)
for k, v in dict_to_replace.items()
}
else:
return dict_to_replace
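# A small, hedged usage sketch: remove_sensitive_info masks any key listed in
# SENSITIVE_INFO, recursing into nested dicts and lists. The sample dict below
# is hypothetical and the "secrets" are placeholders.
if __name__ == "__main__":
    _sample = {
        "user": "svc_user",
        "password": "placeholder-not-a-real-secret",
        "options": {"kafka.ssl.keystore.password": "also-a-placeholder"},
    }
    print(ConfigUtils.remove_sensitive_info(_sample))
    # expected: {'user': 'svc_user', 'password': '******',
    #            'options': {'kafka.ssl.keystore.password': '******'}}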
================================================
FILE: lakehouse_engine/utils/databricks_utils.py
================================================
"""Utilities for databricks operations."""
import ast
import json
import os
import re
from typing import Any, Tuple
from pyspark.sql import SparkSession
from lakehouse_engine.core.definitions import EngineStats
from lakehouse_engine.utils.logging_handler import LoggingHandler
class DatabricksUtils(object):
"""Databricks utilities class."""
_LOGGER = LoggingHandler(__name__).get_logger()
@staticmethod
def is_serverless_workload() -> bool:
"""Check if the current databricks workload is serverless.
Returns:
True if the current databricks workload is serverless, False otherwise.
"""
if os.getenv("IS_SERVERLESS", "false").lower() == "true":
return True
else:
return False
@staticmethod
def get_db_utils(spark: SparkSession) -> Any:
"""Get db utils on databricks.
Args:
spark: spark session.
Returns:
Dbutils from databricks.
"""
try:
from pyspark.dbutils import DBUtils
if "dbutils" not in locals():
dbutils = DBUtils(spark)
else:
dbutils = locals().get("dbutils")
except ImportError:
import IPython
dbutils = IPython.get_ipython().user_ns["dbutils"]
return dbutils
@staticmethod
def get_databricks_job_information(spark: SparkSession) -> Tuple[str, str]:
"""Get notebook context from running acon.
Args:
spark: spark session.
Returns:
Dict containing databricks notebook context.
"""
dbutils = DatabricksUtils.get_db_utils(spark)
notebook_context = json.loads(
(
dbutils.notebook.entry_point.getDbutils()
.notebook()
.getContext()
.safeToJson()
)
)
return notebook_context["attributes"].get("orgId"), notebook_context[
"attributes"
].get("jobName")
@staticmethod
def _get_dp_name(job_name: str) -> str:
"""Extract the dp_name from a Databricks job name.
The job name is expected to have a suffix separated by '-', and the dp_name is
the part before the last '-'. Only '_' is used in the rest of the job name.
E.g. for the job name 'sadp-template-my_awesome_job', the dp_name is 'sadp-template'.
Args:
job_name: The Databricks job name string.
Returns:
The extracted dp_name.
"""
return job_name.rsplit("-", 1)[0] if job_name and "-" in job_name else job_name
@staticmethod
def get_spark_conf_values(usage_stats: dict, spark_confs: dict) -> None:
"""Get information from spark session configurations.
Args:
usage_stats: usage_stats dictionary file.
spark_confs: optional dictionary with the spark tags to be used when
collecting the engine usage.
"""
from lakehouse_engine.core.exec_env import ExecEnv
spark_confs = (
EngineStats.DEF_SPARK_CONFS
if spark_confs is None
else EngineStats.DEF_SPARK_CONFS | spark_confs
)
for spark_conf_key, spark_conf_value in spark_confs.items():
# whenever the spark_conf_value has #, it means it is an array, so we need
# to split it and adequately process it
if "#" in spark_conf_value:
array_key = spark_conf_value.split("#")
array_values = ast.literal_eval(
ExecEnv.SESSION.conf.get(array_key[0], "[]")
)
final_value = [
key_val["value"]
for key_val in array_values
if key_val["key"] == array_key[1]
]
usage_stats[spark_conf_key] = (
final_value[0] if len(final_value) > 0 else ""
)
else:
usage_stats[spark_conf_key] = ExecEnv.SESSION.conf.get(
spark_conf_value, ""
)
run_id_extracted = re.search("run-([1-9]\\w+)", usage_stats.get("run_id", ""))
usage_stats["run_id"] = run_id_extracted.group(1) if run_id_extracted else ""
@classmethod
def get_usage_context_for_serverless(cls, usage_stats: dict) -> None:
"""Get information from the execution environment for serverless scenarios.
Since in serverless environments we might not have access to all the spark
confs we want to collect, we will try to get that information from the
execution environment when possible.
Args:
usage_stats: usage_stats dictionary file.
"""
try:
from dbruntime.databricks_repl_context import get_context
from lakehouse_engine.core.exec_env import ExecEnv
context = get_context()
for key, attr in EngineStats.DEF_DATABRICKS_CONTEXT_KEYS.items():
if key == "dp_name":
usage_stats[key] = DatabricksUtils._get_dp_name(
getattr(context, attr, None)
)
elif key == "environment":
usage_stats[key] = ExecEnv.get_environment()
else:
usage_stats[key] = getattr(context, attr, None)
except Exception as ex:
cls._LOGGER.error(f"Error getting Serverless Usage Context: {ex}")
================================================
FILE: lakehouse_engine/utils/dq_utils.py
================================================
"""Module containing utils for DQ processing."""
from json import loads
from pyspark.sql.functions import col, from_json, schema_of_json, struct
from lakehouse_engine.core.definitions import DQSpec, DQTableBaseParameters, DQType
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.dq_processors.exceptions import DQSpecMalformedException
from lakehouse_engine.utils.logging_handler import LoggingHandler
_LOGGER = LoggingHandler(__name__).get_logger()
class DQUtils:
"""Utils related to the data quality process."""
@staticmethod
def import_dq_rules_from_table(
spec: dict,
execution_point: str,
base_expectation_arguments: list,
extra_meta_arguments: list,
) -> dict:
"""Import dq rules from a table.
Args:
spec: data quality specification.
execution_point: if the execution is in_motion or at_rest.
base_expectation_arguments: base arguments for dq functions.
extra_meta_arguments: extra meta arguments for dq functions.
Returns:
The dictionary containing the dq spec with dq functions defined.
"""
dq_db_table = spec["dq_db_table"]
dq_functions = []
if spec.get("dq_table_table_filter"):
dq_table_table_filter = spec["dq_table_table_filter"]
else:
raise DQSpecMalformedException(
"When importing rules from a table "
"dq_table_table_filter must be defined."
)
extra_filters_query = (
f""" and {spec["dq_table_extra_filters"]}"""
if spec.get("dq_table_extra_filters")
else ""
)
fields = base_expectation_arguments + extra_meta_arguments
dq_functions_query = f"""
SELECT {", ".join(fields)}
FROM {dq_db_table}
WHERE
execution_point='{execution_point}' and table = '{dq_table_table_filter}'
{extra_filters_query}""" # nosec: B608
raw_dq_functions = ExecEnv.SESSION.sql(dq_functions_query)
arguments = raw_dq_functions.select("arguments").collect()
parsed_arguments = [loads(argument.arguments) for argument in arguments]
combined_dict: dict = {}
for argument in parsed_arguments:
combined_dict = {**combined_dict, **argument}
dq_function_arguments_schema = schema_of_json(str(combined_dict))
processed_dq_functions = (
raw_dq_functions.withColumn(
"json_data", from_json(col("arguments"), dq_function_arguments_schema)
)
.withColumn(
"parsed_arguments",
struct(
col("json_data.*"),
struct(extra_meta_arguments).alias("meta"),
),
)
.drop(col("json_data"))
)
unique_dq_functions = processed_dq_functions.drop_duplicates(
["dq_tech_function", "arguments"]
)
duplicated_rows = processed_dq_functions.subtract(unique_dq_functions)
if duplicated_rows.count() > 0:
_LOGGER.warning("Found Duplicates Rows:")
duplicated_rows.show(truncate=False)
processed_dq_functions_list = unique_dq_functions.collect()
for processed_dq_function in processed_dq_functions_list:
dq_functions.append(
{
"function": f"{processed_dq_function.dq_tech_function}",
"args": {
k: v
for k, v in processed_dq_function.parsed_arguments.asDict(
recursive=True
).items()
if v is not None
},
}
)
spec["dq_functions"] = dq_functions
return spec
@staticmethod
def validate_dq_functions(
spec: dict, execution_point: str = "", extra_meta_arguments: list = None
) -> None:
"""Function to validate the dq functions defined in the dq_spec.
This function validates that the defined dq_functions contain all
the fields defined in the extra_meta_arguments parameter.
Args:
spec: data quality specification.
execution_point: if the execution is in_motion or at_rest.
extra_meta_arguments: extra meta arguments for dq functions.
Raises:
DQSpecMalformedException: If the dq spec is malformed.
"""
dq_functions = spec["dq_functions"]
if not extra_meta_arguments:
_LOGGER.info(
"No extra meta parameters defined. "
"Skipping validation of imported dq rule."
)
return
for dq_function in dq_functions:
if not dq_function.get("args").get("meta", None):
raise DQSpecMalformedException(
"The dq function must have a meta field containing all "
f"the fields defined: {extra_meta_arguments}."
)
else:
meta = dq_function["args"]["meta"]
given_keys = meta.keys()
missing_keys = sorted(set(extra_meta_arguments) - set(given_keys))
if missing_keys:
raise DQSpecMalformedException(
"The dq function meta field must contain all the "
f"fields defined: {extra_meta_arguments}.\n"
f"Found fields: {list(given_keys)}.\n"
f"Diff: {list(missing_keys)}"
)
if execution_point and meta["execution_point"] != execution_point:
raise DQSpecMalformedException(
"The dq function execution point must be the same as "
"the execution point of the dq spec."
)
class PrismaUtils:
"""Prisma related utils."""
@staticmethod
def build_prisma_dq_spec(spec: dict, execution_point: str) -> dict:
"""Fetch dq functions from given table.
Args:
spec: data quality specification.
execution_point: if the execution is in_motion or at_rest.
Returns:
The dictionary containing the dq spec with dq functions defined.
"""
if spec.get("dq_db_table"):
spec = DQUtils.import_dq_rules_from_table(
spec,
execution_point,
DQTableBaseParameters.PRISMA_BASE_PARAMETERS.value,
ExecEnv.ENGINE_CONFIG.dq_functions_column_list,
)
elif spec.get("dq_functions"):
DQUtils.validate_dq_functions(
spec,
execution_point,
ExecEnv.ENGINE_CONFIG.dq_functions_column_list,
)
else:
raise DQSpecMalformedException(
"When using PRISMA either dq_db_table or "
"dq_functions needs to be defined."
)
dq_bucket = (
ExecEnv.ENGINE_CONFIG.dq_bucket
if ExecEnv.get_environment() == "prod"
else ExecEnv.ENGINE_CONFIG.dq_dev_bucket
)
spec["critical_functions"] = []
spec["execution_point"] = execution_point
spec["result_sink_db_table"] = None
spec["result_sink_explode"] = True
spec["fail_on_error"] = spec.get("fail_on_error", False)
spec["max_percentage_failure"] = spec.get("max_percentage_failure", 1)
if not spec.get("result_sink_extra_columns", None):
spec["result_sink_extra_columns"] = [
"validation_results.expectation_config.meta",
]
else:
spec["result_sink_extra_columns"] = [
"validation_results.expectation_config.meta",
] + spec["result_sink_extra_columns"]
if not spec.get("data_product_name", None):
raise DQSpecMalformedException(
"When using PRISMA DQ data_product_name must be defined."
)
spec["result_sink_location"] = (
f"{dq_bucket}/{spec['data_product_name']}/result_sink/"
)
spec["processed_keys_location"] = (
f"{dq_bucket}/{spec['data_product_name']}/dq_processed_keys/"
)
if not spec.get("tbl_to_derive_pk", None) and not spec.get(
"unexpected_rows_pk", None
):
raise DQSpecMalformedException(
"When using PRISMA DQ either "
"tbl_to_derive_pk or unexpected_rows_pk need to be defined."
)
return spec
@staticmethod
def validate_rule_id_duplication(
specs: list[DQSpec],
) -> dict[str, str]:
"""Verify uniqueness of the dq_rule_id.
Args:
specs: a list of DQSpec to be validated
Returns:
A dictionary with the spec_id as key and
rule_id as value for any duplicates.
"""
error_dict = {}
for spec in specs:
dq_db_table = spec.dq_db_table
dq_functions = spec.dq_functions
spec_id = spec.spec_id
if spec.dq_type == DQType.PRISMA.value and dq_db_table:
dq_rule_id_query = f"""
SELECT dq_rule_id, COUNT(*) AS count
FROM {dq_db_table}
GROUP BY dq_rule_id
HAVING COUNT(*) > 1;
""" # nosec: B608
duplicate_rule_id_table = ExecEnv.SESSION.sql(dq_rule_id_query)
if not duplicate_rule_id_table.isEmpty():
rows = duplicate_rule_id_table.collect()
df_str = "; ".join([str(row) for row in rows])
error_dict[f"dq_spec_id: {spec_id}"] = df_str
elif spec.dq_type == DQType.PRISMA.value and dq_functions:
dq_rules_id_list = []
for dq_function in dq_functions:
dq_rules_id_list.append(dq_function.args["meta"]["dq_rule_id"])
if len(dq_rules_id_list) != len(set(dq_rules_id_list)):
error_dict[f"dq_spec_id: {spec_id}"] = "; ".join(
[str(dq_rule_id) for dq_rule_id in dq_rules_id_list]
)
return error_dict
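# A hedged sketch of the dq_spec shape that validate_dq_functions accepts: each
# dq_function must carry a 'meta' block with every extra meta argument, and its
# execution_point must match the one passed in. The rule id, column and meta
# field names below are hypothetical; running this requires the module's
# imports (pyspark, ExecEnv) to resolve in your environment.
if __name__ == "__main__":
    _spec = {
        "dq_functions": [
            {
                "function": "expect_column_values_to_not_be_null",
                "args": {
                    "column": "order_id",
                    "meta": {"dq_rule_id": "rule_1", "execution_point": "at_rest"},
                },
            }
        ]
    }
    # raises DQSpecMalformedException if any listed meta field is missing
    DQUtils.validate_dq_functions(_spec, "at_rest", ["dq_rule_id", "execution_point"])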
================================================
FILE: lakehouse_engine/utils/engine_usage_stats.py
================================================
"""Utilities for recording the engine activity."""
import json
from datetime import datetime
from urllib.parse import urlparse
from lakehouse_engine.core.definitions import CollectEngineUsage
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.configs.config_utils import ConfigUtils
from lakehouse_engine.utils.databricks_utils import DatabricksUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.storage.file_storage_functions import FileStorageFunctions
class EngineUsageStats(object):
"""Engine Usage utilities class."""
_LOGGER = LoggingHandler(__name__).get_logger()
@classmethod
def store_engine_usage(
cls,
acon: dict,
func_name: str,
collect_engine_usage: str = None,
spark_confs: dict = None,
) -> None:
"""Collects and store Lakehouse Engine usage statistics.
These statistics include the acon and other relevant information, such as
the lakehouse engine version and the functions/algorithms being used.
Args:
acon: acon dictionary file.
func_name: function name that called this log acon.
collect_engine_usage: Lakehouse usage statistics collection strategy.
spark_confs: optional dictionary with the spark confs to be used when
collecting the engine usage.
"""
if not cls._should_collect_usage(collect_engine_usage):
return
try:
start_timestamp = datetime.now()
timestamp_str = start_timestamp.strftime("%Y%m%d%H%M%S")
usage_stats = cls._prepare_usage_stats(acon, spark_confs)
engine_usage_path = cls._select_usage_path(
usage_stats, collect_engine_usage
)
if engine_usage_path is None:
return
cls._add_metadata_to_stats(usage_stats, func_name, start_timestamp)
log_file_name = f"eng_usage_{func_name}_{timestamp_str}.json"
usage_stats_str = json.dumps(usage_stats, default=str)
url = urlparse(
f"{engine_usage_path}/{usage_stats['dp_name']}/"
f"{start_timestamp.year}/{start_timestamp.month}/"
f"{log_file_name}",
allow_fragments=False,
)
try:
FileStorageFunctions.write_payload(
engine_usage_path, url, usage_stats_str
)
cls._LOGGER.info("Storing Lakehouse Engine usage statistics")
except FileNotFoundError as e:
cls._LOGGER.error(f"Could not write engine stats into file: {e}.")
except Exception as e:
cls._LOGGER.error(
"Failed while collecting the lakehouse engine stats: "
f"Unexpected {e=}, {type(e)=}."
)
@classmethod
def _should_collect_usage(cls, collect_engine_usage: str) -> bool:
return (
collect_engine_usage
in [CollectEngineUsage.ENABLED.value, CollectEngineUsage.PROD_ONLY.value]
or ExecEnv.ENGINE_CONFIG.collect_engine_usage
in CollectEngineUsage.ENABLED.value
)
@classmethod
def _prepare_usage_stats(cls, acon: dict, spark_confs: dict) -> dict:
usage_stats = {"acon": ConfigUtils.remove_sensitive_info(acon)}
if not ExecEnv.IS_SERVERLESS:
DatabricksUtils.get_spark_conf_values(usage_stats, spark_confs)
else:
DatabricksUtils.get_usage_context_for_serverless(usage_stats)
return usage_stats
@classmethod
def _select_usage_path(
cls, usage_stats: dict, collect_engine_usage: str
) -> str | None:
if usage_stats.get("environment") == "prod":
return ExecEnv.ENGINE_CONFIG.engine_usage_path
elif collect_engine_usage != CollectEngineUsage.PROD_ONLY.value:
return ExecEnv.ENGINE_CONFIG.engine_dev_usage_path
return None
@classmethod
def _add_metadata_to_stats(
cls, usage_stats: dict, func_name: str, start_timestamp: datetime
) -> None:
usage_stats["function"] = func_name
usage_stats["engine_version"] = ConfigUtils.get_engine_version()
usage_stats["start_timestamp"] = start_timestamp
usage_stats["year"] = start_timestamp.year
usage_stats["month"] = start_timestamp.month
================================================
FILE: lakehouse_engine/utils/expectations_utils.py
================================================
"""Utilities to be used by custom expectations."""
from typing import Any, Dict
def validate_result(
expectation_configuration: Any,
metrics: dict,
) -> None:
"""Validates that the unexpected_index_list in the tests is corretly defined.
Additionally, it validates the expectation using the GE _validate method.
Args:
expectation_configuration: Expectation configuration.
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution engine used in the expectation.
base_expectation: Base expectation to validate.
"""
example_unexpected_index_list = _get_example_unexpected_index_list(
expectation_configuration
)
test_unexpected_index_list = _get_test_unexpected_index_list(
expectation_configuration.map_metric, metrics
)
if example_unexpected_index_list:
if example_unexpected_index_list != test_unexpected_index_list:
raise AssertionError(
f"Example unexpected_index_list: {example_unexpected_index_list}\n"
f"Test unexpected_index_list: {test_unexpected_index_list}"
)
def _get_example_unexpected_index_list(expectation_configuration: Any) -> list:
"""Retrieves the unexpected index list defined from the example used on the test.
This needs to be done manually because GE allows us to get either the complete
output of the test or the complete configuration used on the test.
To get around this limitation this function is used to fetch the example used
in the test directly from the expectation itself.
Args:
expectation_configuration: Expectation configuration.
Returns:
List of unexpected indexes defined in the example used.
"""
filtered_example: dict = {"out": {"unexpected_index_list": []}}
for example in expectation_configuration.examples:
for test in example["tests"]: # type: ignore
example_result_format = []
if "result_format" in expectation_configuration.result_format:
example_result_format = expectation_configuration.result_format
if test["in"]["result_format"] == example_result_format:
filtered_example = test
example_unexpected_index_list = []
if "unexpected_index_list" in filtered_example["out"]:
example_unexpected_index_list = filtered_example["out"]["unexpected_index_list"]
return example_unexpected_index_list
def _get_test_unexpected_index_list(metric_name: str, metrics: Dict) -> list:
"""Retrieves the unexpected index list from the test case that has been run.
Args:
metric_name: Name of the metric to retrieve the unexpected index list.
metrics: Metric values resulting from the test.
Returns:
List of unexpected indexes retrieved from the test.
"""
test_unexpected_index_list = []
if f"{metric_name}.unexpected_index_list" in metrics:
if metrics[f"{metric_name}.unexpected_index_list"]:
test_unexpected_index_list = metrics[f"{metric_name}.unexpected_index_list"]
else:
test_unexpected_index_list = []
return test_unexpected_index_list
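# A hedged sketch of the metrics lookup done above: the unexpected index list
# is read from the '<metric>.unexpected_index_list' entry when present,
# otherwise an empty list is returned. The metric name below is hypothetical.
if __name__ == "__main__":
    _metrics = {"column_values.custom.unexpected_index_list": [2, 5]}
    assert _get_test_unexpected_index_list("column_values.custom", _metrics) == [2, 5]
    assert _get_test_unexpected_index_list("another_metric", {}) == []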
================================================
FILE: lakehouse_engine/utils/extraction/__init__.py
================================================
"""Extraction utilities package."""
================================================
FILE: lakehouse_engine/utils/extraction/jdbc_extraction_utils.py
================================================
"""Utilities module for JDBC extraction processes."""
from abc import abstractmethod
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from logging import Logger
from typing import Any, Dict, List, Optional, Tuple
from lakehouse_engine.core.definitions import InputFormat, InputSpec, ReadType
from lakehouse_engine.utils.logging_handler import LoggingHandler
class JDBCExtractionType(Enum):
"""Standardize the types of extractions we can have from a JDBC source."""
INIT = "init"
DELTA = "delta"
@dataclass
class JDBCExtraction(object):
"""Configurations available for an Extraction from a JDBC source.
These configurations cover:
- user: username to connect to JDBC source.
- password: password to connect to JDBC source (always use secrets,
don't use text passwords in your code).
- url: url to connect to JDBC source.
- dbtable: `database.table` to extract data from.
- calc_upper_bound_schema: custom schema used for the upper bound calculation.
- changelog_table: table of type changelog from which to extract data,
when the extraction type is delta.
- partition_column: column used to split the extraction.
- latest_timestamp_data_location: data location (e.g., s3) containing the data
to get the latest timestamp already loaded into bronze.
- latest_timestamp_data_format: the format of the dataset in
latest_timestamp_data_location. Default: delta.
- extraction_type: type of extraction (delta or init). Default: "delta".
- driver: JDBC driver name. Default: "com.sap.db.jdbc.Driver".
- num_partitions: number of Spark partitions to split the extraction.
- lower_bound: lower bound to decide the partition stride.
- upper_bound: upper bound to decide the partition stride. If
calculate_upper_bound is True, then upperBound will be
derived by our upper bound optimizer, using the partition column.
- default_upper_bound: the value to use as default upper bound in case
the result of the upper bound calculation is None. Default: "1".
- fetch_size: how many rows to fetch per round trip. Default: "100000".
- compress: enable network compression. Default: True.
- custom_schema: specify custom_schema for particular columns of the
returned dataframe in the init/delta extraction of the source table.
- min_timestamp: min timestamp to consider to filter the changelog data.
Default: None and automatically derived from the location provided.
In case this one is provided it has precedence and the calculation
is not done.
- max_timestamp: max timestamp to consider to filter the changelog data.
Default: None and automatically derived from the table having information
about the extraction requests, their timestamps and their status.
In case this one is provided it has precedence and the calculation
is not done.
- generate_predicates: whether to generate predicates automatically or not.
Default: False.
- predicates: list containing all values to partition (if generate_predicates
is used, the manual values provided are ignored). Default: None.
- predicates_add_null: whether to consider null on predicates list.
Default: True.
- extraction_timestamp: the timestamp of the extraction. Default: current time
following the format "%Y%m%d%H%M%S".
- max_timestamp_custom_schema: custom schema used on the max_timestamp derivation
from the table holding the extraction requests information.
"""
user: str
password: str
url: str
dbtable: str
calc_upper_bound_schema: Optional[str] = None
changelog_table: Optional[str] = None
partition_column: Optional[str] = None
latest_timestamp_data_location: Optional[str] = None
latest_timestamp_data_format: str = InputFormat.DELTAFILES.value
extraction_type: str = JDBCExtractionType.DELTA.value
driver: str = "com.sap.db.jdbc.Driver"
num_partitions: Optional[int] = None
lower_bound: Optional[int | float | str] = None
upper_bound: Optional[int | float | str] = None
default_upper_bound: str = "1"
fetch_size: str = "100000"
compress: bool = True
custom_schema: Optional[str] = None
min_timestamp: Optional[str] = None
max_timestamp: Optional[str] = None
generate_predicates: bool = False
predicates: Optional[List] = None
predicates_add_null: bool = True
extraction_timestamp: str = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
max_timestamp_custom_schema: Optional[str] = None
class JDBCExtractionUtils(object):
"""Utils for managing data extraction from particularly relevant JDBC sources."""
def __init__(self, jdbc_extraction: Any):
"""Construct JDBCExtractionUtils.
Args:
jdbc_extraction: JDBC Extraction configurations. Can be of type:
JDBCExtraction, SAPB4Extraction or SAPBWExtraction.
"""
self._LOGGER: Logger = LoggingHandler(__name__).get_logger()
self._JDBC_EXTRACTION = jdbc_extraction
@staticmethod
def get_additional_spark_options(
input_spec: InputSpec, options: dict, ignore_options: List = None
) -> dict:
"""Helper to get additional Spark Options initially passed.
If people provide additional Spark options, not covered by the util function
arguments (get_spark_jdbc_options), we need to consider them.
Thus, we update the options retrieved by the utils, by checking if there is
any Spark option initially provided that is not yet considered in the retrieved
options or function arguments and if the value for the key is not None.
If these conditions are met, we add the options and return the complete dict.
Args:
input_spec: the input specification.
options: dict with Spark options.
ignore_options: list of options to be ignored by the process.
Spark reads can be parallelised in two mutually exclusive
ways: either by upper/lower bound or by predicates, so you
must choose one of them. When choosing predicates, you cannot
pass the lower bound, upper bound, number of partitions or
partition column, otherwise Spark will assume a bound-based
partitioning and expect all of those variables to be filled.
To avoid hardcoding every predicate in the acon, there is a
feature that automatically generates all predicates for an
init or delta load based on the input partition column. In
that case the partition column cannot be passed to the options
(predicates execution was chosen), which is why some options
need to be ignored when generating predicates.
Returns:
a dict with all the options passed as argument, plus the options that
were initially provided, but were not used in the util
(get_spark_jdbc_options).
"""
func_args = JDBCExtractionUtils.get_spark_jdbc_options.__code__.co_varnames
if ignore_options is None:
ignore_options = []
ignore_options = ignore_options + list(options.keys()) + list(func_args)
return {
key: value
for key, value in input_spec.options.items()
if key not in ignore_options and value is not None
}
def get_predicates(self, predicates_query: str) -> List:
"""Get the predicates list, based on a predicates query.
Args:
predicates_query: query to use as the basis to get the distinct values for
a specified column, based on which predicates are generated.
Returns:
List containing the predicates to use to split the extraction from
JDBC sources.
"""
jdbc_args = {
"url": self._JDBC_EXTRACTION.url,
"table": predicates_query,
"properties": {
"user": self._JDBC_EXTRACTION.user,
"password": self._JDBC_EXTRACTION.password,
"driver": self._JDBC_EXTRACTION.driver,
},
}
from lakehouse_engine.io.reader_factory import ReaderFactory
predicates_df = ReaderFactory.get_data(
InputSpec(
spec_id="get_predicates",
data_format=InputFormat.JDBC.value,
read_type=ReadType.BATCH.value,
jdbc_args=jdbc_args,
)
)
predicates_list = [
f"{self._JDBC_EXTRACTION.partition_column}='{row[0]}'"
for row in predicates_df.collect()
]
if self._JDBC_EXTRACTION.predicates_add_null:
predicates_list.append(f"{self._JDBC_EXTRACTION.partition_column} IS NULL")
self._LOGGER.info(
f"The following predicate list was generated: {predicates_list}"
)
return predicates_list
def get_spark_jdbc_options(self) -> Tuple[dict, dict]:
"""Get the Spark options to extract data from a JDBC source.
Returns:
The Spark jdbc args dictionary, including the query to submit
and also options args dictionary.
"""
options_args: Dict[str, Any] = {
"fetchSize": self._JDBC_EXTRACTION.fetch_size,
"compress": self._JDBC_EXTRACTION.compress,
}
jdbc_args = {
"url": self._JDBC_EXTRACTION.url,
"properties": {
"user": self._JDBC_EXTRACTION.user,
"password": self._JDBC_EXTRACTION.password,
"driver": self._JDBC_EXTRACTION.driver,
},
}
if self._JDBC_EXTRACTION.extraction_type == JDBCExtractionType.DELTA.value:
jdbc_args["table"], predicates_query = self._get_delta_query()
else:
jdbc_args["table"], predicates_query = self._get_init_query()
if self._JDBC_EXTRACTION.custom_schema:
options_args["customSchema"] = self._JDBC_EXTRACTION.custom_schema
if self._JDBC_EXTRACTION.generate_predicates:
jdbc_args["predicates"] = self.get_predicates(predicates_query)
else:
if self._JDBC_EXTRACTION.predicates:
jdbc_args["predicates"] = self._JDBC_EXTRACTION.predicates
else:
options_args = self._get_extraction_partition_opts(
options_args,
)
return options_args, jdbc_args
def get_spark_jdbc_optimal_upper_bound(self) -> Any:
"""Get an optimal upperBound to properly split a Spark JDBC extraction.
Returns:
Either an int, date or timestamp to serve as upperBound Spark JDBC option.
"""
options = {}
if self._JDBC_EXTRACTION.calc_upper_bound_schema:
options["customSchema"] = self._JDBC_EXTRACTION.calc_upper_bound_schema
table = (
self._JDBC_EXTRACTION.dbtable
if self._JDBC_EXTRACTION.extraction_type == JDBCExtractionType.INIT.value
else self._JDBC_EXTRACTION.changelog_table
)
jdbc_args = {
"url": self._JDBC_EXTRACTION.url,
"table": f"(SELECT COALESCE(MAX({self._JDBC_EXTRACTION.partition_column}), "
f"{self._JDBC_EXTRACTION.default_upper_bound}) "
f"upper_bound FROM {table})", # nosec: B608
"properties": {
"user": self._JDBC_EXTRACTION.user,
"password": self._JDBC_EXTRACTION.password,
"driver": self._JDBC_EXTRACTION.driver,
},
}
from lakehouse_engine.io.reader_factory import ReaderFactory
upper_bound_df = ReaderFactory.get_data(
InputSpec(
spec_id="get_optimal_upper_bound",
data_format=InputFormat.JDBC.value,
read_type=ReadType.BATCH.value,
jdbc_args=jdbc_args,
options=options,
)
)
upper_bound = upper_bound_df.first()[0]
if upper_bound is not None:
self._LOGGER.info(
f"Upper Bound '{upper_bound}' derived from "
f"'{self._JDBC_EXTRACTION.dbtable}' using the column "
f"'{self._JDBC_EXTRACTION.partition_column}'"
)
return upper_bound
else:
raise AttributeError(
f"Not able to calculate upper bound from "
f"'{self._JDBC_EXTRACTION.dbtable}' using "
f"the column '{self._JDBC_EXTRACTION.partition_column}'"
)
def _get_extraction_partition_opts(
self,
options_args: dict,
) -> dict:
"""Get an options dict with custom extraction partition options.
Args:
options_args: spark jdbc reader options.
"""
if self._JDBC_EXTRACTION.num_partitions:
options_args["numPartitions"] = self._JDBC_EXTRACTION.num_partitions
if self._JDBC_EXTRACTION.upper_bound:
options_args["upperBound"] = self._JDBC_EXTRACTION.upper_bound
if self._JDBC_EXTRACTION.lower_bound:
options_args["lowerBound"] = self._JDBC_EXTRACTION.lower_bound
if self._JDBC_EXTRACTION.partition_column:
options_args["partitionColumn"] = self._JDBC_EXTRACTION.partition_column
return options_args
def _get_max_timestamp(self, max_timestamp_query: str) -> str:
"""Get the max timestamp, based on the provided query.
Args:
max_timestamp_query: the query used to derive the max timestamp.
Returns:
A string having the max timestamp.
"""
jdbc_args = {
"url": self._JDBC_EXTRACTION.url,
"table": max_timestamp_query,
"properties": {
"user": self._JDBC_EXTRACTION.user,
"password": self._JDBC_EXTRACTION.password,
"driver": self._JDBC_EXTRACTION.driver,
},
}
from lakehouse_engine.io.reader_factory import ReaderFactory
max_timestamp_df = ReaderFactory.get_data(
InputSpec(
spec_id="get_max_timestamp",
data_format=InputFormat.JDBC.value,
read_type=ReadType.BATCH.value,
jdbc_args=jdbc_args,
options={
"customSchema": self._JDBC_EXTRACTION.max_timestamp_custom_schema
},
)
)
max_timestamp = max_timestamp_df.first()[0]
self._LOGGER.info(
f"Max timestamp {max_timestamp} derived from query: {max_timestamp_query}"
)
return str(max_timestamp)
@abstractmethod
def _get_delta_query(self) -> Tuple[str, str]:
"""Get a query to extract delta (partially) from a source."""
pass
@abstractmethod
def _get_init_query(self) -> Tuple[str, str]:
"""Get a query to extract init (fully) from a source."""
pass
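# A hedged sketch of a JDBCExtraction configuration for an init load. The
# connection values are placeholders; in real use the password should come
# from a secrets mechanism, and the concrete SAP B4/BW subclasses are normally
# used instead of this base dataclass.
if __name__ == "__main__":
    _extraction = JDBCExtraction(
        user="db_user",
        password="placeholder-use-a-secret",
        url="jdbc:sap://myhost:30015",
        dbtable="SAPHANADB.MY_TABLE",
        partition_column="RECORD",
        num_partitions=10,
        extraction_type=JDBCExtractionType.INIT.value,
    )
    print(_extraction.driver, _extraction.fetch_size, _extraction.extraction_type)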
================================================
FILE: lakehouse_engine/utils/extraction/sap_b4_extraction_utils.py
================================================
"""Utilities module for SAP B4 extraction processes."""
import re
from dataclasses import dataclass
from enum import Enum
from logging import Logger
from typing import Any, Optional, Tuple
from lakehouse_engine.core.definitions import InputSpec, ReadType
from lakehouse_engine.transformers.aggregators import Aggregators
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
JDBCExtraction,
JDBCExtractionUtils,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
class ADSOTypes(Enum):
"""Standardise the types of ADSOs we can have for Extractions from SAP B4."""
AQ = "AQ"
CL = "CL"
SUPPORTED_TYPES = [AQ, CL]
@dataclass
class SAPB4Extraction(JDBCExtraction):
"""Configurations available for an Extraction from SAP B4.
It inherits from JDBCExtraction configurations, so it can use
and/or overwrite those configurations.
These configurations cover:
- latest_timestamp_input_col: the column containing the request timestamps
in the dataset in latest_timestamp_data_location. Default: REQTSN.
- request_status_tbl: the name of the SAP B4 table having information
about the extraction requests. Composed of database.table.
Default: SAPHANADB.RSPMREQUEST.
- request_col_name: name of the column having the request timestamp to join
with the request status table. Default: REQUEST_TSN.
- data_target: the data target to extract from. Used in the join operation with
the request status table.
- act_req_join_condition: the join condition into activation table
can be changed using this property.
Default: 'tbl.reqtsn = req.request_col_name'.
- include_changelog_tech_cols: whether to include the technical columns
(usually coming from the changelog) table or not.
- extra_cols_req_status_tbl: columns to be added from request status table.
It needs to contain the prefix "req.". E.g. "req.col1 as column_one,
req.col2 as column_two".
- request_status_tbl_filter: filter to use for filtering the request status table,
influencing the calculation of the max timestamps and the delta extractions.
- adso_type: the type of ADSO that you are extracting from. Can be "AQ" or "CL".
- max_timestamp_custom_schema: the custom schema to apply on the calculation of
the max timestamp to consider for the delta extractions.
Default: timestamp DECIMAL(23,0).
- default_max_timestamp: the timestamp to use as default, when it is not possible
to derive one.
- default_min_timestamp: the timestamp to use as default, when it is not possible
to derive one.
- custom_schema: specify custom_schema for particular columns of the
returned dataframe in the init/delta extraction of the source table.
"""
latest_timestamp_input_col: str = "REQTSN"
request_status_tbl: str = "SAPHANADB.RSPMREQUEST"
request_col_name: str = "REQUEST_TSN"
data_target: Optional[str] = None
act_req_join_condition: Optional[str] = None
include_changelog_tech_cols: Optional[bool] = None
extra_cols_req_status_tbl: Optional[str] = None
request_status_tbl_filter: Optional[str] = None
adso_type: Optional[str] = None
max_timestamp_custom_schema: str = "timestamp DECIMAL(23,0)"
default_max_timestamp: str = "1970000000000000000000"
default_min_timestamp: str = "1970000000000000000000"
custom_schema: str = "REQTSN DECIMAL(23,0)"
class SAPB4ExtractionUtils(JDBCExtractionUtils):
"""Utils for managing data extraction from SAP B4."""
def __init__(self, sap_b4_extraction: SAPB4Extraction):
"""Construct SAPB4ExtractionUtils.
Args:
sap_b4_extraction: SAP B4 Extraction configurations.
"""
self._LOGGER: Logger = LoggingHandler(__name__).get_logger()
self._B4_EXTRACTION = sap_b4_extraction
self._B4_EXTRACTION.request_status_tbl_filter = (
self._get_req_status_tbl_filter()
)
self._MAX_TIMESTAMP_QUERY = f""" --# nosec
(SELECT COALESCE(MAX({self._B4_EXTRACTION.request_col_name}),
{self._B4_EXTRACTION.default_max_timestamp}) as timestamp
FROM {self._B4_EXTRACTION.request_status_tbl}
WHERE {self._B4_EXTRACTION.request_status_tbl_filter})
""" # nosec: B608
super().__init__(sap_b4_extraction)
@staticmethod
def get_data_target(input_spec_opt: dict) -> str:
"""Get the data_target from the data_target option or derive it.
By definition, data_target is the same for the table and the changelog table:
it is the dbtable string ignoring everything up to the last '/' and stripping
the first and last characters of the remaining segment. E.g. for a dbtable
/BIC/abtable12, the data_target would be btable1.
Args:
input_spec_opt: options from the input_spec.
Returns:
A string with the data_target.
"""
exclude_chars = """["'\\\\]"""
data_target: str = input_spec_opt.get(
"data_target",
re.sub(exclude_chars, "", input_spec_opt["dbtable"]).split("/")[-1][1:-1],
)
return data_target
def _get_init_query(self) -> Tuple[str, str]:
"""Get a query to do an init load based on a ADSO on a SAP B4 system.
Returns:
A query to submit to SAP B4 for the initial data extraction. The query
is enclosed in parentheses so that Spark treats it as a table and supports
it in the dbtable option.
"""
extraction_query = self._get_init_extraction_query()
predicates_query = f"""
(SELECT DISTINCT({self._B4_EXTRACTION.partition_column})
FROM {self._B4_EXTRACTION.dbtable} t)
""" # nosec: B608
return extraction_query, predicates_query
def _get_init_extraction_query(self) -> str:
"""Get the init extraction query based on current timestamp.
Returns:
A query to submit to SAP B4 for the initial data extraction.
"""
changelog_tech_cols = (
f"""{self._B4_EXTRACTION.extraction_timestamp}000000000 AS reqtsn,
'0' AS datapakid,
0 AS record,"""
if self._B4_EXTRACTION.include_changelog_tech_cols
else ""
)
extraction_query = f"""
(SELECT t.*, {changelog_tech_cols}
CAST({self._B4_EXTRACTION.extraction_timestamp}
AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM {self._B4_EXTRACTION.dbtable} t
)""" # nosec: B608
return extraction_query
def _get_delta_query(self) -> Tuple[str, str]:
"""Get a delta query for an SAP B4 ADSO.
An SAP B4 ADSO requires a join with a special type of table often called
requests status table (RSPMREQUEST), in which B4 tracks down the timestamps,
status and metrics associated with the several data loads that were performed
into B4. Depending on the type of ADSO (AQ or CL) the join condition and also
the ADSO/table to consider to extract from will be different.
For AQ types, there is only the active table, from which we extract both inits
and deltas and this is also the table used to join with RSPMREQUEST to derive
the next portion of the data to extract.
For the CL types, we have an active table/adso from which we extract the init
and one changelog table from which we extract the delta portions of data.
Depending, if it is an init or delta one table or the other is also used to join
with RSPMREQUEST.
The logic on this function basically ensures that we are reading from the source
table considering the data that has arrived between the maximum timestamp that
is available in our target destination and the max timestamp of the extractions
performed and registered in the RSPMREQUEST table, which follow the filtering
criteria.
Returns:
A query to submit to SAP B4 for the delta data extraction. The query
is enclosed in parentheses so that Spark treats it as a table and supports
it in the dbtable option.
"""
if not self._B4_EXTRACTION.min_timestamp:
from lakehouse_engine.io.reader_factory import ReaderFactory
latest_timestamp_data_df = ReaderFactory.get_data(
InputSpec(
spec_id="data_with_latest_timestamp",
data_format=self._B4_EXTRACTION.latest_timestamp_data_format,
read_type=ReadType.BATCH.value,
location=self._B4_EXTRACTION.latest_timestamp_data_location,
)
)
min_timestamp = latest_timestamp_data_df.transform(
Aggregators.get_max_value(
self._B4_EXTRACTION.latest_timestamp_input_col
)
).first()[0]
else:
min_timestamp = self._B4_EXTRACTION.min_timestamp
min_timestamp = (
min_timestamp
if min_timestamp
else self._B4_EXTRACTION.default_min_timestamp
)
max_timestamp = (
self._B4_EXTRACTION.max_timestamp
if self._B4_EXTRACTION.max_timestamp
else self._get_max_timestamp(self._MAX_TIMESTAMP_QUERY)
)
if self._B4_EXTRACTION.act_req_join_condition:
join_condition = f"{self._B4_EXTRACTION.act_req_join_condition}"
else:
join_condition = f"tbl.reqtsn = req.{self._B4_EXTRACTION.request_col_name}"
base_query = f""" --# nosec
FROM {self._B4_EXTRACTION.changelog_table} AS tbl
JOIN {self._B4_EXTRACTION.request_status_tbl} AS req
ON {join_condition}
WHERE {self._B4_EXTRACTION.request_status_tbl_filter}
AND req.{self._B4_EXTRACTION.request_col_name} > {min_timestamp}
AND req.{self._B4_EXTRACTION.request_col_name} <= {max_timestamp})
"""
main_cols = f"""
(SELECT tbl.*,
CAST({self._B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0))
AS extraction_start_timestamp
"""
# We join the main columns considered for the extraction with
# extra_cols_req_status_tbl that people might want to use, filtering to only
# add the comma and join the strings, in case extra_cols_req_status_tbl is
# not None or empty.
extraction_query_cols = ",".join(
filter(None, [main_cols, self._B4_EXTRACTION.extra_cols_req_status_tbl])
)
extraction_query = extraction_query_cols + base_query
predicates_query = f"""
(SELECT DISTINCT({self._B4_EXTRACTION.partition_column})
{base_query}
"""
return extraction_query, predicates_query
def _get_req_status_tbl_filter(self) -> Any:
if self._B4_EXTRACTION.request_status_tbl_filter:
return self._B4_EXTRACTION.request_status_tbl_filter
else:
if self._B4_EXTRACTION.adso_type == ADSOTypes.AQ.value:
return f"""
STORAGE = 'AQ' AND REQUEST_IS_IN_PROCESS = 'N' AND
LAST_OPERATION_TYPE IN ('C', 'U') AND REQUEST_STATUS IN ('GG', 'GR')
AND UPPER(DATATARGET) = UPPER('{self._B4_EXTRACTION.data_target}')
"""
elif self._B4_EXTRACTION.adso_type == ADSOTypes.CL.value:
return f"""
STORAGE = 'AT' AND REQUEST_IS_IN_PROCESS = 'N' AND
LAST_OPERATION_TYPE IN ('C', 'U') AND REQUEST_STATUS IN ('GG')
AND UPPER(DATATARGET) = UPPER('{self._B4_EXTRACTION.data_target}')
"""
else:
raise NotImplementedError(
f"The requested ADSO Type is not fully implemented and/or tested."
f"Supported ADSO Types: {ADSOTypes.SUPPORTED_TYPES}"
)
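# A hedged sketch of the data_target derivation described above. The dbtable
# values are hypothetical; an explicit 'data_target' option takes precedence
# over the derivation.
if __name__ == "__main__":
    assert SAPB4ExtractionUtils.get_data_target({"dbtable": "/BIC/abtable12"}) == "btable1"
    assert (
        SAPB4ExtractionUtils.get_data_target(
            {"dbtable": "/BIC/abtable12", "data_target": "my_target"}
        )
        == "my_target"
    )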
================================================
FILE: lakehouse_engine/utils/extraction/sap_bw_extraction_utils.py
================================================
"""Utilities module for SAP BW extraction processes."""
from dataclasses import dataclass
from logging import Logger
from typing import Optional, Tuple
from lakehouse_engine.core.definitions import InputFormat, InputSpec, ReadType
from lakehouse_engine.transformers.aggregators import Aggregators
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
JDBCExtraction,
JDBCExtractionType,
JDBCExtractionUtils,
)
from lakehouse_engine.utils.logging_handler import LoggingHandler
@dataclass
class SAPBWExtraction(JDBCExtraction):
"""Configurations available for an Extraction from SAP BW.
It inherits from JDBCExtraction configurations, so it can use
and/or overwrite those configurations.
These configurations cover:
- latest_timestamp_input_col: the column containing the actrequest timestamp
in the dataset in latest_timestamp_data_location. Default:
"actrequest_timestamp".
- act_request_table: the name of the SAP BW activation requests table.
Composed of database.table. Default: SAPPHA.RSODSACTREQ.
- request_col_name: name of the column having the request to join
with the activation request table. Default: actrequest.
- act_req_join_condition: the join condition into activation table
can be changed using this property.
Default: 'changelog_tbl.request = act_req.request_col_name'.
- odsobject: name of BW Object, used for joining with the activation request
table to get the max actrequest_timestamp to consider while filtering
the changelog table.
- include_changelog_tech_cols: whether to include the technical columns
(usually coming from the changelog) table or not. Default: True.
- extra_cols_act_request: list of columns to be added from act request table.
It needs to contain the prefix "act_req.". E.g. "act_req.col1
as column_one, act_req.col2 as column_two".
- get_timestamp_from_act_request: whether to get init timestamp
from act request table or assume current/given timestamp.
- sap_bw_schema: sap bw schema. Default: SAPPHA.
- max_timestamp_custom_schema: the custom schema to apply on the calculation of
the max timestamp to consider for the delta extractions.
Default: timestamp DECIMAL(23,0).
- default_max_timestamp: the timestamp to use as default, when it is not possible
to derive one.
- default_min_timestamp: the timestamp to use as default, when it is not possible
to derive one.
- ods_prefix: the prefix to use when looking for the changelog table in SAP BW.
Default: "8".
- logsys: the BW source & receiver system ID to use to get the tsprefix
(prefix for transfer structures) which is used while deriving the changelog
table. Default: None & generated based on the schema.
"""
latest_timestamp_input_col: str = "actrequest_timestamp"
request_col_name: str = "actrequest"
act_req_join_condition: Optional[str] = None
odsobject: Optional[str] = None
include_changelog_tech_cols: bool = True
extra_cols_act_request: Optional[str] = None
get_timestamp_from_act_request: bool = False
sap_bw_schema: str = "SAPPHA"
act_request_table: str = f"{sap_bw_schema}.RSODSACTREQ"
max_timestamp_custom_schema: str = "timestamp DECIMAL(15,0)"
default_max_timestamp: str = "197000000000000"
default_min_timestamp: str = "197000000000000"
ods_prefix: str = "8"
logsys: Optional[str] = None
custom_schema: Optional[str] = "REQUEST VARCHAR(30), DATAPAKID VARCHAR(6)"
class SAPBWExtractionUtils(JDBCExtractionUtils):
"""Utils for managing data extraction from particularly relevant JDBC sources."""
def __init__(self, sap_bw_extraction: SAPBWExtraction):
"""Construct SAPBWExtractionUtils.
Args:
sap_bw_extraction: SAP BW Extraction configurations.
"""
self._LOGGER: Logger = LoggingHandler(__name__).get_logger()
self._BW_EXTRACTION = sap_bw_extraction
self._BW_EXTRACTION.changelog_table = self.get_changelog_table()
self._MAX_TIMESTAMP_QUERY = f""" --# nosec
(SELECT COALESCE(MAX(timestamp),
{self._BW_EXTRACTION.default_max_timestamp}) as timestamp
FROM {self._BW_EXTRACTION.act_request_table}
WHERE odsobject = '{self._BW_EXTRACTION.odsobject}'
AND operation = 'A' AND status = '0')
""" # nosec: B608
super().__init__(sap_bw_extraction)
def get_changelog_table(self) -> str:
"""Get the changelog table, given an odsobject.
Returns:
String to use as changelog_table.
"""
if (
self._BW_EXTRACTION.odsobject is not None
and self._BW_EXTRACTION.changelog_table is None
and self._BW_EXTRACTION.extraction_type != JDBCExtractionType.INIT.value
):
logsys_cond = self.get_logsys_cond()
prefix = self._BW_EXTRACTION.ods_prefix
odsobject = self._BW_EXTRACTION.odsobject
if self._BW_EXTRACTION.sap_bw_schema:
system_table = f"{self._BW_EXTRACTION.sap_bw_schema}.RSTSODS"
pref_table = f"{self._BW_EXTRACTION.sap_bw_schema}.RSBASIDOC"
else:
system_table = "RSTSODS"
pref_table = "RSBASIDOC"
query = f"""
(SELECT ODSNAME_TECH
FROM {system_table} o
JOIN {pref_table} p ON {logsys_cond}
AND o.ODSNAME = '{prefix}{odsobject}_' || p.tsprefix
AND USERAPP = 'CHANGELOG' AND VERSION = '000')
""" # nosec: B608
self._LOGGER.info(
f"Deriving changelog_table using the following query: {query}"
)
jdbc_args = {
"url": self._BW_EXTRACTION.url,
"table": query,
"properties": {
"user": self._BW_EXTRACTION.user,
"password": self._BW_EXTRACTION.password,
"driver": self._BW_EXTRACTION.driver,
},
}
from lakehouse_engine.io.reader_factory import ReaderFactory
changelog_df = ReaderFactory.get_data(
InputSpec(
spec_id="changelog_table",
data_format=InputFormat.JDBC.value,
read_type=ReadType.BATCH.value,
jdbc_args=jdbc_args,
)
)
changelog_tbl_nbr = changelog_df.count()
if changelog_tbl_nbr > 1:
# show the ambiguous candidates before failing, since DataFrame.show()
# only prints and returns None (it would render as 'None' in the message).
changelog_df.show(truncate=False)
raise ValueError(
f"More than one changelog table found for {odsobject}. Aborting."
)
if changelog_tbl_nbr == 0:
raise ValueError(f"No changelog table found for {odsobject}. Aborting.")
changelog_table = (
f'{self._BW_EXTRACTION.sap_bw_schema}."{changelog_df.first()[0]}"'
if self._BW_EXTRACTION.sap_bw_schema
else str(changelog_df.first()[0])
)
else:
changelog_table = (
self._BW_EXTRACTION.changelog_table
if self._BW_EXTRACTION.changelog_table
else f"{self._BW_EXTRACTION.dbtable}_cl"
)
self._LOGGER.info(f"The changelog table derived is: '{changelog_table}'")
return changelog_table
@staticmethod
def get_odsobject(input_spec_opt: dict) -> str:
"""Get the odsobject based on the provided options.
The dbtable option may also include the database name, so we split on '.'.
Moreover, the odsobject can be specified explicitly whenever it differs
from the dbtable.
Args:
input_spec_opt: options from the input_spec.
Returns:
A string with the odsobject.
"""
return str(
input_spec_opt["dbtable"].split(".")[1]
if len(input_spec_opt["dbtable"].split(".")) > 1
else input_spec_opt["dbtable"]
)
def get_logsys_cond(self) -> str:
"""Get logsys condition to join & get the tsprefix for the changelog derivation.
Usually the fallback condition (the else branch) is enough.
Returns:
The logsys condition.
"""
if self._BW_EXTRACTION.logsys:
logsys = self._BW_EXTRACTION.logsys
return f"p.slogsys = '{logsys}' AND p.rlogsys = '{logsys}'"
else:
return "p.slogsys = p.rlogsys"
def _get_init_query(self) -> Tuple[str, str]:
"""Get a query to do an init load based on a DSO on a SAP BW system.
Returns:
A query to submit to SAP BW for the initial data extraction. The query
is enclosed in parentheses so that Spark treats it as a table and supports
it in the dbtable option.
"""
if self._BW_EXTRACTION.get_timestamp_from_act_request:
# check if we are dealing with a DSO of type Write Optimised
if self._BW_EXTRACTION.dbtable == self._BW_EXTRACTION.changelog_table:
extraction_query = self._get_init_extraction_query_act_req_timestamp()
else:
raise AttributeError(
"Not able to get the extraction query. The option "
"'get_timestamp_from_act_request' is only "
"available/useful for DSOs of type Write Optimised."
)
else:
extraction_query = self._get_init_extraction_query()
predicates_query = f"""
(SELECT DISTINCT({self._BW_EXTRACTION.partition_column})
FROM {self._BW_EXTRACTION.dbtable} t)
""" # nosec: B608
return extraction_query, predicates_query
def _get_init_extraction_query(self) -> str:
"""Get extraction query based on given/current timestamp.
Returns:
A query to submit to SAP BW for the initial data extraction.
"""
changelog_tech_cols = (
f"""'0' AS request,
CAST({self._BW_EXTRACTION.extraction_timestamp} AS DECIMAL(15, 0))
AS actrequest_timestamp,
'0' AS datapakid,
0 AS partno,
0 AS record,"""
if self._BW_EXTRACTION.include_changelog_tech_cols
else f"CAST({self._BW_EXTRACTION.extraction_timestamp} "
f"AS DECIMAL(15, 0))"
f" AS actrequest_timestamp,"
)
extraction_query = f"""
(SELECT t.*,
{changelog_tech_cols}
CAST({self._BW_EXTRACTION.extraction_timestamp}
AS DECIMAL(15, 0)) AS extraction_start_timestamp
FROM {self._BW_EXTRACTION.dbtable} t
)""" # nosec: B608
return extraction_query
def _get_init_extraction_query_act_req_timestamp(self) -> str:
"""Get extraction query assuming the init timestamp from act_request table.
Returns:
A query to submit to SAP BW for the initial data extraction from
write optimised DSOs, receiving the actrequest_timestamp from
the activation requests table.
"""
extraction_query = f"""
(SELECT t.*,
act_req.timestamp as actrequest_timestamp,
CAST({self._BW_EXTRACTION.extraction_timestamp} AS DECIMAL(15, 0))
AS extraction_start_timestamp
FROM {self._BW_EXTRACTION.dbtable} t
JOIN {self._BW_EXTRACTION.act_request_table} AS act_req ON
t.request = act_req.{self._BW_EXTRACTION.request_col_name}
WHERE act_req.odsobject = '{self._BW_EXTRACTION.odsobject}'
AND operation = 'A' AND status = '0'
)""" # nosec: B608
return extraction_query
def _get_delta_query(self) -> Tuple[str, str]:
"""Get a delta query for an SAP BW DSO.
An SAP BW DSO requires a join with a special type of table often called
activation requests table, in which BW tracks down the timestamps associated
with the several data loads that were performed into BW. Because the changelog
table only contains the active request id, and that cannot be sorted by the
downstream consumers to figure out the latest change, we need to join the
changelog table with this special table to get the activation requests
timestamps to then use them to figure out the latest changes in the delta load
logic afterwards.
Additionally, we also need to know which was the latest timestamp already loaded
into the lakehouse bronze layer. The latest timestamp should always be available
in the bronze dataset itself or in a dataset that tracks down all the actrequest
timestamps that were already loaded. So we get the max value out of the
respective actrequest timestamp column in that dataset.
Returns:
A query to submit to SAP BW for the delta data extraction. The query
is enclosed in parentheses so that Spark treats it as a table and supports
it in the dbtable option.
"""
if not self._BW_EXTRACTION.min_timestamp:
from lakehouse_engine.io.reader_factory import ReaderFactory
latest_timestamp_data_df = ReaderFactory.get_data(
InputSpec(
spec_id="data_with_latest_timestamp",
data_format=self._BW_EXTRACTION.latest_timestamp_data_format,
read_type=ReadType.BATCH.value,
location=self._BW_EXTRACTION.latest_timestamp_data_location,
)
)
min_timestamp = latest_timestamp_data_df.transform(
Aggregators.get_max_value(
self._BW_EXTRACTION.latest_timestamp_input_col
)
).first()[0]
else:
min_timestamp = self._BW_EXTRACTION.min_timestamp
max_timestamp = (
self._BW_EXTRACTION.max_timestamp
if self._BW_EXTRACTION.max_timestamp
else self._get_max_timestamp(self._MAX_TIMESTAMP_QUERY)
)
if self._BW_EXTRACTION.act_req_join_condition:
join_condition = f"{self._BW_EXTRACTION.act_req_join_condition}"
else:
join_condition = (
f"changelog_tbl.request = "
f"act_req.{self._BW_EXTRACTION.request_col_name}"
)
base_query = f""" --# nosec
FROM {self._BW_EXTRACTION.changelog_table} AS changelog_tbl
JOIN {self._BW_EXTRACTION.act_request_table} AS act_req
ON {join_condition}
WHERE act_req.odsobject = '{self._BW_EXTRACTION.odsobject}'
AND act_req.timestamp > {min_timestamp}
AND act_req.timestamp <= {max_timestamp}
AND operation = 'A' AND status = '0')
"""
main_cols = f"""
(SELECT changelog_tbl.*,
act_req.TIMESTAMP AS actrequest_timestamp,
CAST({self._BW_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0))
AS extraction_start_timestamp
"""
# We join the main columns considered for the extraction with
# extra_cols_act_request that people might want to use, filtering to only
# add the comma and join the strings, in case extra_cols_act_request is
# not None or empty.
extraction_query_cols = ",".join(
filter(None, [main_cols, self._BW_EXTRACTION.extra_cols_act_request])
)
extraction_query = extraction_query_cols + base_query
predicates_query = f"""
(SELECT DISTINCT({self._BW_EXTRACTION.partition_column})
{base_query}
"""
return extraction_query, predicates_query
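# A hedged sketch of the odsobject derivation described above: the table part
# of a 'database.table' dbtable option is used unless an odsobject is given
# explicitly elsewhere. The table names below are hypothetical.
if __name__ == "__main__":
    assert SAPBWExtractionUtils.get_odsobject({"dbtable": "SAPPHA.ZOSALES"}) == "ZOSALES"
    assert SAPBWExtractionUtils.get_odsobject({"dbtable": "ZOSALES"}) == "ZOSALES"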
================================================
FILE: lakehouse_engine/utils/extraction/sftp_extraction_utils.py
================================================
"""Utilities module for SFTP extraction processes."""
import stat
from base64 import decodebytes
from datetime import datetime
from enum import Enum
from logging import Logger
from stat import S_ISREG
from typing import Any, List, Set, Tuple
import paramiko as p
from paramiko import Ed25519Key, PKey, RSAKey, Transport
from paramiko.sftp_client import SFTPAttributes, SFTPClient # type: ignore
from lakehouse_engine.transformers.exceptions import WrongArgumentsException
from lakehouse_engine.utils.logging_handler import LoggingHandler
class SFTPInputFormat(Enum):
"""Formats of algorithm input."""
CSV = "csv"
FWF = "fwf"
JSON = "json"
XML = "xml"
class SFTPExtractionFilter(Enum):
"""Standardize the types of filters we can have from a SFTP source."""
file_name_contains = "file_name_contains"
LATEST_FILE = "latest_file"
EARLIEST_FILE = "earliest_file"
GREATER_THAN = "date_time_gt"
LOWER_THAN = "date_time_lt"
class SFTPExtractionUtils(object):
"""Utils for managing data extraction from particularly relevant SFTP sources."""
_logger: Logger = LoggingHandler(__name__).get_logger()
@classmethod
def get_files_list(
cls, sftp: SFTPClient, remote_path: str, options_args: dict
) -> Set[str]:
"""Get a list of files to be extracted from SFTP.
The arguments (options_args) to list files are:
- date_time_gt(str):
Filter the files greater than the string datetime
formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS".
- date_time_lt(str):
Filter the files lower than the string datetime
formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS".
- earliest_file(bool):
Filter the earliest dated file in the directory.
- file_name_contains(str):
Filter the files whose name contains the given pattern.
- latest_file(bool):
Filter the most recent dated file in the directory.
- sub_dir(bool):
When true, the engine will search for files in subdirectories
of the remote_path.
It will consider one level below the remote_path.
When sub_dir is used with latest_file/earliest_file argument,
the engine will retrieve the latest_file/earliest_file
for each subdirectory.
Args:
sftp: the SFTP client object.
remote_path: path of files to be filtered.
options_args: options from the acon.
Returns:
A list containing the file names to be passed to Spark.
"""
all_items, folder_path = cls._get_folder_items(remote_path, sftp, options_args)
filtered_files: Set[str] = set()
try:
for item, folder in zip(all_items, folder_path):
file_contains = cls._file_has_pattern(item, options_args)
file_in_interval = cls._file_in_date_interval(item, options_args)
if file_contains and file_in_interval:
filtered_files.add(folder + item.filename)
if (
SFTPExtractionFilter.EARLIEST_FILE.value in options_args.keys()
or SFTPExtractionFilter.LATEST_FILE.value in options_args.keys()
):
filtered_files = cls._get_earliest_latest_file(
sftp, options_args, filtered_files, folder_path
)
except Exception as e:
cls._logger.error(f"SFTP list_files EXCEPTION: - {e}")
return filtered_files
@classmethod
def get_sftp_client(
cls,
options_args: dict,
) -> Tuple[SFTPClient, Transport]:
"""Get the SFTP client.
The SFTP client is used to open an SFTP session across an open
SSH Transport and perform remote file operations.
Args:
options_args: dictionary containing SFTP connection parameters.
The Paramiko arguments expected to connect are:
- "hostname": the server to connect to.
- "port": the server port to connect to.
- "username": the username to authenticate as.
- "password": used for password authentication.
- "pkey": optional - an optional public key to use for
authentication.
- "passphrase" – optional - options used for decrypting private
keys.
- "key_filename" – optional - the filename, or list of filenames,
of optional private key(s) and/or certs to try for
authentication.
- "timeout" – an optional timeout (in seconds) for the TCP connect.
- "allow_agent" – optional - set to False to disable
connecting to the SSH agent.
- "look_for_keys" – optional - set to False to disable searching
for discoverable private key files in ~/.ssh/.
- "compress" – optional - set to True to turn on compression.
- "sock" - optional - an open socket or socket-like object
to use for communication to the target host.
- "gss_auth" – optional - True if you want to use GSS-API
authentication.
- "gss_kex" – optional - Perform GSS-API Key Exchange and
user authentication.
- "gss_deleg_creds" – optional - Delegate GSS-API client
credentials or not.
- "gss_host" – optional - The targets name in the kerberos database.
- "gss_trust_dns" – optional - Indicates whether or
not the DNS is trusted to securely canonicalize the name of the
host being connected to (default True).
- "banner_timeout" – an optional timeout (in seconds)
to wait for the SSH banner to be presented.
- "auth_timeout" – an optional timeout (in seconds)
to wait for an authentication response.
- "disabled_algorithms" – an optional dict passed directly to
Transport and its keyword argument of the same name.
- "transport_factory" – an optional callable which is handed a
subset of the constructor arguments (primarily those related
to the socket, GSS functionality, and algorithm selection)
and generates a Transport instance to be used by this client.
Defaults to Transport.__init__.
The parameter to specify the host key (pkey) is expected to be in
RSA or Ed25519 format (set via key_type). Attempting a connection
without a host key is not allowed unless the argument "add_auto_policy"
is explicitly set to True.
Returns:
sftp -> a new SFTPClient session object.
transport -> the Transport for this connection.
"""
ssh_client = p.SSHClient()
try:
if not options_args.get("pkey") and not options_args.get("add_auto_policy"):
raise WrongArgumentsException(
"Get SFTP Client: No host key (pkey) was provided and the "
+ "add_auto_policy property is false."
)
if options_args.get("pkey") and not options_args.get("key_type"):
raise WrongArgumentsException(
"Get SFTP Client: The key_type must be provided when "
+ "the host key (pkey) is provided."
)
if options_args.get("pkey", None) and options_args.get("key_type", None):
key = cls._get_host_keys(
options_args.get("pkey", None), options_args.get("key_type", None)
)
ssh_client.get_host_keys().add(
hostname=f"[{options_args.get('hostname')}]:"
+ f"{options_args.get('port')}",
keytype="ssh-rsa",
key=key,
)
elif options_args.get("add_auto_policy", None):
ssh_client.load_system_host_keys()
ssh_client.set_missing_host_key_policy(p.WarningPolicy()) # nosec: B507
else:
ssh_client.load_system_host_keys()
ssh_client.set_missing_host_key_policy(p.RejectPolicy())
ssh_client.connect(
hostname=options_args.get("hostname"),
port=options_args.get("port", 22),
username=options_args.get("username", None),
password=options_args.get("password", None),
key_filename=options_args.get("key_filename", None),
timeout=options_args.get("timeout", None),
allow_agent=options_args.get("allow_agent", True),
look_for_keys=options_args.get("look_for_keys", True),
compress=options_args.get("compress", False),
sock=options_args.get("sock", None),
gss_auth=options_args.get("gss_auth", False),
gss_kex=options_args.get("gss_kex", False),
gss_deleg_creds=options_args.get("gss_deleg_creds", False),
gss_host=options_args.get("gss_host", False),
banner_timeout=options_args.get("banner_timeout", None),
auth_timeout=options_args.get("auth_timeout", None),
gss_trust_dns=options_args.get("gss_trust_dns", None),
passphrase=options_args.get("passphrase", None),
disabled_algorithms=options_args.get("disabled_algorithms", None),
transport_factory=options_args.get("transport_factory", None),
)
sftp = ssh_client.open_sftp()
transport = ssh_client.get_transport()
except ConnectionError as e:
cls._logger.error(e)
raise
return sftp, transport
@classmethod
def validate_format(cls, files_format: str) -> str:
"""Validate the file extension based on the format definitions.
Args:
files_format: a string containing the file extension.
Returns:
The string validated and formatted.
"""
formats_allowed = [
SFTPInputFormat.CSV.value,
SFTPInputFormat.FWF.value,
SFTPInputFormat.JSON.value,
SFTPInputFormat.XML.value,
]
if files_format not in formats_allowed:
raise WrongArgumentsException(
f"The formats allowed for SFTP are {formats_allowed}."
)
return files_format
@classmethod
def validate_location(cls, location: str) -> str:
"""Validate the location. Add "/" in the case it does not exist.
Args:
location: file path.
Returns:
The location validated.
"""
return location if location.rfind("/") == len(location) - 1 else location + "/"
@classmethod
def _file_has_pattern(cls, item: SFTPAttributes, options_args: dict) -> bool:
"""Check if a file follows the pattern used for filtering.
Args:
item: item available in SFTP directory.
options_args: options from the acon.
Returns:
A boolean telling whether the file contains a pattern or not.
"""
file_to_consider = True
if SFTPExtractionFilter.file_name_contains.value in options_args.keys():
if not (
options_args.get(SFTPExtractionFilter.file_name_contains.value)
in item.filename
and (S_ISREG(item.st_mode) or cls._is_compressed(item.filename))
):
file_to_consider = False
return file_to_consider
@classmethod
def _file_in_date_interval(
cls,
item: SFTPAttributes,
options_args: dict,
) -> bool:
"""Check if the file is in the expected date interval.
The logic is applied based on the arguments greater_than and lower_than.
i.e:
- if greater_than and lower_than have values,
then it performs a between.
- if only lower_than has values,
then only values lower than the input value will be retrieved.
- if only greater_than has values,
then only values greater than the input value will be retrieved.
Args:
item: item available in SFTP directory.
options_args: options from the acon.
Returns:
A boolean telling whether the file is in the expected date interval or not.
"""
file_to_consider = True
if (
SFTPExtractionFilter.LOWER_THAN.value in options_args.keys()
or SFTPExtractionFilter.GREATER_THAN.value in options_args.keys()
) and (S_ISREG(item.st_mode) or cls._is_compressed(item.filename)):
lower_than = options_args.get(
SFTPExtractionFilter.LOWER_THAN.value, "9999-12-31"
)
greater_than = options_args.get(
SFTPExtractionFilter.GREATER_THAN.value, "1900-01-01"
)
file_date = datetime.fromtimestamp(item.st_mtime)
if not (
(
lower_than == greater_than
and cls._validate_date(greater_than)
<= file_date
<= cls._validate_date(lower_than)
)
or (
cls._validate_date(greater_than)
< file_date
< cls._validate_date(lower_than)
)
):
file_to_consider = False
return file_to_consider
@classmethod
def _get_earliest_latest_file(
cls,
sftp: SFTPClient,
options_args: dict,
list_filter_files: Set[str],
folder_path: List,
) -> Set[str]:
"""Get the earliest or latest file of a directory.
Args:
sftp: the SFTP client object.
options_args: options from the acon.
list_filter_files: set of file names to filter from.
folder_path: the location of files.
Returns:
A set containing the earliest/latest file name.
"""
list_earl_lat_files: Set[str] = set()
for folder in folder_path:
file_date = 0
file_name = ""
all_items, _ = cls._get_folder_items(f"{folder}", sftp, options_args)
for item in all_items:
if (
folder + item.filename in list_filter_files
and (S_ISREG(item.st_mode) or cls._is_compressed(item.filename))
and (
(
options_args.get("earliest_file")
and (file_date == 0 or item.st_mtime < file_date)
)
or (
options_args.get("latest_file")
and (file_date == 0 or item.st_mtime > file_date)
)
)
):
file_date = item.st_mtime
file_name = folder + item.filename
list_earl_lat_files.add(file_name)
return list_earl_lat_files
@classmethod
def _get_folder_items(
cls, remote_path: str, sftp: SFTPClient, options_args: dict
) -> Tuple:
"""Get the files and the directory to be processed.
Args:
remote_path: root folder path.
sftp: a SFTPClient session object.
options_args: options from the acon.
Returns:
A tuple with a list of items (file object) and a list of directories.
"""
sub_dir = options_args.get("sub_dir", False)
all_items: List[SFTPAttributes] = sftp.listdir_attr(remote_path)
items: List[SFTPAttributes] = []
folders: List = []
for item in all_items:
is_dir = stat.S_ISDIR(item.st_mode)
if is_dir and sub_dir and not item.filename.endswith((".gz", ".zip")):
dirs = sftp.listdir_attr(f"{remote_path}{item.filename}")
for file in dirs:
items.append(file)
folders.append(f"{remote_path}{item.filename}/")
else:
items.append(item)
folders.append(remote_path)
return items, folders
@classmethod
def _get_host_keys(cls, pkey: str, key_type: str) -> PKey:
"""Get the pkey that will be added to the server.
Args:
pkey: a string with a host key value.
key_type: the type of key (rsa or ed25519).
Returns:
A PKey that will be used to authenticate the connection.
"""
key: RSAKey | Ed25519Key = None
if pkey and key_type.lower() == "rsa":
b_pkey = bytes(pkey, "UTF-8")
key = p.RSAKey(data=decodebytes(b_pkey))
elif pkey and key_type.lower() == "ed25519":
b_pkey = bytes(pkey, "UTF-8")
key = p.Ed25519Key(data=decodebytes(b_pkey))
return key
@classmethod
def _is_compressed(cls, filename: str) -> Any:
"""Validate if it is a compressed file.
Args:
filename: name of the file to be validated.
Returns:
A boolean with the result.
"""
return filename.endswith((".gz", ".zip"))
@classmethod
def _validate_date(cls, date_text: str) -> datetime:
"""Validate the input date format.
Args:
date_text: a string with the date or datetime value.
The expected formats are:
YYYY-MM-DD and YYYY-MM-DD HH:MM:SS
Returns:
The datetime validated and formatted.
"""
for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S"):
try:
if date_text is not None:
return datetime.strptime(date_text, fmt)
except ValueError:
pass
raise ValueError(
"Incorrect data format, should be YYYY-MM-DD or YYYY-MM-DD HH:MM:SS."
)
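# Usage sketch (illustrative only): hostname, credentials and remote_path below
# are placeholder values, and a reachable SFTP server is assumed. It shows how
# the filter options documented in get_files_list can be combined in the acon
# options to pick the files to extract.
if __name__ == "__main__":
    options = {
        "hostname": "sftp.example.com",
        "port": 22,
        "username": "my_user",
        "password": "my_password",
        "add_auto_policy": True,  # accept unknown host keys; prefer pkey/key_type in production
        "file_name_contains": "sales_",  # only files whose name contains this string
        "date_time_gt": "2024-01-01",  # only files modified after this date
        "latest_file": True,  # keep only the most recent matching file per folder
        "sub_dir": True,  # also look one level below the remote path
    }
    sftp, transport = SFTPExtractionUtils.get_sftp_client(options)
    try:
        files = SFTPExtractionUtils.get_files_list(sftp, "/inbound/", options)
        print(files)
    finally:
        sftp.close()
        transport.close()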
================================================
FILE: lakehouse_engine/utils/file_utils.py
================================================
"""Utilities for file name based operations."""
import re
from os import listdir
from typing import List
def get_file_names_without_file_type(
path: str, file_type: str, exclude_regex: str
) -> list:
"""Function to retrieve list of file names in a folder.
This function filters by file type and removes the extension of the file name
it returns.
Args:
path: path to the folder to list files
file_type: type of the file to include in list
exclude_regex: regex of file names to exclude
Returns:
A list of file names without file type.
"""
file_list: List[str] = []
for file in listdir(path):
if not re.search(exclude_regex, file) and file.endswith(file_type):
file_list.append(file.split(".")[0])
return file_list
def get_directory_path(path: str) -> str:
"""Add '/' to the end of the path of a directory.
Args:
path: directory to be processed
Returns:
Directory path stripped and with '/' at the end.
"""
path = path.strip()
return path if path[-1] == "/" else path + "/"
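# Usage sketch (the folder and regex below are made-up examples): list SQL file
# names without their extension, skipping names that match the exclusion regex,
# and normalise a directory path so it ends with "/".
if __name__ == "__main__":
    sql_files = get_file_names_without_file_type(
        path="sql/my_use_case",  # hypothetical folder containing 1_query.sql, 2_query.sql, ...
        file_type=".sql",
        exclude_regex=r"^__",  # e.g., ignore helper files starting with "__"
    )
    print(sql_files)  # e.g., ['1_query', '2_query']
    print(get_directory_path("sql/my_use_case"))  # -> 'sql/my_use_case/'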
================================================
FILE: lakehouse_engine/utils/gab_utils.py
================================================
"""Module to define GAB Utility classes."""
import ast
import calendar
import json
from datetime import datetime
from typing import Optional
import pendulum
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, struct, to_json
from lakehouse_engine.core.definitions import GABCadence, GABDefaults
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.logging_handler import LoggingHandler
class GABUtils(object):
"""Class containing utility functions for GAB."""
_LOGGER = LoggingHandler(__name__).get_logger()
def logger(
self,
run_start_time: datetime,
run_end_time: datetime,
start: str,
end: str,
query_id: str,
query_label: str,
cadence: str,
stage_file_path: str,
query: str,
status: str,
error_message: Exception | str,
target_database: str,
) -> None:
"""Store the execution of each stage in the log events table.
Args:
run_start_time: execution start time.
run_end_time: execution end time.
start: use case start date.
end: use case end date.
query_id: gab configuration table use case identifier.
query_label: gab configuration table use case name.
cadence: cadence to process.
stage_file_path: stage file path.
query: query to execute.
status: status of the query execution.
error_message: error message if present.
target_database: target database to write.
"""
ins = """
INSERT INTO {database}.gab_log_events
VALUES (
'{run_start_time}',
'{run_end_time}',
'{start}',
'{end}',
{query_id},
'{query_label}',
'{cadence}',
'{stage_file_path}',
'{query}',
'{status}',
'{error_message}'
)""".format( # nosec: B608
database=target_database,
run_start_time=run_start_time,
run_end_time=run_end_time,
start=start,
end=end,
query_id=query_id,
query_label=query_label,
cadence=cadence,
stage_file_path=stage_file_path,
query=self._escape_quote(query),
status=status,
error_message=(
self._escape_quote(str(error_message))
if status == "Failed"
else error_message
),
)
ExecEnv.SESSION.sql(ins)
@classmethod
def _escape_quote(cls, to_escape: str) -> str:
"""Escape quote on string.
Args:
to_escape: string to escape.
"""
return to_escape.replace("'", r"\'").replace('"', r"\"")
@classmethod
def get_json_column_as_dict(
cls, lookup_query_builder: DataFrame, query_id: str, query_column: str
) -> dict: # type: ignore
"""Get JSON column as dictionary.
Args:
lookup_query_builder: gab configuration data.
query_id: gab configuration table use case identifier.
query_column: column to get as json.
"""
column_df = lookup_query_builder.filter(
col("query_id") == lit(query_id)
).select(col(query_column))
column_df_json = column_df.select(
to_json(struct([column_df[x] for x in column_df.columns]))
).collect()[0][0]
json_column = json.loads(column_df_json)
for mapping in json_column.values():
column_as_json = ast.literal_eval(mapping)
return column_as_json # type: ignore
@classmethod
def extract_columns_from_mapping(
cls,
columns: dict,
is_dimension: bool,
extract_column_without_alias: bool = False,
table_alias: Optional[str] = None,
is_extracted_value_as_name: bool = True,
) -> tuple[list[str], list[str]] | list[str]:
"""Extract and transform columns to SQL select statement.
Args:
columns: data to extract the columns.
is_dimension: flag identifying if is a dimension or a metric.
extract_column_without_alias: flag to inform if it's to extract columns
without aliases.
table_alias: name or alias from the source table.
is_extracted_value_as_name: identify if the extracted value is the
column name.
"""
column_with_alias = (
"".join([table_alias, ".", "{} as {}"]) if table_alias else "{} as {}"
)
column_without_alias = (
"".join([table_alias, ".", "{}"]) if table_alias else "{}"
)
extracted_columns_with_alias = []
extracted_columns_without_alias = []
for column_name, column_value in columns.items():
if extract_column_without_alias:
extracted_column_without_alias = column_without_alias.format(
cls._get_column_format_without_alias(
is_dimension,
column_name,
column_value,
is_extracted_value_as_name,
)
)
extracted_columns_without_alias.append(extracted_column_without_alias)
extracted_column_with_alias = column_with_alias.format(
*cls._extract_column_with_alias(
is_dimension,
column_name,
column_value,
is_extracted_value_as_name,
)
)
extracted_columns_with_alias.append(extracted_column_with_alias)
return (
(extracted_columns_with_alias, extracted_columns_without_alias)
if extract_column_without_alias
else extracted_columns_with_alias
)
@classmethod
def _extract_column_with_alias(
cls,
is_dimension: bool,
column_name: str,
column_value: str | dict,
is_extracted_value_as_name: bool = True,
) -> tuple[str, str]:
"""Extract column name with alias.
Args:
is_dimension: flag indicating if the column is a dimension.
column_name: name of the column.
column_value: value of the column.
is_extracted_value_as_name: flag indicating if the name of the column is the
extracted value.
"""
extracted_value = (
column_value
if is_dimension
else (column_value["metric_name"]) # type: ignore
)
return (
(extracted_value, column_name) # type: ignore
if is_extracted_value_as_name
else (column_name, extracted_value)
)
@classmethod
def _get_column_format_without_alias(
cls,
is_dimension: bool,
column_name: str,
column_value: str | dict,
is_extracted_value_as_name: bool = True,
) -> str:
"""Extract column name without alias.
Args:
is_dimension: flag indicating if the column is a dimension.
column_name: name of the column.
column_value: value of the column.
is_extracted_value_as_name: flag indicating if the name of the column is the
extracted value.
"""
extracted_value: str = (
column_value
if is_dimension
else (column_value["metric_name"]) # type: ignore
)
return extracted_value if is_extracted_value_as_name else column_name
@classmethod
def get_cadence_configuration_at_end_date(cls, end_date: datetime) -> dict:
"""A dictionary that corresponds to the conclusion of a cadence.
Any end date inputted by the user we check this end date is actually end of
a cadence (YEAR, QUARTER, MONTH, WEEK).
If the user input is 2024-03-31 this is a month end and a quarter end that
means any use cases configured as month or quarter need to be calculated.
Args:
end_date: base end date.
"""
init_end_date_dict = {}
expected_end_cadence_date = pendulum.datetime(
int(end_date.strftime("%Y")),
int(end_date.strftime("%m")),
int(end_date.strftime("%d")),
).replace(tzinfo=None)
# Validating YEAR cadence
if end_date == expected_end_cadence_date.last_of("year"):
init_end_date_dict["YEAR"] = "N"
# Validating QUARTER cadence
if end_date == expected_end_cadence_date.last_of("quarter"):
init_end_date_dict["QUARTER"] = "N"
# Validating MONTH cadence
if end_date == datetime(
int(end_date.strftime("%Y")),
int(end_date.strftime("%m")),
calendar.monthrange(
int(end_date.strftime("%Y")), int(end_date.strftime("%m"))
)[1],
):
init_end_date_dict["MONTH"] = "N"
# Validating WEEK cadence
if end_date == expected_end_cadence_date.end_of("week").replace(
hour=0, minute=0, second=0, microsecond=0
):
init_end_date_dict["WEEK"] = "N"
init_end_date_dict["DAY"] = "N"
return init_end_date_dict
def get_reconciliation_cadences(
self,
cadence: str,
selected_reconciliation_window: dict,
cadence_configuration_at_end_date: dict,
rerun_flag: str,
) -> dict:
"""Get reconciliation cadences based on the use case configuration.
Args:
cadence: cadence to process.
selected_reconciliation_window: configured use case reconciliation window.
cadence_configuration_at_end_date: cadences to execute at the end date.
rerun_flag: flag indicating if it's a rerun or a normal run.
"""
configured_cadences = self._get_configured_cadences_by_snapshot(
cadence, selected_reconciliation_window, cadence_configuration_at_end_date
)
return self._get_cadences_to_execute(
configured_cadences, cadence, cadence_configuration_at_end_date, rerun_flag
)
@classmethod
def _get_cadences_to_execute(
cls,
configured_cadences: dict,
cadence: str,
cadence_configuration_at_end_date: dict,
rerun_flag: str,
) -> dict:
"""Get cadences to execute.
Args:
cadence: cadence to process.
configured_cadences: configured use case reconciliation window.
cadence_configuration_at_end_date: cadences to execute at the end date.
rerun_flag: flag indicating if it's a rerun or a normal run.
"""
cadences_to_execute = {}
cad_order = GABCadence.get_ordered_cadences()
for snapshot_cadence, snapshot_flag in configured_cadences.items():
if (
(cad_order[cadence] > cad_order[snapshot_cadence])
and (rerun_flag == "Y")
) or snapshot_cadence in cadence_configuration_at_end_date:
cadences_to_execute[snapshot_cadence] = snapshot_flag
elif snapshot_cadence not in cadence_configuration_at_end_date:
continue
return cls._sort_cadences_to_execute(cadences_to_execute, cad_order)
@classmethod
def _sort_cadences_to_execute(
cls, cadences_to_execute: dict, cad_order: dict
) -> dict:
"""Sort the cadences to execute.
Args:
cadences_to_execute: cadences to execute.
cad_order: all cadences with order.
"""
# ordering it because grouping cadences with and without snapshot
# can impact the cadence ordering.
sorted_cadences_to_execute: dict = dict(
sorted(
cadences_to_execute.items(),
key=lambda item: cad_order.get(item[0]), # type: ignore
)
)
# ordering cadences to execute from the biggest (YEAR) to the smallest (DAY)
cadences_to_execute_items = []
for cadence_name, cadence_value in sorted_cadences_to_execute.items():
cadences_to_execute_items.append((cadence_name, cadence_value))
cadences_sorted_by_bigger_cadence_to_execute: dict = dict(
reversed(cadences_to_execute_items)
)
return cadences_sorted_by_bigger_cadence_to_execute
@classmethod
def _get_configured_cadences_by_snapshot(
cls,
cadence: str,
selected_reconciliation_window: dict,
cadence_configuration_at_end_date: dict,
) -> dict:
"""Get configured cadences to execute.
Args:
cadence: selected cadence.
selected_reconciliation_window: configured use case reconciliation window.
cadence_configuration_at_end_date: cadences to execute at the end date.
Returns:
Each cadence with the corresponding information if it's to execute with
snapshot or not.
"""
cadences_by_snapshot = {}
(
no_snapshot_cadences,
snapshot_cadences,
) = cls._generate_reconciliation_by_snapshot(
cadence, selected_reconciliation_window
)
for snapshot_cadence, snapshot_flag in no_snapshot_cadences.items():
if snapshot_cadence in cadence_configuration_at_end_date:
cadences_by_snapshot[snapshot_cadence] = snapshot_flag
cls._LOGGER.info(f"{snapshot_cadence} is present in {cadence} cadence")
break
cadences_by_snapshot.update(snapshot_cadences)
if (not cadences_by_snapshot) and (
cadence in cadence_configuration_at_end_date
):
cadences_by_snapshot[cadence] = "N"
return cadences_by_snapshot
@classmethod
def _generate_reconciliation_by_snapshot(
cls, cadence: str, selected_reconciliation_window: dict
) -> tuple[dict, dict]:
"""Generate reconciliation by snapshot.
Args:
cadence: cadence to process.
selected_reconciliation_window: configured use case reconciliation window.
"""
cadence_snapshot_configuration = {cadence: "N"}
for cadence in GABCadence.get_cadences():
cls._add_cadence_snapshot_to_cadence_snapshot_config(
cadence, selected_reconciliation_window, cadence_snapshot_configuration
)
cadence_snapshot_configuration = dict(
sorted(
cadence_snapshot_configuration.items(),
key=(
lambda item: GABCadence.get_ordered_cadences().get( # type: ignore
item[0]
)
),
)
)
cadence_snapshot_configuration = dict(
reversed(list(cadence_snapshot_configuration.items()))
)
cadences_without_snapshot = {
key: value
for key, value in cadence_snapshot_configuration.items()
if value == "N"
}
cadences_with_snapshot = {
key: value
for key, value in cadence_snapshot_configuration.items()
if value == "Y"
}
return cadences_with_snapshot, cadences_without_snapshot
@classmethod
def _add_cadence_snapshot_to_cadence_snapshot_config(
cls,
cadence: str,
selected_reconciliation_window: dict,
cadence_snapshot_configuration: dict,
) -> None:
"""Add the selected reconciliation to cadence snapshot configuration.
Args:
cadence: selected cadence.
selected_reconciliation_window: configured use case reconciliation window.
cadence_snapshot_configuration: cadence snapshot configuration dictionary
which will be updated with the new value.
"""
if cadence in selected_reconciliation_window:
cadence_snapshot_configuration[cadence] = selected_reconciliation_window[
cadence
]["snapshot"]
@classmethod
def format_datetime_to_default(cls, date_to_format: datetime) -> str:
"""Format datetime to GAB default format.
Args:
date_to_format: date to format.
"""
return datetime.date(date_to_format).strftime(GABDefaults.DATE_FORMAT.value)
class GABPartitionUtils(object):
"""Class to extract a partition based in a date period."""
_LOGGER = LoggingHandler(__name__).get_logger()
@classmethod
def get_years(cls, start_date: str, end_date: str) -> list[str]:
"""Return a list of distinct years from the input parameters.
Args:
start_date: start of the period.
end_date: end of the period.
"""
year = []
if start_date > end_date:
raise ValueError(
"Input Error: Invalid start_date and end_date. "
"Start_date is greater than end_date"
)
for i in range(int(start_date[0:4]), int(end_date[0:4]) + 1):
year.append(str(i))
return year
@classmethod
def get_partition_condition(cls, start_date: str, end_date: str) -> str:
"""Return year,month and day partition statement from the input parameters.
Args:
start_date: start of the period.
end_date: end of the period.
"""
years = cls.get_years(start_date, end_date)
if len(years) > 1:
partition_condition = cls._get_multiple_years_partition(
start_date, end_date, years
)
else:
partition_condition = cls._get_single_year_partition(start_date, end_date)
return partition_condition
@classmethod
def _get_multiple_years_partition(
cls, start_date: str, end_date: str, years: list[str]
) -> str:
"""Return partition when executing multiple years (>1).
Args:
start_date: start of the period.
end_date: end of the period.
years: list of years.
"""
start_date_month = cls._extract_date_part_from_date("MONTH", start_date)
start_date_day = cls._extract_date_part_from_date("DAY", start_date)
end_date_month = cls._extract_date_part_from_date("MONTH", end_date)
end_date_day = cls._extract_date_part_from_date("DAY", end_date)
year_statement = "(year = {0} and (".format(years[0]) + "{})"
if start_date_month != "12":
start_date_partition = year_statement.format(
"(month = {0} and day between {1} and 31)".format(
start_date_month, start_date_day
)
+ " or (month between {0} and 12)".format(int(start_date_month) + 1)
)
else:
start_date_partition = year_statement.format(
"month = {0} and day between {1} and 31".format(
start_date_month, start_date_day
)
)
period_years_partition = ""
if len(years) == 3:
period_years_partition = ") or (year = {0}".format(years[1])
elif len(years) > 3:
period_years_partition = ") or (year between {0} and {1})".format(
years[1], years[-2]
)
if end_date_month != "01":
end_date_partition = (
") or (year = {0} and ((month between 01 and {1})".format(
years[-1], int(end_date_month) - 1
)
+ " or (month = {0} and day between 1 and {1})))".format(
end_date_month, end_date_day
)
)
else:
end_date_partition = (
") or (year = {0} and month = 1 and day between 01 and {1})".format(
years[-1], end_date_day
)
)
partition_condition = (
start_date_partition + period_years_partition + end_date_partition
)
return partition_condition
@classmethod
def _get_single_year_partition(cls, start_date: str, end_date: str) -> str:
"""Return partition when executing a single year.
Args:
start_date: start of the period.
end_date: end of the period.
"""
start_date_year = cls._extract_date_part_from_date("YEAR", start_date)
start_date_month = cls._extract_date_part_from_date("MONTH", start_date)
start_date_day = cls._extract_date_part_from_date("DAY", start_date)
end_date_year = cls._extract_date_part_from_date("YEAR", end_date)
end_date_month = cls._extract_date_part_from_date("MONTH", end_date)
end_date_day = cls._extract_date_part_from_date("DAY", end_date)
if start_date_month != end_date_month:
months = []
for i in range(int(start_date_month), int(end_date_month) + 1):
months.append(i)
start_date_partition = (
"year = {0} and ((month={1} and day between {2} and 31)".format(
start_date_year, months[0], start_date_day
)
)
period_years_partition = ""
if len(months) == 2:
period_years_partition = start_date_partition
elif len(months) == 3:
period_years_partition = (
start_date_partition + " or (month = {0})".format(months[1])
)
elif len(months) > 3:
period_years_partition = (
start_date_partition
+ " or (month between {0} and {1})".format(months[1], months[-2])
)
partition_condition = (
period_years_partition
+ " or (month = {0} and day between 1 and {1}))".format(
end_date_month, end_date_day
)
)
else:
partition_condition = (
"year = {0} and month = {1} and day between {2} and {3}".format(
end_date_year, end_date_month, start_date_day, end_date_day
)
)
return partition_condition
@classmethod
def _extract_date_part_from_date(cls, part: str, date: str) -> str:
"""Extract date part from string date.
Args:
part: date part (possible values: DAY, MONTH, YEAR)
date: string date.
"""
if "DAY" == part.upper():
return date[8:10]
elif "MONTH" == part.upper():
return date[5:7]
else:
return date[0:4]
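# Usage sketch (illustrative only; the dates are arbitrary examples): check
# which cadences conclude at a given end date and build the year/month/day
# partition predicate for a period.
if __name__ == "__main__":
    # 2024-03-31 is both a month end and a quarter end, so MONTH and QUARTER
    # (plus DAY, and WEEK when the date is also a week end) show up here.
    print(GABUtils.get_cadence_configuration_at_end_date(datetime(2024, 3, 31)))
    # Produces a predicate roughly like:
    # "year = 2024 and ((month=1 and day between 01 and 31)
    #  or (month = 2) or (month = 3 and day between 1 and 31))"
    print(GABPartitionUtils.get_partition_condition("2024-01-01", "2024-03-31"))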
================================================
FILE: lakehouse_engine/utils/logging_handler.py
================================================
"""Module to configure project logging."""
import logging
import re
FORMATTER = logging.Formatter("%(asctime)s — %(name)s — %(levelname)s — %(message)s")
SENSITIVE_KEYS_REG = [
{ # Enclosed in ''.
# Stops replacing when it finds comma and space, space or end of line.
"regex": r"'(kafka\.ssl\.keystore\.password|kafka\.ssl\.truststore\.password"
r"|password|secret|credential|credentials|pass|key)'[ ]*:"
r"[ ]*'.*?(, | |}|$)",
"replace": "'masked_cred': '******', ",
},
{ # Enclosed in "".
# Stops replacing when it finds comma and space, space or end of line.
"regex": r'"(kafka\.ssl\.keystore\.password|kafka\.ssl\.truststore\.password'
r'|password|secret|credential|credentials|pass|key)"[ ]*:'
r'[ ]*".*?(, | |}|$)',
"replace": '"masked_cred": "******", ',
},
{ # Not enclosed in '' or "".
# Stops replacing when it finds comma and space, space or end of line.
"regex": r"(kafka\.ssl\.keystore\.password|kafka\.ssl\.truststore\.password"
r"|password|secret|credential|credentials|pass|key)[ ]*:"
r"[ ]*.*?(, | |}|$)",
"replace": "masked_cred: ******, ",
},
]
class FilterSensitiveData(logging.Filter):
"""Logging filter to hide sensitive data from being shown in the logs."""
def filter(self, record: logging.LogRecord) -> bool: # noqa: A003
"""Hide sensitive information from being shown in the logs.
Based on the configured regex and replace strings, the content of the log
records is replaced and then all the records are allowed to be logged
(return True).
Args:
record: the LogRecord event being logged.
Returns:
True, so that the (transformed) record is always logged.
"""
for key_reg in SENSITIVE_KEYS_REG:
record.msg = re.sub(key_reg["regex"], key_reg["replace"], str(record.msg))
return True
class LoggingHandler(object):
"""Handle the logging of the lakehouse engine project."""
def __init__(self, class_name: str):
"""Construct a LoggingHandler instance.
Args:
class_name: name of the class to be indicated in the logs.
"""
self._logger: logging.Logger = logging.getLogger(class_name)
self._logger.setLevel(logging.DEBUG)
self._logger.addFilter(FilterSensitiveData())
lsh = logging.StreamHandler()
lsh.setLevel(logging.DEBUG)
lsh.setFormatter(FORMATTER)
if not self._logger.hasHandlers():
# avoid keep adding handlers and therefore duplicate messages
self._logger.addHandler(lsh)
def get_logger(self) -> logging.Logger:
"""Get the _logger instance variable.
Returns:
logging.Logger: the logger object.
"""
return self._logger
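# Quick demonstration (illustrative only): any message containing keys such as
# "password" or "secret" is masked by FilterSensitiveData before it reaches the
# log output, so credentials never show up in plain text.
if __name__ == "__main__":
    demo_logger = LoggingHandler("masking_demo").get_logger()
    # The password value is replaced by "'masked_cred': '******'" in the logged message.
    demo_logger.info({"user": "etl_job", "password": "super-secret", "retries": 3})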
================================================
FILE: lakehouse_engine/utils/rest_api.py
================================================
"""Module to handle REST API operations."""
import time
from enum import Enum
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from lakehouse_engine.utils.logging_handler import LoggingHandler
LOG = LoggingHandler(__name__).get_logger()
DEFAULT_CONTENT_TYPE = "application/json"
class RestMethods(Enum):
"""Methods for REST API calls."""
POST = "POST"
PUT = "PUT"
ALLOWED_METHODS = ["POST", "PUT"]
class RestStatusCodes(Enum):
"""REST Status Code."""
RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
OK_STATUS_CODES = [200]
class RESTApiException(requests.RequestException):
"""Class representing any possible REST API Exception."""
def __init__(self, message: str) -> None:
"""Construct RESTApiException instances.
Args:
message: message to display on exception event.
"""
super().__init__(message)
def get_basic_auth(username: str, password: str) -> requests.auth.HTTPBasicAuth:
"""Get the basic authentication object to authenticate REST requests.
Args:
username: username.
password: password.
Returns:
requests.auth.HTTPBasicAuth: the HTTPBasicAuth object.
"""
return requests.auth.HTTPBasicAuth(username, password)
def get_configured_session(
sleep_seconds: float = 0.2,
total_retries: int = 5,
backoff_factor: int = 2,
retry_status_codes: list = None,
allowed_methods: list = None,
protocol: str = "https://",
) -> requests.Session:
"""Get a configured requests Session with exponential backoff.
Args:
sleep_seconds: seconds to sleep before each request to avoid rate limits.
total_retries: number of times to retry.
backoff_factor: factor for the exponential backoff.
retry_status_codes: list of status codes that trigger a retry.
allowed_methods: http methods that are allowed for retry.
protocol: http:// or https://.
Returns:
requests.Session: the configured session.
"""
retry_status_codes = (
retry_status_codes
if retry_status_codes
else RestStatusCodes.RETRY_STATUS_CODES.value
)
allowed_methods = (
allowed_methods if allowed_methods else RestMethods.ALLOWED_METHODS.value
)
time.sleep(sleep_seconds)
session = requests.Session()
retries = Retry(
total=total_retries,
backoff_factor=backoff_factor,
status_forcelist=retry_status_codes,
allowed_methods=allowed_methods,
)
session.mount(protocol, HTTPAdapter(max_retries=retries))
return session
def execute_api_request(
method: str,
url: str,
headers: dict = None,
basic_auth_dict: dict = None,
json: dict = None,
files: dict = None,
sleep_seconds: float = 0.2,
) -> requests.Response:
"""Execute a REST API request.
Args:
method: REST method (e.g., POST or PUT).
url: url of the api.
headers: request headers.
basic_auth_dict: basic http authentication details
(e.g., {"username": "x", "password": "y"}).
json: json payload to send in the request.
files: files payload to send in the request.
sleep_seconds: for how many seconds to sleep to avoid error 429.
Returns:
response from the HTTP request.
"""
basic_auth: requests.auth.HTTPBasicAuth = None
if basic_auth_dict:
basic_auth = get_basic_auth(
basic_auth_dict["username"], basic_auth_dict["password"]
)
return get_configured_session(sleep_seconds=sleep_seconds).request(
method=method,
url=url,
headers=headers,
auth=basic_auth,
json=json,
files=files,
)
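# Usage sketch (the URL and credentials are placeholders, not a real service):
# POST a JSON payload with basic authentication, relying on the configured
# session to retry on 429/5xx responses with exponential backoff.
if __name__ == "__main__":
    response = execute_api_request(
        method=RestMethods.POST.value,
        url="https://api.example.com/v1/items",  # hypothetical endpoint
        headers={"Content-Type": DEFAULT_CONTENT_TYPE},
        basic_auth_dict={"username": "svc_user", "password": "svc_password"},
        json={"name": "demo", "value": 42},
        sleep_seconds=0.5,
    )
    if response.status_code in RestStatusCodes.OK_STATUS_CODES.value:
        LOG.info(f"Request succeeded: {response.json()}")
    else:
        raise RESTApiException(f"Request failed with status {response.status_code}")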
================================================
FILE: lakehouse_engine/utils/schema_utils.py
================================================
"""Utilities to facilitate dataframe schema management."""
from logging import Logger
from typing import Any, List, Optional
from pyspark.sql.functions import col
from pyspark.sql.types import StructType
from lakehouse_engine.core.definitions import InputSpec
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.storage.file_storage_functions import FileStorageFunctions
class SchemaUtils(object):
"""Schema utils that help retrieve and manage schemas of dataframes."""
_logger: Logger = LoggingHandler(__name__).get_logger()
@staticmethod
def from_file(file_path: str, disable_dbfs_retry: bool = False) -> StructType:
"""Get a spark schema from a file (spark StructType json file) in a file system.
Args:
file_path: path of the file in a file system. [Check here](
https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/StructType.html).
disable_dbfs_retry: optional flag to disable file storage dbfs.
Returns:
Spark schema struct type.
"""
return StructType.fromJson(
FileStorageFunctions.read_json(file_path, disable_dbfs_retry)
)
@staticmethod
def from_file_to_dict(file_path: str, disable_dbfs_retry: bool = False) -> Any:
"""Get a dict with the spark schema from a file in a file system.
Args:
file_path: path of the file in a file system. [Check here](
https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/StructType.html).
disable_dbfs_retry: optional flag to disable file storage dbfs.
Returns:
Spark schema in a dict.
"""
return FileStorageFunctions.read_json(file_path, disable_dbfs_retry)
@staticmethod
def from_dict(struct_type: dict) -> StructType:
"""Get a spark schema from a dict.
Args:
struct_type: dict containing a spark schema structure. [Check here](
https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/StructType.html).
Returns:
Spark schema struct type.
"""
return StructType.fromJson(struct_type)
@staticmethod
def from_table_schema(table: str) -> StructType:
"""Get a spark schema from a table.
Args:
table: table name from which to inherit the schema.
Returns:
Spark schema struct type.
"""
return ExecEnv.SESSION.read.table(table).schema
@classmethod
def from_input_spec(cls, input_spec: InputSpec) -> Optional[StructType]:
"""Get a spark schema from an input specification.
This covers scenarios where the schema is provided as part of the input
specification of the algorithm. Schema can come from the table specified in the
input specification (enforce_schema_from_table) or by the dict with the spark
schema provided there also.
Args:
input_spec: input specification.
Returns:
spark schema struct type.
"""
if input_spec.enforce_schema_from_table:
cls._logger.info(
f"Reading schema from table: {input_spec.enforce_schema_from_table}"
)
return SchemaUtils.from_table_schema(input_spec.enforce_schema_from_table)
elif input_spec.schema_path:
cls._logger.info(f"Reading schema from file: {input_spec.schema_path}")
return SchemaUtils.from_file(
input_spec.schema_path, input_spec.disable_dbfs_retry
)
elif input_spec.schema:
cls._logger.info(
f"Reading schema from configuration file: {input_spec.schema}"
)
return SchemaUtils.from_dict(input_spec.schema)
else:
cls._logger.info("No schema was provided... skipping enforce schema")
return None
@staticmethod
def _get_prefix_alias(num_chars: int, prefix: str, shorten_names: bool) -> str:
"""Get prefix alias for a field."""
return (
f"""{'_'.join(
[item[:num_chars] for item in prefix.split('.')]
)}_"""
if shorten_names
else f"{prefix}_".replace(".", "_")
)
@staticmethod
def schema_flattener(
schema: StructType,
prefix: str = None,
level: int = 1,
max_level: int = None,
shorten_names: bool = False,
alias: bool = True,
num_chars: int = 7,
ignore_cols: List = None,
) -> List:
"""Recursive method to flatten the schema of the dataframe.
Args:
schema: schema to be flattened.
prefix: prefix of the struct to get the value for. Only relevant
for being used in the internal recursive logic.
level: level of the depth in the schema being flattened. Only relevant
for being used in the internal recursive logic.
max_level: level until which you want to flatten the schema. Default: None.
shorten_names: whether to shorten the names of the prefixes of the fields
being flattened or not. Default: False.
alias: whether to define alias for the columns being flattened or
not. Default: True.
num_chars: number of characters to consider when shortening the names of
the fields. Default: 7.
ignore_cols: columns which you don't want to flatten. Default: None.
Returns:
A list of flattened columns to be used in a select statement.
"""
cols = []
ignore_cols = ignore_cols if ignore_cols else []
for field in schema.fields:
name = prefix + "." + field.name if prefix else field.name
field_type = field.dataType
if (
isinstance(field_type, StructType)
and name not in ignore_cols
and (max_level is None or level <= max_level)
):
cols += SchemaUtils.schema_flattener(
schema=field_type,
prefix=name,
level=level + 1,
max_level=max_level,
shorten_names=shorten_names,
alias=alias,
num_chars=num_chars,
ignore_cols=ignore_cols,
)
else:
if alias and prefix:
prefix_alias = SchemaUtils._get_prefix_alias(
num_chars, prefix, shorten_names
)
cols.append(col(name).alias(f"{prefix_alias}{field.name}"))
else:
cols.append(col(name))
return cols
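# Usage sketch (requires a local pyspark installation; the sample dataframe and
# column names are made-up): flatten a nested struct column and select the
# flattened columns with prefixed aliases.
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder.master("local[1]")
        .appName("schema_flattener_demo")
        .getOrCreate()
    )
    df = spark.createDataFrame(
        [(1, ("john", "doe"))],
        "id INT, name STRUCT<first: STRING, last: STRING>",
    )
    flat_cols = SchemaUtils.schema_flattener(df.schema)
    # Produces the columns id, name_first and name_last.
    df.select(flat_cols).show()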
================================================
FILE: lakehouse_engine/utils/sharepoint_utils.py
================================================
"""Utilities for sharepoint API operations."""
from __future__ import annotations
import os
import shutil
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Generator, List, cast
import requests
from pyspark.sql import DataFrame
from requests import RequestException
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
from lakehouse_engine.core.definitions import SharepointFile
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.io.exceptions import SharePointAPIError
from lakehouse_engine.utils.logging_handler import LoggingHandler
_logger = LoggingHandler(__name__).get_logger()
class SharepointUtils(object):
"""Class with methods to connect and extract data from Sharepoint."""
def __init__(
self,
client_id: str,
tenant_id: str,
local_path: str,
api_version: str,
site_name: str,
drive_name: str,
file_name: str,
secret: str,
folder_relative_path: str = None,
chunk_size: int = 5 * 1024 * 1024, # 5 MB
local_options: dict = None,
conflict_behaviour: str = "replace",
file_pattern: str = None,
file_type: str = None,
):
"""Instantiate objects of the SharepointUtils class.
Args:
client_id: application (client) ID of your Azure AD app.
tenant_id: tenant ID (directory ID) from Azure AD for authentication.
local_path: local directory path (Volume) where the files are temporarily
stored.
api_version: Graph API version to use.
site_name: name of the Sharepoint site where the files are stored.
drive_name: name of the document library or drive in Sharepoint.
file_name: name of the file to be stored in sharepoint.
secret: client secret for authentication.
folder_relative_path: optional; relative path within the
drive(drive_name) where the file will be stored.
chunk_size: Optional; size of file chunks to be uploaded/downloaded
in bytes (default is 5 MB).
local_options: Optional; additional options for customizing write
action to local path.
conflict_behaviour: Optional; defines how conflicts in file uploads are
handled('replace', 'fail', etc.).
file_pattern: Optional; pattern to match files in Sharepoint (e.g.,
'data_*').
file_type: Optional; type of the file to be stored in Sharepoint (e.g.,
'csv').
Returns:
A SharepointUtils object.
"""
self.client_id = client_id
self.tenant_id = tenant_id
self.local_path = local_path
self.api_version = api_version
self.site_name = site_name
self.drive_name = drive_name
self.file_name = file_name
self.secret = secret
self.folder_relative_path = folder_relative_path
self.chunk_size = chunk_size
self.local_options = local_options
self.conflict_behaviour = conflict_behaviour
self.site_id = None
self.drive_id = None
self.token = None
self.file_pattern = file_pattern
self.file_type = file_type
self._create_app()
def _get_token(self) -> None:
"""Fetch and store a valid access token for Sharepoint API."""
try:
self.token = self.app.acquire_token_for_client(
scopes=[f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/.default"]
)
except Exception as err:
_logger.error(f"Token acquisition error: {err}")
def _create_app(self) -> None:
"""Create an MSAL (Microsoft Authentication Library) instance.
This is used to handle authentication and authorization with Azure AD.
"""
import msal
self.app = msal.ConfidentialClientApplication(
client_id=self.client_id,
authority=f"{ExecEnv.ENGINE_CONFIG.sharepoint_authority}/{self.tenant_id}",
client_credential=self.secret,
)
self._get_token()
@retry(
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=30, min=30, max=150),
retry=retry_if_exception_type(
(RequestException, SharePointAPIError)
), # Retry on these exceptions
)
def _make_request(
self,
endpoint: str,
method: str = "GET",
headers: dict = None,
json_options: dict = None,
data: object = None,
stream: bool = False,
) -> requests.Response:
"""Execute API requests to Microsoft Graph API.
!!! note
If you try to upload large files sequentially, you may encounter
a 503 "serviceNotAvailable" error. To mitigate this, consider using
coalesce in the Acon transform specification. However, be aware that
increasing the number of partitions also increases the likelihood of
server throttling.
Args:
endpoint: The API endpoint to call.
headers: A dictionary containing the necessary headers.
json_options: Optional; JSON data to include in the request body.
method: The HTTP method to use ('GET', 'POST', 'PUT', etc.).
data: Optional; additional data (e.g., file content) on request body.
Returns:
A Response object from the request library.
Raises:
SharePointAPIError: If there is an issue with the Sharepoint
API request.
"""
self._get_token()
# Required to avoid cicd issue
if not self.token or "access_token" not in self.token:
raise SharePointAPIError("Authentication token is missing or invalid.")
try:
if "access_token" in self.token:
response = requests.request(
method=method,
url=endpoint,
headers=(
headers
if headers
else {"Authorization": "Bearer " + self.token["access_token"]}
),
json=json_options,
data=data,
stream=stream,
)
return response
except RequestException as error:
raise SharePointAPIError(f"{error}")
def _parse_json(self, response: requests.Response, context: str) -> Dict[str, Any]:
"""Parse JSON response and raise on errors.
Args:
response: HTTP response object.
context: Operation context for error logging.
Returns:
Parsed JSON as a dictionary.
Raises:
HTTPError: If the request fails.
ValueError: If the response is not valid JSON.
"""
try:
response.raise_for_status()
except requests.HTTPError as e:
_logger.error(
"HTTP error while %s: %s | body: %s", context, e, response.text[:200]
)
raise
try:
data = response.json()
if not isinstance(data, dict):
raise ValueError(f"Expected dict JSON while {context}")
return data
except (requests.JSONDecodeError, ValueError):
_logger.error(
"Non-JSON or wrong type while %s. Body preview: %s",
context,
response.text[:200],
)
raise
def _get_site_id(self) -> str:
"""Get site ID from site name, with caching.
Returns:
Site ID as a string.
Raises:
SharepointAPIError: If the request fails.
RuntimeError: For unexpected errors or missing site ID.
"""
if self.site_id is not None:
return self.site_id
endpoint = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}"
f"/sites/{ExecEnv.ENGINE_CONFIG.sharepoint_company_domain}:/"
f"sites/{self.site_name}"
)
try:
response = self._make_request(endpoint=endpoint)
response_data = self._parse_json(
response, f"getting site id for site '{self.site_name}'"
)
self.site_id = response_data.get("id")
if not self.site_id:
raise ValueError(
f"Site ID not found for site '{self.site_name}' in the API "
f"response: {response_data}"
)
return self.site_id
except RequestException as error:
raise SharePointAPIError(f"{error}")
except Exception as e:
raise RuntimeError(
f"Unexpected error while reading site ID for site '{self.site_name}':"
f"{e}"
)
def _get_drive_id(self) -> str:
"""Get drive ID from site ID and drive name, with caching.
Returns:
Drive ID as a string.
Raises:
SharepointAPIError: If the request fails.
ValueError: If no drive is found.
"""
if self.drive_id is not None:
return str(self.drive_id)
site_id = self._get_site_id()
endpoint = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/"
f"{self.api_version}/sites/{site_id}/drives"
)
try:
response = self._make_request(endpoint=endpoint)
response_data = self._parse_json(response, "listing drives for site")
drives = response_data.get("value", [])
if not drives:
raise ValueError(f"No drives found for site '{self.site_id}'.")
for drive in drives:
if self.drive_name.strip().lower() == drive["name"].strip().lower():
drive_id = drive["id"]
self.drive_id = drive_id
return str(drive_id)
raise ValueError(
f"Drive '{self.drive_name}' could not be found in site '{site_id}'."
)
except RequestException as error:
raise SharePointAPIError(f"Request error: {error}")
def check_if_endpoint_exists(
self, folder_root_path: str = None, raise_error: bool = True
) -> bool:
"""Check if a Sharepoint drive or folder exists.
Args:
folder_root_path: Optional folder path to check.
raise_error: Raise error if the folder doesn't exist.
Returns:
True if the endpoint exists, False otherwise.
Raises:
SharepointAPIError: If the endpoint doesn't exist and raise_error is True.
"""
try:
site_id = self._get_site_id()
drive_id = self._get_drive_id()
if not folder_root_path:
return True
endpoint = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/"
f"{self.api_version}/sites/{site_id}/drives/{drive_id}"
f"/root:/{folder_root_path}"
)
response = self._make_request(endpoint=endpoint)
response.raise_for_status()
return True
except requests.HTTPError as error:
if error.response.status_code == 404:
_logger.warning(f"Sharepoint path doesn't exist: {folder_root_path}")
if raise_error:
raise SharePointAPIError(
f"Path '{folder_root_path}' doesn't exist!"
)
return False
raise
def check_if_local_path_exists(self, local_path: str) -> None:
"""Verify that a local path exists.
Args:
local_path: Local folder where files are temporarily stored.
Raises:
SharePointAPIError: If the path cannot be read.
"""
try:
os.listdir(local_path)
except IOError as error:
raise SharePointAPIError(f"{error}")
def save_to_staging_area(self, sp_file: SharepointFile) -> str:
"""Save a Sharepoint file locally (direct write or streaming).
If the file is under the threshold and already loaded in memory, write its
content directly.
Otherwise, download the file via streaming to avoid memory overload.
Args:
sp_file: File metadata and content.
Returns:
Local file path.
Raises:
SharePointAPIError: On download or write failure.
"""
try:
if sp_file.content and sp_file.content_size < (500 * 1024 * 1024):
_logger.info(
f"Writing '{sp_file.file_name}' via direct write (under 500MB)."
)
return self.write_bytes_to_local_file(sp_file)
_logger.info(
f"Writing '{sp_file.file_name}' via streaming (500MB+ or content not"
f" loaded)."
)
return self.download_file_streaming(sp_file)
except Exception as e:
raise SharePointAPIError(f"Failed to write '{sp_file.file_name}': {e}")
def download_file_streaming(self, sp_file: SharepointFile) -> str:
"""Download a large file from Sharepoint in chunks to a local path.
Uses the configured chunk size to avoid memory overload with large files.
Args:
sp_file: File with remote path and name.
Returns:
Local file path.
Raises:
SharePointAPIError: If the download fails.
"""
try:
site_id = self._get_site_id()
drive_id = self._get_drive_id()
url = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/root:/{sp_file.file_path}:/content"
)
local_file_path = Path(self.local_path) / sp_file.file_name
local_file_path.parent.mkdir(parents=True, exist_ok=True)
with self._make_request(endpoint=url, stream=True) as response:
response.raise_for_status()
with open(local_file_path, "wb") as file:
for chunk in response.iter_content(chunk_size=self.chunk_size):
if chunk:
file.write(chunk)
return str(local_file_path)
except requests.RequestException as error:
raise SharePointAPIError(f"Failed to stream download: {error}")
def write_bytes_to_local_file(self, sp_file: SharepointFile) -> str:
"""Write Sharepoint file content (bytes) to a local path.
Args:
sp_file: File with content and metadata.
Returns:
Local file path.
Raises:
ValueError: If content is missing.
RuntimeError: If writing to disk fails.
"""
if not sp_file.content:
raise ValueError(
f"Cannot write file '{sp_file.file_name}': Content is empty."
)
try:
# Local base path (e.g., Unity Volumes, DBFS, or other mounted storage)
local_base_path = Path(self.local_path)
local_base_path.mkdir(parents=True, exist_ok=True)
file_path = local_base_path / sp_file.file_name
file_path.write_bytes(sp_file.content)
return str(file_path)
except Exception as e:
raise RuntimeError(
f"Failed to write file '{sp_file.file_name}' to Unity Volume: {e}"
)
def write_to_local_path(self, df: DataFrame) -> None:
"""Write a Spark DataFrame to a local path (Volume) in CSV format.
This method writes the provided Spark DataFrame to a specified local directory,
saving it in CSV format. The method renames the output file from its default
"part-*" naming convention to a specified file name.
The dictionary local_options enables the customisation of the write action.
The customizable options can be found here:
https://spark.apache.org/docs/3.5.1/sql-data-sources-csv.html.
Args:
df: The Spark DataFrame to write to the local file system.
Returns:
None.
Raises:
IOError: If there is an issue during the file writing process.
"""
try:
df.coalesce(1).write.mode("overwrite").save(
path=self.local_path,
format="csv",
**self.local_options if self.local_options else {},
)
self._rename_local_file(self.local_path, self.file_name)
except IOError as error:
raise SharePointAPIError(f"{error}")
def _rename_local_file(self, local_path: str, file_name: str) -> None:
"""Rename a local file that starts with 'part-' to the desired file name.
Args:
local_path: The directory where the file is located.
file_name: The new file name for the local file.
"""
files_in_dir = os.listdir(local_path)
part_file = [f for f in files_in_dir if f.startswith("part-")][0]
try:
os.rename(
os.path.join(local_path, part_file), os.path.join(local_path, file_name)
)
except IOError as error:
raise SharePointAPIError(f"{error}")
def write_to_sharepoint(self) -> None:
"""Upload a local file to Sharepoint in chunks using the Microsoft Graph API.
This method creates an upload session and uploads a local CSV file to a
Sharepoint document library.
The file is divided into chunks (based on the `chunk_size` specified)
to handle large file uploads and sent sequentially using the upload URL
returned from the Graph API.
The method uses instance attributes such as `api_domain`, `api_version`,
`site_name`, `drive_name`, `folder_relative_path`, and `file_name` to
construct the necessary API calls and upload the file to the specified
location in Sharepoint.
Returns:
None.
Raises:
SharePointAPIError: If an error occurs during any stage of the upload
(e.g., failure to create the upload session, issues during chunk upload).
"""
drive_id = self._get_drive_id()
if self.folder_relative_path:
endpoint = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}"
f"/{self.api_version}/drives/{drive_id}/items/root:"
f"/{self.folder_relative_path}/{self.file_name}.csv:"
f"/createUploadSession"
)
else:
endpoint = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}"
f"/{self.api_version}/drives/{drive_id}/items/root:"
f"/{self.file_name}.csv:/createUploadSession"
)
response = self._make_request(method="POST", endpoint=endpoint)
response.raise_for_status()
upload_session = response.json()
upload_url = upload_session["uploadUrl"]
upload_file = str(Path(self.local_path) / self.file_name)
stat = os.stat(upload_file)
size = stat.st_size
with open(upload_file, "rb") as data:
start = 0
while start < size:
chunk = data.read(self.chunk_size)
bytes_read = len(chunk)
upload_range = f"bytes {start}-{start + bytes_read - 1}/{size}"
headers = {
"Content-Length": str(bytes_read),
"Content-Range": upload_range,
}
response = self._make_request(
method="PUT", endpoint=upload_url, headers=headers, data=chunk
)
response.raise_for_status()
start += bytes_read
def delete_local_path(self) -> None:
"""Delete and recreate the local path used for temporary storage.
Raises:
SharePointAPIError: If deletion or recreation fails.
"""
try:
local_path = Path(self.local_path)
if local_path.exists():
shutil.rmtree(local_path)
local_path.mkdir(parents=True, exist_ok=True)
except Exception as e:
raise SharePointAPIError(f"Failed to clear or recreate local path: {e}")
@contextmanager
def staging_area(self) -> Generator[str, None, None]:
"""Provide a clean local staging folder for Sharepoint files.
Yields the local path after ensuring it is empty, and cleans up after use.
Yields:
Path to the staging folder as a string.
"""
self.delete_local_path()
try:
yield self.local_path
finally:
try:
self.delete_local_path()
except Exception as e:
_logger.warning(f"Failed to clean up local path: {e}")
def list_items_in_path(self, path: str) -> list[Any]:
"""List items (files/folders) at a Sharepoint path.
Args:
path: Relative folder or file path.
Returns:
List of items; files include @microsoft.graph.downloadUrl.
Raises:
ValueError: If the path is invalid or not found.
"""
site_id = self._get_site_id()
drive_id = self._get_drive_id()
path = path.strip("/")
if not path:
resp = self._make_request(
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/root/children"
)
data = self._parse_json(resp, "listing root children")
return cast(List[dict[str, Any]], data.get("value", []))
path_parts = path.split("/")
# start from root children
resp = self._make_request(
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/sites/"
f"{site_id}/drives/{drive_id}/root/children"
)
data = self._parse_json(resp, "listing root children")
items = cast(List[dict[str, Any]], data.get("value", []))
for component in path_parts:
current_item = next(
(item for item in items if item.get("name") == component), None
)
if not current_item:
raise ValueError(f"Path component '{component}' not found in '{path}'.")
if "folder" in current_item:
# descend into folder
resp = self._make_request(
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/items/"
f"{current_item['id']}/children"
)
data = self._parse_json(resp, f"listing children for '{component}'")
items = cast(List[dict[str, Any]], data.get("value", []))
else:
# it's a file; ensure we have downloadUrl
if "@microsoft.graph.downloadUrl" not in current_item:
resp = self._make_request(
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/"
f"{self.api_version}/sites/{site_id}/drives/{drive_id}/"
f"items/{current_item['id']}"
)
current_item = self._parse_json(
resp, f"fetching file metadata for item id {current_item['id']}"
)
return [current_item]
return items
def get_file_metadata(self, file_path: str) -> SharepointFile:
"""Fetch file metadata and content from Sharepoint.
Args:
file_path: Full Sharepoint path (e.g., 'folder/file.csv').
Returns:
SharepointFile with metadata and bytes content.
Raises:
ValueError: If required metadata is missing or path is invalid.
requests.HTTPError: On HTTP errors during retrieval.
"""
site_id = self._get_site_id()
drive_id = self._get_drive_id()
file_metadata_url = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/"
f"{self.api_version}/sites/{site_id}/drives/{drive_id}/root:/{file_path}"
)
# Get metadata
metadata_response = self._make_request(endpoint=file_metadata_url, method="GET")
metadata = self._parse_json(
metadata_response,
f"fetching metadata for '{file_path}'",
)
file_name = metadata.get("name")
time_created = metadata.get("createdDateTime", "")
time_modified = metadata.get("lastModifiedDateTime", "")
download_url = metadata.get("@microsoft.graph.downloadUrl")
if not file_name or not download_url:
raise ValueError(
f"Missing required metadata for '{file_path}': "
f"name={file_name!r}, "
f"downloadUrl={'present' if download_url else 'absent'}"
)
# Download file content (bytes)
content_response = self._make_request(endpoint=download_url, method="GET")
content_response.raise_for_status()
file_content = content_response.content
if "/" not in file_path:
raise ValueError(
f"Invalid file path: '{file_path}'. Expected a folder/file structure."
)
folder = file_path.rsplit("/", 1)[0]
return SharepointFile(
file_name=file_name,
time_created=time_created,
time_modified=time_modified,
content=file_content,
_folder=folder,
)
def archive_sharepoint_file(
self, sp_file: SharepointFile, to_path: str | None, *, move_enabled: bool = True
) -> None:
"""Rename (timestamp) and optionally move a Sharepoint file.
Args:
sp_file: File to archive.
to_path: Destination folder (if moving).
move_enabled: Whether to move after rename.
Raises:
SharePointAPIError: If the request fails.
"""
# If already archived (renamed+moved before), don't repeat
if getattr(sp_file, "_already_archived", False) and move_enabled and to_path:
_logger.info(
"Skipping archive: file already archived -> %s", sp_file.file_name
)
return
try:
if not getattr(sp_file, "skip_rename", False):
new_file_name = self._rename_sharepoint_file(sp_file)
sp_file.file_name = new_file_name
sp_file.skip_rename = True
if not move_enabled or not to_path:
_logger.info(
"Archiving disabled or no target folder; "
"renamed only and left in place: '%s'.",
sp_file.file_path,
)
return
self._move_file_in_sharepoint(sp_file, to_path)
sp_file._already_archived = True
_logger.info("Archived '%s' to '%s'.", sp_file.file_name, to_path)
except requests.RequestException as e:
_logger.error(
"Request failed while archiving '%s': %s", sp_file.file_name, e
)
raise SharePointAPIError(f"Request failed: {e}")
def _rename_sharepoint_file(self, sp_file: SharepointFile) -> str:
"""Prefix file name with a timestamp (skip if already renamed).
Args:
sp_file: File to rename.
Returns:
New file name.
Raises:
SharePointAPIError: If the rename request fails.
"""
try:
if getattr(sp_file, "skip_rename", False):
_logger.info(
f"Skipping rename for already-prefixed file: {sp_file.file_name}"
)
return sp_file.file_name
_logger.info(f"Renaming file at '{sp_file.file_path}'.")
site_id = self._get_site_id()
drive_id = self._get_drive_id()
current_date_formatted = datetime.now().strftime("%Y%m%d%H%M%S")
new_file_name = f"{current_date_formatted}_{sp_file.file_name}"
url_get_file = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/root:/{sp_file.file_path}"
)
resp = self._make_request(endpoint=url_get_file, method="GET")
file_info = self._parse_json(
resp, f"fetching file info at '{sp_file.file_path}'"
)
file_id = file_info.get("id")
if not file_id:
raise ValueError(
f"File '{sp_file.file_name}' not found in '{sp_file.file_path}'."
)
url_rename_file = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/items/{file_id}"
)
rename_payload = {"name": new_file_name}
rename_resp = self._make_request(
endpoint=url_rename_file, method="PATCH", json_options=rename_payload
)
rename_resp.raise_for_status()
_logger.info(f"File '{sp_file.file_name}' renamed to '{new_file_name}'.")
sp_file.file_name = new_file_name
return new_file_name
except requests.RequestException as e:
_logger.error(
f"Request failed while renaming file '{sp_file.file_name}': {e}"
)
raise SharePointAPIError(f"Request failed: {e}")
def _move_file_in_sharepoint(self, sp_file: SharepointFile, to_path: str) -> None:
"""Move a file to another folder in Sharepoint.
Args:
sp_file: File to move.
to_path: Destination path.
Raises:
ValueError: If the file ID cannot be resolved.
SharePointAPIError: If the move request fails.
"""
try:
_logger.info(
f"Moving file '{sp_file.file_name}' from '{sp_file.file_path}' to "
f"'{to_path}'."
)
site_id = self._get_site_id()
drive_id = self._get_drive_id()
if not self.check_if_endpoint_exists(
folder_root_path=to_path, raise_error=False
):
self._create_folder_in_sharepoint(to_path)
# Create the folder if it doesn't exist; raise_error = false so it
# doesn't throw error
_logger.info(f"Created archive folder: {to_path}")
url_get_file = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/root:/{sp_file.file_path}"
)
response = self._make_request(endpoint=url_get_file, method="GET")
file_info = self._parse_json(
response,
f"getting file id for move '{sp_file.file_path}'",
)
file_id = file_info.get("id")
if not file_id:
raise ValueError(
f"File '{sp_file.file_name}' not found in '{sp_file.file_path}'."
)
url_move_file = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/items/{file_id}"
)
new_parent_reference = {
"parentReference": {"path": f"/drive/root:/{to_path}"},
"name": sp_file.file_name,
}
response = self._make_request(
endpoint=url_move_file,
method="PATCH",
json_options=new_parent_reference,
)
response.raise_for_status()
_logger.info(
f"File '{sp_file.file_name}' successfully moved to '{to_path}'."
)
except requests.RequestException as e:
_logger.error(
f"Request failed while moving file '{sp_file.file_name}': {e}"
)
raise SharePointAPIError(f"Request failed: {e}")
def _create_folder_in_sharepoint(self, folder_path: str) -> None:
"""Create the final folder in a Sharepoint path.
Args:
folder_path: Full folder path to create.
Raises:
SharePointAPIError: If folder creation fails.
"""
try:
site_id = self._get_site_id()
drive_id = self._get_drive_id()
parent_path, folder_name = (
folder_path.rsplit("/", 1) if "/" in folder_path else ("", folder_path)
)
parent_path = parent_path.strip("/") # Clean path just in case
_logger.info(
f"Creating folder '{folder_name}' inside '{parent_path or 'root'}'"
)
if parent_path:
endpoint = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/root:/{parent_path}:/children"
)
else:
endpoint = (
f"{ExecEnv.ENGINE_CONFIG.sharepoint_api_domain}/{self.api_version}/"
f"sites/{site_id}/drives/{drive_id}/root/children"
)
folder_metadata = {"name": folder_name, "folder": {}}
response = self._make_request(
endpoint=endpoint, method="POST", json_options=folder_metadata
)
response.raise_for_status()
_logger.info(f"Folder '{folder_path}' created successfully.")
except requests.RequestException as e:
_logger.error(f"Failed to create folder '{folder_path}': {e}")
raise SharePointAPIError(f"Error creating folder '{folder_path}': {e}")
================================================
FILE: lakehouse_engine/utils/spark_utils.py
================================================
"""Utilities to facilitate spark dataframe management."""
from pyspark.sql import DataFrame
from lakehouse_engine.core.exec_env import ExecEnv
class SparkUtils(object):
"""Spark utils that help retrieve and manage dataframes."""
@staticmethod
def create_temp_view(
df: DataFrame, view_name: str, return_prefix: bool = False
) -> None | str:
"""Create a temporary view from a dataframe.
If the execution environment is serverless, it creates a temporary view,
otherwise it creates a global temporary view.
Serverless environments don't support global temporary views, so we need to
create a temporary view in that case; it is still accessible from other
queries in the same session.
In non-serverless environments, we create a global temporary view to make
sure it is accessible from other sessions as well.
Args:
df: dataframe to create the view from.
view_name: name of the view to create.
return_prefix: whether to return the prefix to use in queries
for this view or not.
Returns:
None or the prefix to use in queries for this view, depending on the
value of return_prefix.
"""
if ExecEnv.IS_SERVERLESS:
df.createOrReplaceTempView(view_name)
prefix = ""
else:
df.createOrReplaceGlobalTempView(view_name)
prefix = "global_temp."
if return_prefix:
return prefix
return None
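A minimal usage sketch of the utility above (hedged: it assumes `ExecEnv.SESSION` has already been initialized, and the dataframe and view name are just examples), showing how the returned prefix is meant to be used when building queries against the view:
```python
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.spark_utils import SparkUtils

# Any Spark DataFrame created in the current session.
df = ExecEnv.SESSION.createDataFrame([(1, "shoes")], ["article_id", "category"])
# The prefix is "" on serverless and "global_temp." otherwise, so the same
# query text works in both environments.
prefix = SparkUtils.create_temp_view(df, "my_articles_view", return_prefix=True)
result_df = ExecEnv.SESSION.sql(f"SELECT count(*) FROM {prefix}my_articles_view")
```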
================================================
FILE: lakehouse_engine/utils/sql_parser_utils.py
================================================
"""Module to parse sql files."""
from lakehouse_engine.core.definitions import SQLParser
class SQLParserUtils(object):
"""Parser utilities class."""
def split_sql_commands(
self,
sql_commands: str,
delimiter: str,
advanced_parser: bool,
) -> list[str]:
"""Read the sql commands of a file to choose how to split them.
Args:
sql_commands: commands to be split.
delimiter: delimiter to split the sql commands.
advanced_parser: boolean to define if we need to use a complex split.
Returns:
List with the sql commands.
"""
if advanced_parser:
self.sql_commands: str = sql_commands
self.delimiter: str = delimiter
self.separated_sql_commands: list[str] = []
self.split_index: int = 0
return self._split_sql_commands()
else:
return sql_commands.split(delimiter)
def _split_sql_commands(self) -> list[str]:
"""Read the sql commands of a file to split them based on a delimiter.
Returns:
List with the sql commands.
"""
single_quotes: int = 0
double_quotes: int = 0
one_line_comment: int = 0
multiple_line_comment: int = 0
for index, char in enumerate(self.sql_commands):
if char == SQLParser.SINGLE_QUOTES.value and self._character_validation(
value=[double_quotes, one_line_comment, multiple_line_comment]
):
single_quotes = self._update_value(
value=single_quotes,
condition=self._character_validation(
value=self._get_substring(first_char=index - 1, last_char=index)
),
operation="+-",
)
elif char == SQLParser.DOUBLE_QUOTES.value and self._character_validation(
value=[single_quotes, one_line_comment, multiple_line_comment]
):
double_quotes = self._update_value(
value=double_quotes,
condition=self._character_validation(
value=self._get_substring(first_char=index - 1, last_char=index)
),
operation="+-",
)
elif char == SQLParser.SINGLE_TRACE.value and self._character_validation(
value=[double_quotes, single_quotes, multiple_line_comment]
):
one_line_comment = self._update_value(
value=one_line_comment,
condition=(
self._get_substring(first_char=index, last_char=index + 2)
== SQLParser.DOUBLE_TRACES.value
),
operation="+",
)
elif (
char == SQLParser.SLASH.value or char == SQLParser.STAR.value
) and self._character_validation(
value=[double_quotes, single_quotes, one_line_comment]
):
multiple_line_comment = self._update_value(
value=multiple_line_comment,
condition=self._get_substring(first_char=index, last_char=index + 2)
in SQLParser.MULTIPLE_LINE_COMMENT.value,
operation="+-",
)
one_line_comment = self._update_value(
value=one_line_comment,
condition=char == SQLParser.PARAGRAPH.value,
operation="-",
)
self._validate_command_is_closed(
index=index,
dependencies=self._character_validation(
value=[
single_quotes,
double_quotes,
one_line_comment,
multiple_line_comment,
]
),
)
return self.separated_sql_commands
def _get_substring(self, first_char: int = None, last_char: int = None) -> str:
"""Get the substring based on the indexes passed as arguments.
Args:
first_char: represents the first index of the string.
last_char: represents the last index of the string.
Returns:
The substring based on the indexes passed as arguments.
"""
return self.sql_commands[first_char:last_char]
def _validate_command_is_closed(self, index: int, dependencies: int) -> None:
"""Validate based on the delimiter if we have the closing of a sql command.
Args:
index: index of the character in a string.
dependencies: indicates (as an int/bool) whether we are outside of quotes and comments.
"""
if (
self._get_substring(first_char=index, last_char=index + len(self.delimiter))
== self.delimiter
and dependencies
):
self._add_new_command(
sql_command=self._get_substring(
first_char=self.split_index, last_char=index
)
)
self.split_index = index + len(self.delimiter)
if self._get_substring(
first_char=index, last_char=index + len(self.delimiter)
) != self.delimiter and index + len(self.delimiter) == len(self.sql_commands):
self._add_new_command(
sql_command=self._get_substring(
first_char=self.split_index, last_char=len(self.sql_commands)
)
)
def _character_validation(self, value: str | list) -> bool:
"""Validate if character is the opening/closing/inside of a comment.
Args:
value: represent the value associated to different validated
types or a character to be analyzed.
Returns:
Boolean that indicates if character found is the opening
or closing of a comment, is inside of quotes, comments,...
"""
if isinstance(value, list):
return sum(value) == 0
else:
return value != SQLParser.BACKSLASH.value
def _add_new_command(self, sql_command: str) -> None:
"""Add a newly found command to list of sql commands to execute.
Args:
sql_command: command to be added to list.
"""
self.separated_sql_commands.append(str(sql_command))
def _update_value(self, value: int, operation: str, condition: bool = False) -> int:
"""Update value associated to different types of comments or quotes.
Args:
value: value to be updated
operation: operation that we want to perform on the value.
condition: validate if we have a condition associated to the value.
Returns:
An integer that represents the updated value.
"""
if condition and operation == "+-":
value = value + 1 if value == 0 else value - 1
elif condition and operation == "+":
value = value + 1 if value == 0 else value
elif condition and operation == "-":
value = value - 1 if value == 1 else value
return value
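A small usage sketch of the parser above (hedged: the SQL content is illustrative), showing how the splitter could be invoked for statements separated by semicolons, where delimiters inside quotes or comments must not trigger a split:
```python
from lakehouse_engine.utils.sql_parser_utils import SQLParserUtils

sql_file_content = (
    "INSERT INTO t VALUES ('a;b'); -- trailing comment with ; inside\n"
    "SELECT * FROM t;"
)
# advanced_parser=True tracks quotes and comments, so only real statement
# terminators are used as split points.
commands = SQLParserUtils().split_sql_commands(
    sql_commands=sql_file_content, delimiter=";", advanced_parser=True
)
```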
================================================
FILE: lakehouse_engine/utils/storage/__init__.py
================================================
"""Utilities to interact with storage systems."""
================================================
FILE: lakehouse_engine/utils/storage/dbfs_storage.py
================================================
"""Module to represent a DBFS file storage system."""
from typing import Any
from urllib.parse import ParseResult, urlunparse
from lakehouse_engine.utils.databricks_utils import DatabricksUtils
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.storage.file_storage import FileStorage
class DBFSStorage(FileStorage):
"""Class to represent a DBFS file storage system."""
_LOGGER = LoggingHandler(__name__).get_logger()
_MAX_INT = 2147483647
@classmethod
def get_file_payload(cls, url: ParseResult) -> Any:
"""Get the content of a file.
Args:
url: url of the file.
Returns:
File payload/content.
"""
from lakehouse_engine.core.exec_env import ExecEnv
str_url = urlunparse(url)
cls._LOGGER.info(f"Trying with dbfs_storage: Reading from file: {str_url}")
return DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs.head(
str_url, cls._MAX_INT
)
@classmethod
def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
"""Write payload into a file.
Args:
url: url of the file.
content: content to write into the file.
"""
from lakehouse_engine.core.exec_env import ExecEnv
str_url = urlunparse(url)
cls._LOGGER.info(f"Trying with dbfs_storage: Writing into file: {str_url}")
DatabricksUtils.get_db_utils(ExecEnv.SESSION).fs.put(str_url, content, True)
================================================
FILE: lakehouse_engine/utils/storage/file_storage.py
================================================
"""Module for abstract representation of a storage system holding files."""
from abc import ABC, abstractmethod
from typing import Any
from urllib.parse import ParseResult
class FileStorage(ABC):
"""Abstract file storage class."""
@classmethod
@abstractmethod
def get_file_payload(cls, url: ParseResult) -> Any:
"""Get the payload of a file.
Args:
url: url of the file.
Returns:
File payload/content.
"""
pass
@classmethod
@abstractmethod
def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
"""Write payload into a file.
Args:
url: url of the file.
content: content to write into the file.
"""
pass
================================================
FILE: lakehouse_engine/utils/storage/file_storage_functions.py
================================================
"""Module for common file storage functions."""
import json
from abc import ABC
from typing import Any
from urllib.parse import ParseResult, urlparse
import boto3
from lakehouse_engine.utils.storage.dbfs_storage import DBFSStorage
from lakehouse_engine.utils.storage.local_fs_storage import LocalFSStorage
from lakehouse_engine.utils.storage.s3_storage import S3Storage
class FileStorageFunctions(ABC): # noqa: B024
"""Class for common file storage functions."""
@classmethod
def read_json(cls, path: str, disable_dbfs_retry: bool = False) -> Any:
"""Read a json file.
The file should be in a supported file system (e.g., s3, dbfs or
local filesystem).
Args:
path: path to the json file.
disable_dbfs_retry: optional flag to disable the DBFS fallback and read directly from S3.
Returns:
Dict with json file content.
"""
url = urlparse(path, allow_fragments=False)
if disable_dbfs_retry:
return json.load(S3Storage.get_file_payload(url))
elif url.scheme == "s3" and cls.is_boto3_configured():
try:
return json.load(S3Storage.get_file_payload(url))
except Exception:
return json.loads(DBFSStorage.get_file_payload(url))
elif url.scheme == "file":
return json.load(LocalFSStorage.get_file_payload(url))
elif url.scheme in ["dbfs", "s3"]:
return json.loads(DBFSStorage.get_file_payload(url))
else:
raise NotImplementedError(
f"File storage protocol not implemented for {path}."
)
@classmethod
def read_sql(cls, path: str, disable_dbfs_retry: bool = False) -> Any:
"""Read a sql file.
The file should be in a supported file system (e.g., s3, dbfs or local
filesystem).
Args:
path: path to the sql file.
disable_dbfs_retry: optional flag to disable the DBFS fallback and read directly from S3.
Returns:
Content of the SQL file.
"""
url = urlparse(path, allow_fragments=False)
if disable_dbfs_retry:
return S3Storage.get_file_payload(url).read().decode("utf-8")
elif url.scheme == "s3" and cls.is_boto3_configured():
try:
return S3Storage.get_file_payload(url).read().decode("utf-8")
except Exception:
return DBFSStorage.get_file_payload(url)
elif url.scheme == "file":
return LocalFSStorage.get_file_payload(url).read()
elif url.scheme in ["dbfs", "s3"]:
return DBFSStorage.get_file_payload(url)
else:
raise NotImplementedError(
f"Object storage protocol not implemented for {path}."
)
@classmethod
def write_payload(
cls, path: str, url: ParseResult, content: str, disable_dbfs_retry: bool = False
) -> None:
"""Write payload into a file.
The file should be in a supported file system (e.g., s3, dbfs or local
filesystem).
Args:
path: path to validate the file type.
url: url of the file.
content: content to write into the file.
disable_dbfs_retry: optional flag to disable the DBFS fallback and write directly to S3.
"""
if disable_dbfs_retry:
S3Storage.write_payload_to_file(url, content)
elif path.startswith("s3://") and cls.is_boto3_configured():
try:
S3Storage.write_payload_to_file(url, content)
except Exception:
DBFSStorage.write_payload_to_file(url, content)
elif path.startswith(("s3://", "dbfs:/")):
DBFSStorage.write_payload_to_file(url, content)
else:
LocalFSStorage.write_payload_to_file(url, content)
@staticmethod
def is_boto3_configured() -> bool:
"""Check if boto3 is able to locate credentials and properly configured.
If boto3 is not properly configured, we might want to try a different reader.
"""
try:
boto3.client("sts").get_caller_identity()
return True
except Exception:
return False
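A hedged usage sketch of the scheme-based dispatch implemented above (the paths are placeholders, not real locations):
```python
from lakehouse_engine.utils.storage.file_storage_functions import FileStorageFunctions

# JSON read from s3: S3 is tried first (when boto3 is configured), with a DBFS fallback.
acon = FileStorageFunctions.read_json("s3://my-bucket/acons/load_sales.json")
# SQL read from the local filesystem.
sql_text = FileStorageFunctions.read_sql("file:///app/sql/my_query.sql")
```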
================================================
FILE: lakehouse_engine/utils/storage/local_fs_storage.py
================================================
"""Module to represent a local file storage system."""
import os
from typing import TextIO
from urllib.parse import ParseResult
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.storage.file_storage import FileStorage
class LocalFSStorage(FileStorage):
"""Class to represent a local file storage system."""
_LOGGER = LoggingHandler(__name__).get_logger()
@classmethod
def get_file_payload(cls, url: ParseResult) -> TextIO:
"""Get the payload of a file.
Args:
url: url of the file.
Returns:
file payload/content.
"""
cls._LOGGER.info(f"Reading from file: {url.scheme}:{url.netloc}/{url.path}")
return open(f"{url.netloc}/{url.path}", "r")
@classmethod
def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
"""Write payload into a file.
Args:
url: url of the file.
content: content to write into the file.
"""
cls._LOGGER.info(f"Writing into file: {url.scheme}:{url.netloc}/{url.path}")
os.makedirs(os.path.dirname(f"{url.netloc}/{url.path}"), exist_ok=True)
with open(f"{url.netloc}/{url.path}", "w") as file:
file.write(content)
================================================
FILE: lakehouse_engine/utils/storage/s3_storage.py
================================================
"""Module to represent a s3 file storage system."""
from typing import Any
from urllib.parse import ParseResult
import boto3
from lakehouse_engine.utils.logging_handler import LoggingHandler
from lakehouse_engine.utils.storage.file_storage import FileStorage
class S3Storage(FileStorage):
"""Class to represent a s3 file storage system."""
_LOGGER = LoggingHandler(__name__).get_logger()
@classmethod
def get_file_payload(cls, url: ParseResult) -> Any:
"""Get the payload of a config file.
Args:
url: url of the file.
Returns:
File payload/content.
"""
s3 = boto3.resource("s3")
obj = s3.Object(url.netloc, url.path.lstrip("/"))
cls._LOGGER.info(
f"Trying with s3_storage: "
f"Reading from file: {url.scheme}://{url.netloc}{url.path}"
)
return obj.get()["Body"]
@classmethod
def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
"""Write payload into a file.
Args:
url: url of the file.
content: content to write into the file.
"""
s3 = boto3.resource("s3")
obj = s3.Object(url.netloc, url.path.lstrip("/"))
cls._LOGGER.info(
f"Trying with s3_storage: "
f"Writing into file: {url.scheme}://{url.netloc}{url.path}"
)
obj.put(Body=content)
================================================
FILE: lakehouse_engine_usage/__init__.py
================================================
"""
# How to use the Lakehouse Engine?
Lakehouse engine usage examples for all the algorithms and other core functionalities.
- [Data Loader](lakehouse_engine_usage/data_loader.html)
- [Data Quality](lakehouse_engine_usage/data_quality.html)
- [Reconciliator](lakehouse_engine_usage/reconciliator.html)
- [Sensors - Sensor & Heartbeat Sensor](lakehouse_engine_usage/sensors.html)
- [GAB](lakehouse_engine_usage/gab.html)
"""
================================================
FILE: lakehouse_engine_usage/data_loader/__init__.py
================================================
"""
.. include::data_loader.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/append_load_from_jdbc_with_permissive_mode/__init__.py
================================================
"""
.. include::append_load_from_jdbc_with_permissive_mode.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/append_load_from_jdbc_with_permissive_mode/append_load_from_jdbc_with_permissive_mode.md
================================================
# Append Load from JDBC with PERMISSIVE mode (default)
This scenario is an append load from a JDBC source (e.g., SAP BW, Oracle Database, SQL Server Database...).
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "sales_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/append_load/jdbc_permissive/tests.db",
"table": "jdbc_permissive",
"properties": {
"driver": "org.sqlite.JDBC"
}
},
"options": {
"numPartitions": 1
}
},
{
"spec_id": "sales_bronze",
"read_type": "batch",
"db_table": "test_db.jdbc_permissive_table"
}
],
"transform_specs": [
{
"spec_id": "max_sales_bronze_date",
"input_id": "sales_bronze",
"transformers": [
{
"function": "get_max_value",
"args": {
"input_col": "date"
}
}
]
},
{
"spec_id": "appended_sales",
"input_id": "sales_source",
"transformers": [
{
"function": "incremental_filter",
"args": {
"input_col": "date",
"increment_df": "max_sales_bronze_date"
}
}
]
}
],
"output_specs": [
{
"spec_id": "sales_bronze",
"input_id": "appended_sales",
"write_type": "append",
"db_table": "test_db.jdbc_permissive_table",
"data_format": "delta",
"partitions": [
"date"
],
"location": "file:///app/tests/lakehouse/out/feature/append_load/jdbc_permissive/data"
}
]
}
load_data(acon=acon)
```
##### Relevant notes
- The **ReadMode** is **PERMISSIVE** in this scenario, which **is the default in Spark**, hence we **don't need to specify it**. PERMISSIVE means the read does not fail when records do not match the expected schema, in contrast to FAILFAST.
- From a JDBC source the ReadType needs to be "batch" always as "streaming" is not available for a JDBC source.
- In this scenario we do an append load by getting the max date (transformer_spec ["get_max_value"](../../../reference/packages/transformers/aggregators.md#packages.transformers.aggregators.Aggregators.get_max_value)) on bronze and use that date to filter the source to only get data with a date greater than that max date on bronze (transformer_spec ["incremental_filter"](../../../reference/packages/transformers/filters.md#packages.transformers.filters.Filters.incremental_filter)). **That is the standard way we do incremental batch loads in the lakehouse engine.** For streaming incremental loads we rely on Spark Streaming checkpoint feature [(check a streaming append load ACON example)](../streaming_append_load_with_terminator/streaming_append_load_with_terminator.md).
================================================
FILE: lakehouse_engine_usage/data_loader/append_load_with_failfast/__init__.py
================================================
"""
.. include::append_load_with_failfast.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/append_load_with_failfast/append_load_with_failfast.md
================================================
# Append Load with FAILFAST
This scenario is an append load that enforces the schema (using the schema of the target table to enforce the schema of the source, i.e., the schema of the source needs to exactly match the schema of the target table) and fails fast (FAILFAST) if the schema of the input data does not match the one we specified.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "sales_source",
"read_type": "batch",
"data_format": "csv",
"enforce_schema_from_table": "test_db.failfast_table",
"options": {
"header": True,
"delimiter": "|",
"mode": "FAILFAST"
},
"location": "file:///app/tests/lakehouse/in/feature/append_load/failfast/data"
},
{
"spec_id": "sales_bronze",
"read_type": "batch",
"db_table": "test_db.failfast_table"
}
],
"transform_specs": [
{
"spec_id": "max_sales_bronze_date",
"input_id": "sales_bronze",
"transformers": [
{
"function": "get_max_value",
"args": {
"input_col": "date"
}
}
]
},
{
"spec_id": "appended_sales",
"input_id": "sales_source",
"transformers": [
{
"function": "incremental_filter",
"args": {
"input_col": "date",
"increment_df": "max_sales_bronze_date"
}
}
]
}
],
"output_specs": [
{
"spec_id": "sales_bronze",
"input_id": "appended_sales",
"write_type": "append",
"db_table": "test_db.failfast_table",
"data_format": "delta",
"partitions": [
"date"
],
"location": "file:///app/tests/lakehouse/out/feature/append_load/failfast/data"
}
]
}
load_data(acon=acon)
```
##### Relevant notes
- The **ReadMode** is **FAILFAST** in this scenario, i.e., the algorithm fails if the schema of the input data does not match the one we specified via the schema_path, read_schema_from_table or schema input_specs variables.
- In this scenario we do an append load by getting the max date (transformer_spec ["get_max_value"](../../../reference/packages/transformers/aggregators.md#packages.transformers.aggregators.Aggregators.get_max_value)) on bronze and use that date to filter the source to only get data with a date greater than that max date on bronze (transformer_spec ["incremental_filter"](../../../reference/packages/transformers/filters.md#packages.transformers.filters.Filters.incremental_filter)). **That is the standard way we do incremental batch loads in the lakehouse engine.** For streaming incremental loads we rely on Spark Streaming checkpoint feature [(check a streaming append load ACON example)](../streaming_append_load_with_terminator/streaming_append_load_with_terminator.md).
================================================
FILE: lakehouse_engine_usage/data_loader/batch_delta_load_init_delta_backfill_with_merge/__init__.py
================================================
"""
.. include::batch_delta_load_init_delta_backfill_with_merge.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/batch_delta_load_init_delta_backfill_with_merge/batch_delta_load_init_delta_backfill_with_merge.md
================================================
# Batch Delta Load Init, Delta and Backfill with Merge
This scenario illustrates the process of implementing a delta load algorithm by first using an ACON to perform an initial load, then another one to perform the regular deltas that will be triggered on a recurrent basis, and finally an ACON for backfilling specific parcels if ever needed.
## Init Load
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "sales_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True
},
"location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"
}
],
"transform_specs": [
{
"spec_id": "condensed_sales",
"input_id": "sales_source",
"transformers": [
{
"function": "condense_record_mode_cdc",
"args": {
"business_key": [
"salesorder",
"item"
],
"ranking_key_desc": [
"extraction_timestamp",
"actrequest_timestamp",
"datapakid",
"partno",
"record"
],
"record_mode_col": "recordmode",
"valid_record_modes": [
"",
"N",
"R",
"D",
"X"
]
}
}
]
}
],
"output_specs": [
{
"spec_id": "sales_bronze",
"input_id": "condensed_sales",
"write_type": "merge",
"data_format": "delta",
"location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",
"merge_opts": {
"merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date"
}
}
]
}
load_data(acon=acon)
```
##### Relevant Notes
- We can see that even though this is an init load we still have chosen to condense the records through our ["condense_record_mode_cdc"](../../../reference/packages/transformers/condensers.md#packages.transformers.condensers.Condensers.condense_record_mode_cdc) transformer. This is a condensation step capable of handling SAP BW style changelogs based on actrequest_timestamps, datapakid, record_mode, etc...
- In the init load we actually did a merge in this case because we wanted to test locally if a merge with an empty target table works, but you don't have to do it, as an init load can usually be just a full load. Whether merging init data into an empty table has any performance implications compared to a regular insert remains to be tested, but we have no reason to recommend a merge over an insert for an init load. As said, the merge was done solely for local testing purposes; you can simply use `write_type: "overwrite"`, as sketched below.
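For reference, a minimal sketch of how the output spec above could look when using a plain overwrite instead of a merge for the init load (all other parts of the ACON stay unchanged):
```python
"output_specs": [
    {
        "spec_id": "sales_bronze",
        "input_id": "condensed_sales",
        "write_type": "overwrite",
        "data_format": "delta",
        "location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",
    }
]
```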
## Delta Load
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "sales_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True
},
"location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"
},
{
"spec_id": "sales_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data"
}
],
"transform_specs": [
{
"spec_id": "max_sales_bronze_timestamp",
"input_id": "sales_bronze",
"transformers": [
{
"function": "get_max_value",
"args": {
"input_col": "actrequest_timestamp"
}
}
]
},
{
"spec_id": "condensed_sales",
"input_id": "sales_source",
"transformers": [
{
"function": "incremental_filter",
"args": {
"input_col": "actrequest_timestamp",
"increment_df": "max_sales_bronze_timestamp"
}
},
{
"function": "condense_record_mode_cdc",
"args": {
"business_key": [
"salesorder",
"item"
],
"ranking_key_desc": [
"extraction_timestamp",
"actrequest_timestamp",
"datapakid",
"partno",
"record"
],
"record_mode_col": "recordmode",
"valid_record_modes": [
"",
"N",
"R",
"D",
"X"
]
}
}
]
}
],
"output_specs": [
{
"spec_id": "sales_bronze",
"input_id": "condensed_sales",
"write_type": "merge",
"data_format": "delta",
"location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",
"merge_opts": {
"merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",
"delete_predicate": "new.recordmode in ('R','D','X')",
"insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')"
}
}
]
}
load_data(acon=acon)
```
##### Relevant Notes
- The merge predicate and the insert, delete or update predicates should reflect the reality of your data, and it's up to each data product to figure out which predicates better match their reality:
- The merge predicate usually involves making sure that the "primary key" for your data matches.
!!! note "**Performance Tip!!!**"
Ideally, in order to get a performance boost in your merges, you should also place a filter in your merge predicate (e.g., a certain technical or business date in the target table >= x days ago), based on the assumption that rows outside that interval will never change in the future. This can drastically decrease the merge times of big tables (see the sketch after these notes).
- The insert, delete and update predicates will always depend on the structure of your changelog and on how you expect your updates to arrive (e.g., in certain data products you know that you will never get out-of-order or late-arriving data, while in others you can never ensure that). These predicates should reflect that, in order to prevent unwanted changes to the target delta lake table.
- For example, in this scenario we delete rows that have the R, D or X record_mode values, because if, after condensing, that is the latest status of a row in the changelog, the row should be deleted; and we never insert rows with those statuses (**note**: we use this guardrail in the insert to protect against out-of-order changes, which is unlikely to happen with SAP BW).
- Because the `insert_predicate` is fully optional, in your scenario you may not require that.
- In this scenario, we don't pass an `update_predicate` in the ACON, because both `insert_predicate` and `update_predicate` are fully optional, i.e., if you don't pass them the algorithm will update any data that matches the `merge_predicate` and insert any data that does not match it. These predicates just make sure the algorithm does not insert or update data that you don't want. For instance, in a late-arriving changes scenario, a deleted row may arrive from the changelog before the corresponding update row; to prevent your target table from holding inconsistent data for a period of time (it will eventually become consistent once the latest correct status arrives from the changelog), you can add this guardrail in the insert or update predicates. Again, for most sources this will not happen, but sources like Kafka, for example, cannot fully guarantee order.
- In order to understand how we can cover different scenarios (e.g., late arriving changes, out of order changes, etc.), please go [here](../streaming_delta_with_late_arriving_and_out_of_order_events/streaming_delta_with_late_arriving_and_out_of_order_events.md).
- The order of the predicates in the ACON does not matter; what matters is the logic in the lakehouse engine [DeltaMergeWriter's "_merge" function](../../../reference/packages/io/writers/delta_merge_writer.md#packages.io.writers.delta_merge_writer.DeltaMergeWriter.__init__).
- Notice the "<=>" operator? In Spark SQL that's the null-safe equality operator.
## Backfilling
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "sales_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True
},
"location": "file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/backfill/data"
},
{
"spec_id": "sales_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data"
}
],
"transform_specs": [
{
"spec_id": "max_sales_bronze_timestamp",
"input_id": "sales_bronze",
"transformers": [
{
"function": "get_max_value",
"args": {
"input_col": "actrequest_timestamp"
}
}
]
},
{
"spec_id": "condensed_sales",
"input_id": "sales_source",
"transformers": [
{
"function": "incremental_filter",
"args": {
"input_col": "actrequest_timestamp",
"increment_value": "20180110120052t",
"greater_or_equal": True
}
},
{
"function": "condense_record_mode_cdc",
"args": {
"business_key": [
"salesorder",
"item"
],
"ranking_key_desc": [
"extraction_timestamp",
"actrequest_timestamp",
"datapakid",
"partno",
"record"
],
"record_mode_col": "recordmode",
"valid_record_modes": [
"",
"N",
"R",
"D",
"X"
]
}
}
]
}
],
"output_specs": [
{
"spec_id": "sales_bronze",
"input_id": "condensed_sales",
"write_type": "merge",
"data_format": "delta",
"location": "file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/backfill/data",
"merge_opts": {
"merge_predicate": "current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",
"delete_predicate": "new.recordmode in ('R','D','X')",
"insert_predicate": "new.recordmode is null or new.recordmode not in ('R','D','X')"
}
}
]
}
load_data(acon=acon)
```
##### Relevant Notes
- The backfilling process depicted here is fairly similar to the init load, but it is relevant to highlight that it uses a static value (which can be adjusted to the backfilling needs) in the [incremental_filter](../../../reference/packages/transformers/filters.md#packages.transformers.filters.Filters.incremental_filter) function.
- Other relevant functions for backfilling may include the [expression_filter](../../../reference/packages/transformers/filters.md#packages.transformers.filters.Filters.expression_filter) function, where you can use a custom SQL filter to filter the input data (a sketch follows below).
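A hedged sketch of a transformer entry using `expression_filter` to restrict the backfill window; the argument name (`exp`) and the filter values are assumptions that should be checked against the transformer's reference linked above:
```python
"transformers": [
    {
        "function": "expression_filter",
        "args": {
            # hypothetical SQL filter limiting the backfill to a specific window
            "exp": "actrequest_timestamp >= '20180110120052' and actrequest_timestamp < '20180111000000'"
        }
    }
]
```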
================================================
FILE: lakehouse_engine_usage/data_loader/custom_transformer/__init__.py
================================================
"""
.. include::custom_transformer.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/custom_transformer/custom_transformer.md
================================================
# Custom Transformer
There may appear a scenario where the data product dev team faces the need to perform complex data transformations that are either not yet available in the lakehouse engine or the logic is just too complex to chain in an ACON file. In the context of the lakehouse, the only layers that usually can impose that complexity is silver+ and gold. This page targets exactly those cases.
Below you'll find a notebook where you can pass your own PySpark or Spark SQL logic into the ACON, by dynamically injecting a python function into the ACON dictionary. The lakehouse engine will take care of executing those transformations in the transformation step of the data loader algorithm. Please read the notebook's comments carefully to understand how it works, or simply open it in your notebook environment, which will make the notebook's code and comments more readable.
!!! warning "Force Streaming Micro Batch Processing."
When you use streaming mode with a custom transformer, it’s highly advisable to set the `force_streaming_foreach_batch_processing` flag to `True` in the transform specification, as explained in the ACON example below!
## What is a custom transformer in the Lakehouse Engine and how you can use it to write your own pyspark logic?
We highly promote the Lakehouse Engine for creating Data Products aligned with the data source (bronze/silver layer), pumping data into silver so our Data Scientists and Analysts can leverage the value of the data in silver, as close as it comes from the source.
The low-code and configuration-driven nature of the lakehouse engine makes it a compelling framework to use in such cases, where the transformations that are done from bronze to silver are not that many, as we want to keep the data close to the source.
However, when it comes to Data Products enriched in some way or built for insights (silver+, gold), they are typically heavy
on transformations (they are the T of the overall ELT process), so the configuration-driven nature of the lakehouse engine
could get in the way of adequately building them. Considering this, and considering our user base that prefers an ACON-based
approach and all the nice off-the-shelf features of the lakehouse engine, we have developed a feature that
allows you to **write custom transformers containing your entire pyspark logic and pass them as an argument
in the ACON** (the configuration file that configures every lakehouse engine algorithm).
!!! note "Motivation"
Doing that, you let the ACON guide your read, data quality, write and terminate processes, and you just focus on transforming data :)
## Custom transformation Function
The function below is the one that encapsulates all your defined pyspark logic and sends it as a python function to the lakehouse engine. This function will then be invoked internally in the lakehouse engine via a df.transform() function. If you are interested in checking the internals of the lakehouse engine, our codebase is openly available here: https://github.com/adidas/lakehouse-engine
!!! warning "Attention!!!"
For this process to work, your function defined below needs to receive a DataFrame and return a DataFrame. Any other method signature (e.g., defining more parameters) will not work, unless you use something like [python partials](https://docs.python.org/3/library/functools.html#functools.partial) (see the sketch after the example below).
```python
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, sum, when
from lakehouse_engine.core.exec_env import ExecEnv
def get_new_data(df: DataFrame) -> DataFrame:
"""Get the new data from the lakehouse engine reader and prepare it."""
return (
df.withColumn("amount", when(col("_change_type") == "delete", lit(0)).otherwise(col("amount")))
.select("article_id", "order_date", "amount")
.groupBy("article_id", "order_date")
.agg(sum("amount").alias("amount"))
)
def get_joined_data(new_data_df: DataFrame, current_data_df: DataFrame) -> DataFrame:
"""Join the new data with the current data already existing in the target dataset."""
return (
new_data_df.alias("new_data")
.join(
current_data_df.alias("current_data"),
[
new_data_df.article_id == current_data_df.article_id,
new_data_df.order_date == current_data_df.order_date,
],
"left_outer",
)
.withColumn(
"current_amount", when(col("current_data.amount").isNull(), lit(0)).otherwise("current_data.amount")
)
.withColumn("final_amount", col("current_amount") + col("new_data.amount"))
.select(col("new_data.article_id"), col("new_data.order_date"), col("final_amount").alias("amount"))
)
def calculate_kpi(df: DataFrame) -> DataFrame:
"""Calculate KPI through a custom transformer that will be provided in the ACON.
Args:
df: DataFrame passed as input.
Returns:
DataFrame: the transformed DataFrame.
"""
new_data_df = get_new_data(df)
# we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because is the internal object the
# lakehouse engine uses to refer to the spark session. But if you use 'spark' should also be fine.
current_data_df = ExecEnv.SESSION.table(
"my_database.my_table"
)
transformed_df = get_joined_data(new_data_df, current_data_df)
return transformed_df
```
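As hinted in the warning above, if your transformer genuinely needs extra parameters, one option is to bind them with `functools.partial` so the engine still receives a DataFrame -> DataFrame callable. A minimal, hypothetical sketch (the extra `target_table` parameter is just an illustration):
```python
from functools import partial
from pyspark.sql import DataFrame

def calculate_kpi_with_params(df: DataFrame, target_table: str) -> DataFrame:
    """Hypothetical transformer that also needs the target table name."""
    # ... your pyspark logic using target_table ...
    return df

# Bind the extra argument; calculate_kpi_bound now only takes a DataFrame and can
# be passed in the ACON's "custom_transformer" argument, just like calculate_kpi.
calculate_kpi_bound = partial(calculate_kpi_with_params, target_table="my_database.my_table")
```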
### Don't like pyspark API? Write SQL
You don't have to comply to the pyspark API if you prefer SQL. Inside the function above (or any of
the auxiliary functions you decide to develop) you can write something like:
````python
def calculate_kpi(df: DataFrame) -> DataFrame:
df.createOrReplaceTempView("new_data")
# we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because is the internal object the
# lakehouse engine uses to refer to the spark session. But if you use 'spark' should also be fine.
ExecEnv.SESSION.sql(
"""
CREATE OR REPLACE TEMP VIEW my_kpi AS
SELECT ... FROM new_data ...
"""
)
return ExecEnv.SESSION.table("my_kpi")
````
## Just your regular ACON
If you look at the ACON below, everything is the same as you would do in a Data Product, but the `transform_specs` section has one difference: a function called `"custom_transformation"`, to which we supply as an argument the function defined above with the pyspark code.
!!! warning "Attention!!!"
Do not pass the function as calculate_kpi(), but as calculate_kpi, otherwise you are telling python to invoke the function right away, as opposed to passing it as an argument to be invoked later by the lakehouse engine.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "sales",
"read_type": "streaming",
"data_format": "delta",
"db_table": "my_database.dummy_sales",
"options": {"readChangeFeed": "true"},
}
],
"transform_specs": [
{
"spec_id": "transformed_sales_kpi",
"input_id": "sales",
# because we are using streaming, this allows us to make sure that
# all the computation in our custom transformer gets pushed to
# Spark's foreachBatch method in a stream, which allows us to
# run all Spark functions in a micro batch DataFrame, as there
# are some Spark functions that are not supported in streaming.
"force_streaming_foreach_batch_processing": True,
"transformers": [
{
"function": "custom_transformation",
"args": {"custom_transformer": calculate_kpi},
},
],
}
],
"dq_specs": [
{
"spec_id": "my_table_quality",
"input_id": "transformed_sales_kpi",
"dq_type": "validator",
"bucket": "my_dq_bucket",
"expectations_store_prefix": "dq/expectations/",
"validations_store_prefix": "dq/validations/",
"checkpoint_store_prefix": "dq/checkpoints/",
"tbl_to_derive_pk": "my_table",
"dq_functions": [
{"function": "expect_column_values_to_not_be_null", "args": {"column": "article_id"}},
],
},
],
"output_specs": [
{
"spec_id": "sales_kpi",
"input_id": "transformed_sales_kpi",
"write_type": "merge",
"data_format": "delta",
"db_table": "my_database.my_table",
"options": {
"checkpointLocation": "s3://my_data_product_bucket/gold/my_table",
},
"merge_opts": {
"merge_predicate": "new.article_id = current.article_id AND new.order_date = current.order_date"
},
}
],
}
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/custom_transformer/sql_custom_transformer.md
================================================
# SQL Custom Transformer
The SQL Custom Transformer executes a SQL transformation provided by the user. This transformer can be very useful whenever the user wants to perform SQL-based transformations that are not natively supported by the lakehouse engine transformers.
The transformer receives the SQL query to be executed. This can read from any table or view from the catalog, or any dataframe registered as a temp view.
> To register a dataframe as a temp view you can use the "temp_view" config in the input_specs, as shown below.
```python
from lakehouse_engine.engine import load_data
# The ACON below references this SQL; a simple example reading from the temp view
# registered via "temp_view" in the input_specs.
SQL = "SELECT * FROM sales_sql"
acon = {
"input_specs": [
{
"spec_id": "sales_source",
"read_type": "batch",
"data_format": "csv",
"options": {"mode": "FAILFAST", "header": True, "delimiter": "|"},
"schema_path": "file:///app/tests/lakehouse/in/feature/"
"data_loader_custom_transformer/sql_transformation/"
"source_schema.json",
"location": "file:///app/tests/lakehouse/in/feature/"
"data_loader_custom_transformer/sql_transformation/data",
"temp_view": "sales_sql",
}
],
"transform_specs": [
{
"spec_id": "calculated_kpi",
"input_id": "sales_source",
"transformers": [
{
"function": "sql_transformation",
"args": {"sql": SQL},
}
],
}
],
"output_specs": [
{
"spec_id": "sales_bronze",
"input_id": "calculated_kpi",
"write_type": "overwrite",
"data_format": "delta",
"location": "file:///app/tests/lakehouse/out/feature/"
"data_loader_custom_transformer/sql_transformation/data",
}
],
}
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/custom_transformer_sql/__init__.py
================================================
"""
.. include::custom_transformer_sql.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/custom_transformer_sql/custom_transformer_sql.md
================================================
# SQL Custom Transformer
The SQL Custom Transformer executes a SQL transformation provided by the user. This transformer can be very useful whenever the user wants to perform SQL-based transformations that are not natively supported by the lakehouse engine transformers.
The transformer receives the SQL query to be executed. This can read from any table or view from the catalog, or any dataframe registered as a temp view.
> To register a dataframe as a temp view you can use the "temp_view" config in the input_specs, as shown below.
```python
from lakehouse_engine.engine import load_data
# The ACON below references this SQL; a simple example reading from the temp view
# registered via "temp_view" in the input_specs.
SQL = "SELECT * FROM sales_sql"
acon = {
"input_specs": [
{
"spec_id": "sales_source",
"read_type": "batch",
"data_format": "csv",
"options": {"mode": "FAILFAST", "header": True, "delimiter": "|"},
"schema_path": "file:///app/tests/lakehouse/in/feature/"
"data_loader_custom_transformer/sql_transformation/"
"source_schema.json",
"location": "file:///app/tests/lakehouse/in/feature/"
"data_loader_custom_transformer/sql_transformation/data",
"temp_view": "sales_sql",
}
],
"transform_specs": [
{
"spec_id": "calculated_kpi",
"input_id": "sales_source",
"transformers": [
{
"function": "sql_transformation",
"args": {"sql": SQL},
}
],
}
],
"output_specs": [
{
"spec_id": "sales_bronze",
"input_id": "calculated_kpi",
"write_type": "overwrite",
"data_format": "delta",
"location": "file:///app/tests/lakehouse/out/feature/"
"data_loader_custom_transformer/sql_transformation/data",
}
],
}
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/data_loader.md
================================================
# Data Loader
## How to configure a DataLoader algorithm in the lakehouse-engine by using an ACON file?
An algorithm (e.g., data load) in the lakehouse-engine is configured using an ACON. The lakehouse-engine is a
configuration-driven framework, so people don't have to write code to execute a Spark algorithm. In contrast, the
algorithm is written in pyspark and accepts configurations through a JSON file (an ACON - algorithm configuration). The
ACON is the configuration providing the behaviour of a lakehouse engine algorithm. [You can check the algorithm code, and
how it interprets the ACON here](../../reference/packages/algorithms/algorithm.md).
In this page we will go through the structure of an ACON file and what are the most suitable ACON files for common data
engineering scenarios.
Check the underneath pages to find several **ACON examples** that cover many data extraction, transformation and loading scenarios.
## Overview of the Structure of the ACON file for DataLoads
An ACON-based algorithm needs several specifications to work properly, but some of them might be optional. The available
specifications are:
- **Input specifications (input_specs)**: specify how to read data. This is a **mandatory** keyword.
- **Transform specifications (transform_specs)**: specify how to transform data.
- **Data quality specifications (dq_specs)**: specify how to execute the data quality process.
- **Output specifications (output_specs)**: specify how to write data to the target. This is a **mandatory** keyword.
- **Terminate specifications (terminate_specs)**: specify what to do after writing into the target (e.g., optimising target table, vacuum, compute stats, expose change data feed to external location, etc.).
- **Execution environment (exec_env)**: custom Spark session configurations to be provided for your algorithm (configurations can also be provided from your job/cluster configuration, which we highly advise you to do instead of passing performance related configs here for example).
Below is an example of a complete ACON file that reads from an S3 folder with CSVs and incrementally loads that data (using a merge) into a delta lake table.
!!! note "What is the **spec_id**?"
**spec_id** is one of the main concepts to ensure you can chain the steps of the algorithm, so, for example, you can specify the transformations (in transform_specs) of a DataFrame that was read in the input_specs. Check ACON below to see how the spec_id of the input_specs is used as input_id in one transform specification.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "orders_bronze",
"read_type": "streaming",
"data_format": "csv",
"schema_path": "s3://my-data-product-bucket/artefacts/metadata/bronze/schemas/orders.json",
"with_filepath": True,
"options": {
"badRecordsPath": "s3://my-data-product-bucket/badrecords/order_events_with_dq/",
"header": False,
"delimiter": "\u005E",
"dateFormat": "yyyyMMdd"
},
"location": "s3://my-data-product-bucket/bronze/orders/"
}
],
"transform_specs": [
{
"spec_id": "orders_bronze_with_extraction_date",
"input_id": "orders_bronze",
"transformers": [
{
"function": "with_row_id"
},
{
"function": "with_regex_value",
"args": {
"input_col": "lhe_extraction_filepath",
"output_col": "extraction_date",
"drop_input_col": True,
"regex": ".*WE_SO_SCL_(\\d+).csv"
}
}
]
}
],
"dq_specs": [
{
"spec_id": "check_orders_bronze_with_extraction_date",
"input_id": "orders_bronze_with_extraction_date",
"dq_type": "validator",
"result_sink_db_table": "my_database.my_table_dq_checks",
"fail_on_error": False,
"dq_functions": [
{
"dq_function": "expect_column_values_to_not_be_null",
"args": {
"column": "omnihub_locale_code"
}
},
{
"dq_function": "expect_column_unique_value_count_to_be_between",
"args": {
"column": "product_division",
"min_value": 10,
"max_value": 100
}
},
{
"dq_function": "expect_column_max_to_be_between",
"args": {
"column": "so_net_value",
"min_value": 10,
"max_value": 1000
}
},
{
"dq_function": "expect_column_value_lengths_to_be_between",
"args": {
"column": "omnihub_locale_code",
"min_value": 1,
"max_value": 10
}
},
{
"dq_function": "expect_column_mean_to_be_between",
"args": {
"column": "coupon_code",
"min_value": 15,
"max_value": 20
}
}
]
}
],
"output_specs": [
{
"spec_id": "orders_silver",
"input_id": "check_orders_bronze_with_extraction_date",
"data_format": "delta",
"write_type": "merge",
"partitions": [
"order_date_header"
],
"merge_opts": {
"merge_predicate": """
new.sales_order_header = current.sales_order_header
and new.sales_order_schedule = current.sales_order_schedule
and new.sales_order_item=current.sales_order_item
and new.epoch_status=current.epoch_status
and new.changed_on=current.changed_on
and new.extraction_date=current.extraction_date
and new.lhe_batch_id=current.lhe_batch_id
and new.lhe_row_id=current.lhe_row_id
""",
"insert_only": True
},
"db_table": "my_database.my_table_with_dq",
"location": "s3://my-data-product-bucket/silver/order_events_with_dq/",
"with_batch_id": True,
"options": {
"checkpointLocation": "s3://my-data-product-bucket/checkpoints/order_events_with_dq/"
}
}
],
"terminate_specs": [
{
"function": "optimize_dataset",
"args": {
"db_table": "my_database.my_table_with_dq"
}
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True
}
}
load_data(acon=acon)
```
## Input Specifications
You specify how to read the data by providing a list of Input Specifications. Usually there's just one element in that
list, as, in the lakehouse, you are generally focused on reading data from one layer (e.g., source, bronze, silver,
gold) and putting it on the next layer. However, there may be scenarios where you would like to combine two datasets (e.g.,
joins or incremental filtering on one dataset based on the values of another
one), and therefore you can use one or more elements.
[More information about InputSpecs](../../reference/packages/core/definitions.md#packages.core.definitions.InputSpec).
##### Relevant notes
- A spec id is fundamental, so you can use the input data later on in any step of the algorithm (transform, write, dq process, terminate).
- You don't have to specify `db_table` and `location` at the same time. Depending on the data_format sometimes you read from a table (e.g., jdbc or deltalake table) sometimes you read from a location (e.g., files like deltalake, parquet, json, avro... or kafka topic).
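To make the note above concrete, below is a minimal sketch (with placeholder table and path names) of two input specifications: one reading a catalog table through `db_table` and one reading files from a `location`.
```python
# Minimal input_specs sketches (placeholder names, not a full ACON).
# Reading a delta table by name:
table_input_spec = {
    "spec_id": "sales_silver_source",
    "read_type": "batch",
    "data_format": "delta",
    "db_table": "my_database.my_silver_table",
}
# Reading files from a location (e.g., CSVs in S3):
files_input_spec = {
    "spec_id": "sales_bronze_source",
    "read_type": "streaming",
    "data_format": "csv",
    "schema_path": "s3://my-data-product-bucket/artefacts/metadata/bronze/schemas/sales.json",
    "options": {"header": True, "delimiter": "|"},
    "location": "s3://my-data-product-bucket/bronze/sales/",
}
```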
## Transform Specifications
In the lakehouse engine, you transform data by providing a transform specification, which contains a list of transform functions (transformers). So the transform specification acts upon an input, and it can execute multiple lakehouse engine transformation functions (transformers) upon that input.
If you look at the example above, we ask the lakehouse engine to execute two functions on the `orders_bronze` input
data: `with_row_id` and `with_regex_value`. Those functions can of course receive arguments. You can see a list of all
available transformation functions (transformers) here: `lakehouse_engine.transformers`. Then, you just invoke them in
your ACON as demonstrated above, following exactly the same function names and parameter names as described in the code
documentation.
[More information about TransformSpec](../../reference/packages/core/definitions.md#packages.core.definitions.TransformSpec).
##### Relevant notes
- This stage is fully optional, you can omit it from the ACON.
- There is one relevant option `force_streaming_foreach_batch_processing` that can be used to force the transform to be
executed in the foreachBatch function to ensure non-supported streaming operations can be properly executed. You don't
have to worry about this if you are using regular lakehouse engine transformers. But if you are providing your custom
logic in pyspark code via our lakehouse engine
custom_transformation (`lakehouse_engine.transformers.custom_transformers`) then sometimes your logic may contain
Spark functions that are not compatible with Spark Streaming, and therefore this flag can enable all of your
computation to be streaming-compatible by pushing down all the logic into the foreachBatch() function.
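As a rough illustration of the note above, here is a sketch of a transform specification that pushes the whole computation into foreachBatch. The `custom_transformation` function name and its `custom_transformer` argument are assumptions based on the `lakehouse_engine.transformers.custom_transformers` module mentioned above; confirm the exact names in the code documentation.
```python
from pyspark.sql import DataFrame

def add_kpi(df: DataFrame) -> DataFrame:
    # Any custom pyspark logic, possibly using operations not natively supported in streaming.
    return df.withColumn("kpi", df["so_net_value"] * 0.1)

transform_spec = {
    "spec_id": "orders_with_kpi",
    "input_id": "orders_bronze",
    # Push all logic into foreachBatch so non-streaming-compatible operations can run:
    "force_streaming_foreach_batch_processing": True,
    "transformers": [
        # Assumed function/argument names for custom transformations:
        {"function": "custom_transformation", "args": {"custom_transformer": add_kpi}},
    ],
}
```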
## Data Quality Specifications
One of the most relevant features of the lakehouse engine is that you can have data quality guardrails that prevent you
from loading bad data into your target layer (e.g., bronze, silver or gold). The lakehouse engine data quality process
includes one main feature at the moment:
- **Validator**: The capability to perform data quality checks on that data (e.g., is the max value of a column bigger
than x?) and even tag your data with the results of the DQ checks.
The output of the data quality process can be written into a [**Result Sink**](../data_quality/result_sink/result_sink.md) target (e.g. table or files) and is integrated with a [Data Docs website](../data_quality/data_quality.md#3-data-docs-website), which can be a company-wide available website for people to check the quality of their data and share with others.
To achieve all of this functionality the lakehouse engine uses [Great Expectations](https://greatexpectations.io/) internally. To hide the Great Expectations internals from our user base and provide friendlier abstractions using the ACON, we have developed the concept of DQSpec that can contain many DQFunctionSpec objects, which is very similar to the relationship between the TransformSpec and TransformerSpec, which means you can have multiple Great Expectations functions executed inside a single data quality specification (as in the ACON above).
!!! note
The names of the functions and args are a 1 to 1 match of [Great Expectations API](https://greatexpectations.io/expectations/).
[More information about DQSpec](../../reference/packages/core/definitions.md#packages.core.definitions.DQSpec).
##### Relevant notes
- You can write the outputs of the DQ process to a sink through the `result_sink*` parameters of the
DQSpec (see the sketch after this list). `result_sink_options` takes any Spark options for a DataFrame writer, which means you can specify the options
according to your sink format (e.g., delta, parquet, json, etc.). We usually recommend using `"delta"` as the format.
- You can use the results of the DQ checks to tag the data that you are validating. When configured, these details will
appear as a new column (like any other), as part of the tables of your Data Product.
- To analyse the data written by `result_sink*`, you can set `result_sink_explode` to true (which is the default), so that
some columns are expanded. Those are:
    - General columns: columns holding the basic information regarding the `dq_specs`; they always have values
    and do not depend on the expectation types chosen.
        - Columns: `checkpoint_config`, `run_name`, `run_time`, `run_results`, `success`, `validation_result_identifier`, `spec_id`, `input_id`, `validation_results`, `run_time_year`, `run_time_month`, `run_time_day`.
    - Statistics columns: columns holding information about the runs of the expectations, with values referring to the
    whole run and not to each individual expectation. They come from `run_results.validation_result.statistics.*`.
        - Columns: `evaluated_expectations`, `success_percent`, `successful_expectations`, `unsuccessful_expectations`.
    - Expectations columns: columns holding information about each executed expectation.
        - Columns: `expectation_type`, `batch_id`, `expectation_success`, `exception_info`. These columns are exploded
        from `run_results.validation_result.results`,
        namely from `expectation_config.expectation_type`, `expectation_config.kwargs.batch_id`, `success as expectation_success`,
        and `exception_info`. Moreover, we also include `unexpected_index_list`, `observed_value` and `kwargs`.
    - Arguments of Expectations columns: columns that depend on the `expectation_type` selected. They are exploded
    from `run_results.validation_result.results`, namely from `expectation_config.kwargs.*`.
        - Examples include: `column`, `column_A`, `column_B`, `max_value`, `min_value`, `value`, `value_pairs_set`, `value_set`,
        and others.
    - Need more columns? They can be added using `result_sink_extra_columns`, in which you can select columns
    like `<column>` and/or explode columns like `<column>.*`.
- Use the parameter `source` to identify the data being validated, making later analysis easier.
- By default, Great Expectations will also provide a site presenting the history of the DQ validations that you have performed on your data.
- You can analyse all your expectations and create a dashboard aggregating all that information.
- This stage is fully optional, you can omit it from the ACON.
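Putting the notes above together, a minimal sketch of a dq_spec writing its results to a Result Sink could look like the following. All values are placeholders and the `result_sink_options` content is only illustrative.
```python
dq_spec = {
    "spec_id": "check_orders",
    "input_id": "orders_bronze_with_extraction_date",
    "dq_type": "validator",
    "fail_on_error": False,
    # Persist the DQ results (delta is the usually recommended format):
    "result_sink_db_table": "my_database.my_table_dq_checks",
    "result_sink_options": {"mergeSchema": "true"},
    # Explode the result columns for easier analysis (default behaviour):
    "result_sink_explode": True,
    # Identify the data being validated, to make later analysis easier:
    "source": "orders",
    "dq_functions": [
        {"dq_function": "expect_column_values_to_not_be_null", "args": {"column": "omnihub_locale_code"}},
    ],
}
```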
## Output Specifications
The output_specs section of an ACON is relatively similar to the input_specs section, but of course focusing on how to write the results of the algorithm, instead of specifying the input for the algorithm, hence the name output_specs (output specifications). [More information about OutputSpec](../../reference/packages/core/definitions.md#packages.core.definitions.OutputSpec).
##### Relevant notes
- Respect the supported write types and output formats.
- One of the most relevant options to specify in the options parameter is the `checkpointLocation` when in streaming
read mode (as in the sketch below), because that location stores which data you have already read and transformed from the
source, **when the source is a Spark Streaming compatible source (e.g., Kafka or S3 files)**.
## Terminate Specifications
The terminate_specs section of the ACON is responsible for some "wrapping up" activities like optimising a table,
vacuuming old files in a delta table, etc. With time the list of available terminators will likely increase (e.g.,
reconciliation processes), but for now we have the [following terminators](../../reference/packages/terminators/index.md).
This stage is fully optional, you can omit it from the ACON.
The most relevant ones right now, in the context of the lakehouse initiative, are the following:
- [dataset_optimizer](../../reference/packages/terminators/dataset_optimizer.md)
- [cdf_processor](../../reference/packages/terminators/cdf_processor.md)
- [sensor_terminator](../../reference/packages/terminators/sensor_terminator.md)
- [notifier_terminator](../../reference/packages/terminators/notifiers/email_notifier.md)
[More information about TerminatorSpec](../../reference/packages/core/definitions.md#packages.core.definitions.TerminatorSpec).
## Execution Environment
In the exec_env section of the ACON you can pass any Spark Session configuration that you want to define for the
execution of your algorithm. This is basically just a JSON structure that takes in any Spark Session property, so no
custom lakehouse engine logic. This stage is fully optional, you can omit it from the ACON.
!!! note
Please be aware that Spark Session configurations that are not allowed to be changed when the Spark cluster is already
running need to be passed in the configuration of the job/cluster that runs this algorithm, not here in this section.
This section only accepts Spark Session configs that can be changed at runtime. Whenever you introduce an option, make
sure that it takes effect at runtime because, to the best of our knowledge, there's no list of allowed Spark properties
to be changed after the cluster is already running. Moreover, Spark algorithms typically fail if you try to modify a
config that can only be set before the cluster is running.
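As an example, an exec_env section limited to runtime-changeable Spark Session properties could look like the sketch below; whether a given property can actually be changed at runtime depends on your Spark/cluster setup.
```python
exec_env = {
    # Properties that typically can be set at runtime:
    "spark.sql.shuffle.partitions": "200",
    "spark.databricks.delta.schema.autoMerge.enabled": True,
}
```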
================================================
FILE: lakehouse_engine_usage/data_loader/extract_from_sap_b4_adso/__init__.py
================================================
"""
.. include::extract_from_sap_b4_adso.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/extract_from_sap_b4_adso/extract_from_sap_b4_adso.md
================================================
# Extract from SAP B4 ADSOs
A custom sap_b4 reader and a few utils are offered in the lakehouse-engine framework so that data from
SAP B4 ADSOs can be easily consumed. The framework abstracts all the logic behind the init/delta extractions
(AQ vs CL, active table, changelog table, requests status table, how to identify the next delta timestamp...),
only requiring a few parameters that are explained and exemplified in the
[template](#extraction-from-sap-b4-adsos-template) scenarios that we have created.
!!! note
This custom reader is very similar to and uses most features from the sap_bw reader, so if you were using specific filters/parameters with the sap_bw reader, there is a high chance you can keep using them in a very similar way with the sap_b4 reader. The main concepts, such as the strategies on how to parallelize the extractions, apply to both readers.
How can I find a good candidate column for [partitioning the extraction from S4Hana?](../extract_from_sap_bw_dso/extract_from_sap_bw_dso.md#how-can-we-decide-the-partitionColumn)
!!! danger "**Parallelization Limitations**"
There are no limits imposed by the Lakehouse-Engine framework, but you need to consider that there might be differences imposed by the source.
E.g., each user might be restricted to using about 100GB of memory at a time on the source.
Parallel extractions ***can bring a jdbc source down*** if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.
!!! danger
**In case you want to perform further filtering on the REQTSN field, please be aware that it is not pushed down to SAP B4 by default (meaning it will have bad performance).**
In that case, you will need to use the customSchema option while reading, so that filter pushdown is enabled for that field.
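A sketch of what that could look like in the input_specs options is shown below; the decimal precision/scale for REQTSN is an assumption and must match your ADSO's actual type.
```python
# Assumed example: overriding the JDBC-inferred type of REQTSN via Spark's "customSchema"
# option so that filters on REQTSN can be pushed down to SAP B4.
input_spec_options = {
    "url": "my_sap_b4_url",
    "user": "my_user",
    "password": "my_b4_hana_pwd",
    "dbtable": "my_database.my_table",
    "extraction_type": "delta",
    "latest_timestamp_data_location": "s3://my_path/my_identifier/",
    "adso_type": "AQ",
    "customSchema": "REQTSN DECIMAL(23,0)",  # precision/scale are assumptions
}
```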
You can check the code documentation of the reader below:
[**SAP B4 Reader**](../../../reference/packages/io/readers/sap_b4_reader.md)
[**JDBC Extractions arguments**](../../../reference/packages/utils/extraction/jdbc_extraction_utils.md#packages.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__)
[**SAP B4 Extractions arguments**](../../../reference/packages/utils/extraction/sap_b4_extraction_utils.md#packages.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction.__init__)
!!! note
For extractions using the SAP B4 reader, you can use the arguments listed in the SAP B4 arguments, but also the ones listed in the JDBC extractions, as those are inherited as well.
## Extraction from SAP B4 ADSOs Template
This template covers the following scenarios of extractions from the SAP B4Hana ADSOs:
- 1 - The Simplest Scenario (Not parallel - Not Recommended)
- 2 - Parallel extraction
- 2.1 - Simplest Scenario
- 2.2 - Provide upperBound (Recommended)
- 2.3 - Automatic upperBound (Recommended)
- 2.4 - Provide predicates (Recommended)
- 2.5 - Generate predicates (Recommended)
!!! note
The template will cover two ADSO Types:
- **AQ**: ADSO which is of append type and for which a single ADSO/table holds all the information, like an
event table. For this type, the same ADSO is used for reading data both for the inits and deltas. Usually, these
ADSOs end with the digit "6".
- **CL**: ADSO which is split into two ADSOs, one holding the change log events, the other having the active
data (current version of the truth for a particular source). For this type, the ADSO having the active data
is used for the first extraction (init) and the change log ADSO is used for the subsequent extractions (deltas).
Usually, these ADSOs are split into active table ending with the digit "2" and changelog table ending with digit "3".
For each of these ADSO types, the lakehouse-engine abstracts the logic to get the delta extractions. This logic
basically consists of joining the `db_table` (for `AQ`) or the `changelog_table` (for `CL`) with the table
having the requests status (`my_database.requests_status_table`).
One of the fields used for this joining is the `data_target`, which has a relationship with the ADSO
(`db_table`/`changelog_table`), being basically the same identifier without considering parts of it.
Based on the previous insights, the queries that the lakehouse-engine generates under the hood translate to
(this is a simplified version, for more details please refer to the lakehouse-engine code documentation):
**AQ Init Extraction:**
`SELECT t.*, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM my_database.my_table t`
**AQ Delta Extraction:**
`SELECT tbl.*, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM my_database.my_table AS tbl
JOIN my_database.requests_status_table AS req
WHERE STORAGE = 'AQ' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')
AND REQUEST_STATUS IN ('GG', 'GR') AND UPPER(DATATARGET) = UPPER('my_identifier')
AND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table`
**CL Init Extraction:**
`SELECT t.*,
{self._SAP_B4_EXTRACTION.extraction_timestamp}000000000 AS reqtsn,
'0' AS datapakid,
0 AS record,
CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM my_database.my_table_2 t`
**CL Delta Extraction:**
`SELECT tbl.*,
CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM my_database.my_table_3 AS tbl
JOIN my_database.requests_status_table AS req
WHERE STORAGE = 'AT' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')
AND REQUEST_STATUS IN ('GG') AND UPPER(DATATARGET) = UPPER('my_data_target')
AND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table`
!!! note "Introductory Notes"
If you want to have a better understanding about JDBC Spark optimizations, here you have a few useful links:
- https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
- https://docs.databricks.com/en/connect/external-systems/jdbc.html
- https://bit.ly/3x2eCEm
- https://newbedev.com/how-to-optimize-partitioning-when-migrating-data-from-jdbc-source
### 1 - The Simplest Scenario (Not parallel - Not Recommended)
This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques
and using a single connection to retrieve all the data from the source. It should only be used in case the ADSO
you want to extract from SAP B4Hana is a small one, with no big requirements in terms of performance to fulfill.
When extracting from the source ADSO, there are two options:
- **Delta Init** - full extraction of the source ADSO. You should use it the first time you extract from the
ADSO or any time you want to re-extract completely. Similar to a so-called full load.
- **Delta** - extracts the portion of the data that is new or has changed in the source since the last
extraction (using the `max_timestamp` value found in the location of the data already extracted,
`latest_timestamp_data_location`).
The example below is composed of two cells.
- The first cell is only responsible for defining the variables `extraction_type` and `write_type`,
depending on the extraction type: **Delta Init** (`load_type = "init"`) or a **Delta** (`load_type = "delta"`).
The variables in this cell will also be referenced by other acons/examples in this notebook, similar to what
you would do in your pipelines/jobs, defining this centrally and then re-using it.
- The second cell is where the acon to be used is defined (using the two variables `extraction_type` and
`write_type` defined before) and the `load_data` algorithm is executed to perform the extraction.
!!! note
There may be cases where you might want to always extract fully from the source ADSO. In these cases,
you only need to use a Delta Init every time, meaning you would use `"extraction_type": "init"` and
`"write_type": "overwrite"` as it is shown below. The explanation about what it is a Delta Init/Delta is
applicable for all the scenarios presented in this notebook.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_b4",
"options": {
"url": "my_sap_b4_url",
"user": "my_user",
"password": "my_b4_hana_pwd",
"dbtable": "my_database.my_table",
"extraction_type": extraction_type,
"latest_timestamp_data_location": "s3://my_path/my_identifier/",
"adso_type": "AQ",
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
### 2 - Parallel extraction
In this section, 5 possible scenarios for parallel extractions from SAP B4Hana ADSOs are presented.
#### 2.1 - Parallel Extraction, Simplest Scenario
This scenario provides the simplest example you can have for a parallel extraction from SAP B4Hana, only using
the property `numPartitions`. The goal of the scenario is to cover the case in which people do not have
much knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can
be used to split the extraction in several tasks. This scenario can also be used if the use case does not
have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of
the extraction to its maximum potential.
In the example below, `"numPartitions": 10` is specified, meaning that Spark will open 10 parallel connections
to the source ADSO and automatically decide how to parallelize the extraction upon that requirement. This is the
only change compared to the example provided in scenario 1.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_b4",
"options": {
"url": "my_sap_b4_url",
"user": "my_user",
"password": "my_sap_b4_pwd",
"dbtable": "my_database.my_table",
"extraction_type": extraction_type,
"latest_timestamp_data_location": "s3://my_path/my_identifier_par_simple/",
"adso_type": "AQ",
"numPartitions": 10,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/my_identifier_par_simple/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.2 - Parallel Extraction, Provide upper_bound (Recommended)
This scenario performs the extraction from the SAP B4 ADSO in parallel, but is more concerned with trying to
optimize and have more control (compared to 2.1 example) on how the extraction is split and performed,
using the following options:
- `numPartitions` - number of Spark partitions to split the extraction.
- `partitionColumn` - column used to split the extraction. It must be a numeric, date, or timestamp.
It should be a column that is able to split the extraction evenly in several tasks. An auto-increment
column is usually a very good candidate.
- `lowerBound` - lower bound to decide the partition stride.
- `upperBound` - upper bound to decide the partition stride.
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as
the `partitionColumn`. If you compare with the previous example, you'll notice that now `numPartitions` and
three additional options are provided to fine tune the extraction (`partitionColumn`, `lowerBound`,
`upperBound`).
When these 4 properties are used, Spark will use them to build several queries to split the extraction.
**Example:** for `"numPartitions": 10`, `"partitionColumn": "record"`, `"lowerBound": 1`, `"upperBound": 100`,
Spark will generate 10 queries like this:
- `SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL`
- `SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20`
- `SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30`
- ...
- `SELECT * FROM dummy_table WHERE RECORD >= 100`
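The sketch below is not the exact Spark internals, only an approximation of the splitting logic, but it illustrates how those four options translate into per-partition queries.
```python
# Rough approximation of how Spark splits the read into per-partition queries.
# Spark's real stride/rounding logic differs slightly; this only illustrates the idea.
def partition_queries(table: str, column: str, num_partitions: int, lower: int, upper: int) -> list:
    stride = max(1, (upper - lower) // num_partitions)
    queries = [f"SELECT * FROM {table} WHERE {column} < {lower + stride} OR {column} IS NULL"]
    for i in range(1, num_partitions - 1):
        start = lower + i * stride
        queries.append(f"SELECT * FROM {table} WHERE {column} >= {start} AND {column} < {start + stride}")
    queries.append(f"SELECT * FROM {table} WHERE {column} >= {lower + (num_partitions - 1) * stride}")
    return queries

for query in partition_queries("dummy_table", "RECORD", num_partitions=10, lower=1, upper=100):
    print(query)
```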
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_b4",
"options": {
"url": "my_sap_b4_url",
"user": "my_user",
"password": "my_b4_hana_pwd",
"dbtable": "my_database.my_table",
"extraction_type": extraction_type,
"latest_timestamp_data_location": "s3://my_path/my_identifier_par_prov_upper/",
"adso_type": "AQ",
"partitionColumn": "RECORD",
"numPartitions": 10,
"lowerBound": 1,
"upperBound": 1000000,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/my_identifier_par_prov_upper/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.3 - Parallel Extraction, Automatic upper_bound (Recommended)
This scenario is very similar to 2.2, the only difference being that **`upperBound`
is not provided**. Instead, the property `calculate_upper_bound` set to true is used to benefit
from the automatic calculation of the `upperBound` (derived from the `partitionColumn`) offered by the
lakehouse-engine framework, which is useful, as in most of the cases you will probably not be aware of
the max value for the column. The only thing you need to consider is that if you use this automatic
calculation of the upperBound you will be doing an initial query to the SAP B4 ADSO to retrieve the max
value for the `partitionColumn`, before doing the actual query to perform the extraction.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_b4",
"calculate_upper_bound": True,
"options": {
"url": "my_sap_b4_url",
"user": "my_user",
"password": "my_b4_hana_pwd",
"dbtable": "my_database.my_table",
"extraction_type": extraction_type,
"latest_timestamp_data_location": "s3://my_path/my_identifier_par_calc_upper/",
"adso_type": "AQ",
"partitionColumn": "RECORD",
"numPartitions": 10,
"lowerBound": 1,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/my_identifier_par_calc_upper/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.4 - Parallel Extraction, Provide Predicates (Recommended)
This scenario performs the extraction from SAP B4 ADSO in parallel, useful in contexts in which there is no
numeric, date or timestamp column to parallelize the extraction (e.g. when extracting from ADSO of Type `CL`,
the active table does not have the `RECORD` column, which is usually a good option for scenarios 2.2 and 2.3):
- `partitionColumn` - column used to split the extraction. It can be of any type.
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as
the `partitionColumn`, especially if that column does not comply with scenario 2.2 or 2.3.
**When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.**
Below, the lakehouse function to generate the predicates list automatically is presented.
This function needs to be used carefully, especially regarding the `predicates_query` and `predicates_add_null` variables.
**predicates_query:** In the sample below the whole table is being considered (`select distinct(x) from table`),
but it is possible to filter the predicates list here, especially if you are applying filters in the transformations spec
and you know the entire table won't be necessary, so you can change it to something like: `select distinct(x)
from table where x > y`.
**predicates_add_null:** You can decide whether or not you want to consider null in the predicates list; by default
this property is `True`.
**Example:** for `"partition_column": "CALMONTH"`
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers
# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
JDBCExtraction,
JDBCExtractionUtils,
)
ExecEnv.get_or_create()
partition_column = "CALMONTH"
dbtable = "my_database.my_table_3"
predicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""
user = "my_user"
password = "my_b4_hana_pwd"
url = "my_sap_b4_url"
predicates_add_null = True
jdbc_util = JDBCExtractionUtils(
JDBCExtraction(
user=user,
password=password,
url=url,
predicates_add_null=predicates_add_null,
partition_column=partition_column,
dbtable=dbtable,
)
)
predicates = jdbc_util.get_predicates(predicates_query)
acon = {
"input_specs": [
{
"spec_id": "my_identifier_2_source",
"read_type": "batch",
"data_format": "sap_b4",
"options": {
"url": "my_sap_b4_url",
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
"dbtable": "my_database.my_table_2",
"changelog_table": "my_database.my_table_3",
"extraction_type": extraction_type,
"latest_timestamp_data_location": "s3://my_path/my_identifier_2_prov_predicates/",
"adso_type": "CL",
"predicates": predicates,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_2_bronze",
"input_id": "my_identifier_2_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/my_identifier_2_prov_predicates/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.5 - Parallel Extraction, Generate Predicates
This scenario is very similar to the scenario 2.4, with the only difference that it automatically
generates the predicates (`"generate_predicates": True`).
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as
the `partitionColumn`, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise
those would probably be recommended).
When this property is used, the lakehouse engine will generate the predicates to be used to extract data from
the source. What the lakehouse engine does is check, for the init/delta portion of the data,
what the distinct values of the `partitionColumn` serving that data are. Then, these values will be used by
Spark to generate several queries to extract from the source in a parallel fashion.
Each distinct value of the `partitionColumn` will be a query, meaning that you will not have control over the
number of partitions used for the extraction. For example, if you face a scenario in which you
are using a `partitionColumn` `LOAD_DATE` and, for today's delta, all the data (let's suppose 2 million rows) is
served by a single `LOAD_DATE = 20200101`, that would mean Spark would use a single partition
to extract everything. In this extreme case you would probably need to change your `partitionColumn`. **Note:**
these extreme cases are less likely to happen when you use the strategy of scenarios 2.2/2.3.
**Example:** for `"partitionColumn": "record"`
Generate predicates:
- `SELECT DISTINCT(RECORD) as RECORD FROM dummy_table`
- `1`
- `2`
- `3`
- ...
- `100`
- Predicates List: ['RECORD=1','RECORD=2','RECORD=3',...,'RECORD=100']
Spark will generate 100 queries like this:
- `SELECT * FROM dummy_table WHERE RECORD = 1`
- `SELECT * FROM dummy_table WHERE RECORD = 2`
- `SELECT * FROM dummy_table WHERE RECORD = 3`
- ...
- `SELECT * FROM dummy_table WHERE RECORD = 100`
Generate predicates will also consider null by default:
- `SELECT * FROM dummy_table WHERE RECORD IS NULL`
To disable this behaviour the following variable value should be changed to false: `"predicates_add_null": False`
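The following small sketch (illustrative values only) shows how the generated predicates map each distinct `partitionColumn` value to its own extraction query.
```python
# Illustrative only: the distinct values would come from "SELECT DISTINCT(RECORD) FROM dummy_table".
distinct_records = [1, 2, 3, 100]
predicates = [f"RECORD={value}" for value in distinct_records]
predicates.append("RECORD IS NULL")  # included by default (predicates_add_null)
queries = [f"SELECT * FROM dummy_table WHERE {predicate}" for predicate in predicates]
for query in queries:
    print(query)
```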
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_2_source",
"read_type": "batch",
"data_format": "sap_b4",
"generate_predicates": True,
"options": {
"url": "my_sap_b4_url",
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
"dbtable": "my_database.my_table_2",
"changelog_table": "my_database.my_table_3",
"extraction_type": extraction_type,
"latest_timestamp_data_location": "s3://my_path/my_identifier_2_gen_predicates/",
"adso_type": "CL",
"partitionColumn": "CALMONTH",
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_2_bronze",
"input_id": "my_identifier_2_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/my_identifier_2_gen_predicates/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/extract_from_sap_bw_dso/__init__.py
================================================
"""
.. include::extract_from_sap_bw_dso.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/extract_from_sap_bw_dso/extract_from_sap_bw_dso.md
================================================
# Extract from SAP BW DSOs
!!! danger "**Parallelization Limitations**"
Parallel extractions **can bring a jdbc source down** if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.
A custom sap_bw reader and a few utils are offered in the lakehouse-engine framework so that data from
SAP BW DSOs can be easily consumed. The framework abstracts all the logic behind the init/delta extractions
(active table, changelog table, activation requests table, how to identify the next delta timestamp...),
only requiring a few parameters that are explained and exemplified in the
[template](#extraction-from-sap-bw-template) scenarios that we have created.
This page also provides a section to help you figure out a good candidate for [partitioning the extraction from SAP BW](#how-can-we-decide-the-partitionColumn).
You can check the code documentation of the reader below:
[**SAP BW Reader**](../../../reference/packages/io/readers/sap_bw_reader.md)
[**JDBC Extractions arguments**](../../../reference/packages/utils/extraction/jdbc_extraction_utils.md#packages.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__)
[**SAP BW Extractions arguments**](../../../reference/packages/utils/extraction/sap_bw_extraction_utils.md#packages.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction.__init__)
!!! note
For extractions using the SAP BW reader, you can use the arguments listed in the SAP BW arguments, but also
the ones listed in the JDBC extractions, as those are inherited as well.
## Extraction from SAP-BW template
This template covers the following scenarios of extractions from the SAP BW DSOs:
- 1 - The Simplest Scenario (Not parallel - Not Recommended)
- 2 - Parallel extraction
- 2.1 - Simplest Scenario
- 2.2 - Provide upperBound (Recommended)
- 2.3 - Automatic upperBound (Recommended)
- 2.4 - Backfilling
- 2.5 - Provide predicates (Recommended)
- 2.6 - Generate predicates (Recommended)
- 3 - Extraction from Write Optimized DSO
- 3.1 - Get initial actrequest_timestamp from Activation Requests Table
!!! note "Introductory Notes"
If you want to have a better understanding about JDBC Spark optimizations,
here you have a few useful links:
- https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
- https://docs.databricks.com/en/connect/external-systems/jdbc.html
- https://bit.ly/3x2eCEm
- https://newbedev.com/how-to-optimize-partitioning-when-migrating-data-from-jdbc-source
### 1 - The Simplest Scenario (Not parallel - Not Recommended)
This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques
and using a single connection to retrieve all the data from the source. It should only be used in case the DSO
you want to extract from SAP BW is a small one, with no big requirements in terms of performance to fulfill.
When extracting from the source DSO, there are two options:
- **Delta Init** - full extraction of the source DSO. You should use it the first time you extract from the
DSO or any time you want to re-extract completely. Similar to a so-called full load.
- **Delta** - extracts the portion of the data that is new or has changed in the source, since the last
extraction (using the max `actrequest_timestamp` value in the location of the data already extracted,
by default).
The example below is composed of two cells.
- The first cell is only responsible for defining the variables `extraction_type` and `write_type`,
depending on the extraction type: **Delta Init** (`LOAD_TYPE = INIT`) or a **Delta** (`LOAD_TYPE = DELTA`).
The variables in this cell will also be referenced by other acons/examples in this notebook, similar to what
you would do in your pipelines/jobs, defining this centrally and then re-using it.
- The second cell is where the acon to be used is defined (using the two variables `extraction_type` and
`write_type` defined before) and the `load_data` algorithm is executed to perform the extraction.
!!! note
There may be cases where you might want to always extract fully from the source DSO. In these cases,
you only need to use a Delta Init every time, meaning you would use `"extraction_type": "init"` and
`"write_type": "overwrite"` as it is shown below. The explanation about what it is a Delta Init/Delta is
applicable for all the scenarios presented in this notebook.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
# You should use this custom reader to benefit from the lakehouse-engine utils for extractions from SAP BW
"data_format": "sap_bw",
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"odsobject": "my_ods_object",
"changelog_table": "my_database.my_changelog_table",
"latest_timestamp_data_location": "s3://my_path/my_identifier/",
"extraction_type": extraction_type,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
### 2 - Parallel extraction
In this section, 6 possible scenarios for parallel extractions from SAP BW DSOs are presented.
#### 2.1 - Parallel Extraction, Simplest Scenario
This scenario provides the simplest example you can have for a parallel extraction from SAP BW, only using
the property `numPartitions`. The goal of the scenario is to cover the case in which people do not have
much knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can
be used to split the extraction in several tasks. This scenario can also be used if the use case does not
have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of
the extraction to its maximum potential.
In the example below, `"numPartitions": 10` is specified, meaning that Spark will open 10 parallel connections
to the source DSO and automatically decide how to parallelize the extraction upon that requirement. This is the
only change compared to the example provided in scenario 1.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_bw",
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"odsobject": "my_ods_object",
"changelog_table": "my_database.my_changelog_table",
"latest_timestamp_data_location": "s3://my_path/my_identifier/",
"extraction_type": extraction_type,
"numPartitions": 10,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.2 - Parallel Extraction, Provide upper_bound (Recommended)
This scenario performs the extraction from the SAP BW DSO in parallel, but is more concerned with trying to
optimize and have more control (compared to 2.1 example) on how the extraction is split and performed, using
the following options:
- `numPartitions` - number of Spark partitions to split the extraction.
- `partitionColumn` - column used to split the extraction. It must be a numeric, date, or timestamp.
It should be a column that is able to split the extraction evenly in several tasks. An auto-increment
column is usually a very good candidate.
- `lowerBound` - lower bound to decide the partition stride.
- `upperBound` - upper bound to decide the partition stride. It can either be **provided (as it is done in
this example)** or derived automatically by our upperBound optimizer (example 2.3).
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as
the `partitionColumn`. If you compare with the previous example, you'll notice that now `numPartitions` and
three additional options are provided to fine tune the extraction (`partitionColumn`, `lowerBound`,
`upperBound`).
When these 4 properties are used, Spark will use them to build several queries to split the extraction.
**Example:** for `"numPartitions": 10`, `"partitionColumn": "record"`, `"lowerBound": 1`, `"upperBound": 100`,
Spark will generate 10 queries like this:
- `SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL`
- `SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20`
- `SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30`
- ...
- `SELECT * FROM dummy_table WHERE RECORD >= 100`
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_bw",
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"odsobject": "my_ods_object",
"changelog_table": "my_database.my_changelog_table",
"latest_timestamp_data_location": "s3://my_path/my_identifier/",
"extraction_type": extraction_type,
"numPartitions": 3,
"partitionColumn": "my_partition_col",
"lowerBound": 1,
"upperBound": 42,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.3 - Parallel Extraction, Automatic upper_bound (Recommended)
This scenario is very similar to 2.2, the only difference being that **upper_bound
is not provided**. Instead, the property `calculate_upper_bound` set to true is used to benefit
from the automatic calculation of the upperBound (derived from the `partitionColumn`) offered by the
lakehouse-engine framework, which is useful, as in most of the cases you will probably not be aware of
the max value for the column. The only thing you need to consider is that if you use this automatic
calculation of the upperBound you will be doing an initial query to the SAP BW DSO to retrieve the max
value for the `partitionColumn`, before doing the actual query to perform the extraction.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_bw",
"calculate_upper_bound": True,
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"odsobject": "my_ods_object",
"changelog_table": "my_database.my_changelog_table",
"latest_timestamp_data_location": "s3://my_path/my_identifier/",
"extraction_type": extraction_type,
"numPartitions": 10,
"partitionColumn": "my_partition_col",
"lowerBound": 1,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.4 - Parallel Extraction, Backfilling
This scenario covers the case, in which you might want to backfill the data extracted from a SAP BW DSO and
made available in the bronze layer. By default, the delta extraction considers the max value of the column
`actrequest_timestamp` on the data already extracted. However, there might be cases, in which you might want
to extract a delta from a particular timestamp onwards or for a particular interval of time. For this, you
can use the properties `min_timestamp` and `max_timestamp`.
Below, a very similar example to the previous one is provided, the only differences being that
the properties `"min_timestamp": "20210910000000"` and `"max_timestamp": "20210913235959"` are now provided,
meaning it will extract the data from the changelog table, using a filter
`actrequest_timestamp > "20210910000000" and actrequest_timestamp <= "20210913235959"`, ignoring whether some of the data is already
available in the destination or not. Moreover, note that the property `latest_timestamp_data_location`
does not need to be provided, as the timestamps to be considered are being directly provided (if both
the timestamps and the `latest_timestamp_data_location` are provided, the latter parameter will have no effect).
Additionally, `"extraction_type": "delta"` and `"write_type": "append"` are forced, instead of using the
variables as in the other examples, because the backfilling scenario only makes sense for delta extractions.
!!! note
Note: be aware that the backfilling example being shown has no mechanism to enforce that
you don't generate duplicated data in bronze. For your scenarios, you can either use this example and solve
any duplication in the silver layer or extract the delta with a merge strategy while writing to bronze,
instead of appending.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_bw",
"calculate_upper_bound": True,
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"odsobject": "my_ods_object",
"changelog_table": "my_database.my_changelog_table",
"extraction_type": "delta",
"numPartitions": 10,
"partitionColumn": "my_partition_col",
"lowerBound": 1,
"min_timestamp": "20210910000000",
"max_timestamp": "20210913235959",
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": "append",
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.5 - Parallel Extraction, Provide Predicates (Recommended)
This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no
numeric, date or timestamp column to parallelize the extraction:
- `partitionColumn` - column used to split the extraction. It can be of any type.
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as
the `partitionColumn`, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise
those would probably be recommended).
**When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.**
Below, the lakehouse function to generate the predicates list automatically is presented.
This function needs to be used carefully, especially regarding the `predicates_query` and `predicates_add_null` variables.
**predicates_query:** In the sample below the whole table is being considered (`select distinct(x) from table`),
but it is possible to filter the predicates list here,
especially if you are applying filters in the transformations spec and you know the entire table won't be necessary, so
you can change it to something like: `select distinct(x) from table where x > y`.
**predicates_add_null:** You can decide whether or not you want to consider null in the predicates list; by default this
property is True.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers
# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
JDBCExtraction,
JDBCExtractionUtils,
)
ExecEnv.get_or_create()
partition_column = "my_partition_column"
dbtable = "my_database.my_table"
predicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""
column_for_predicates = partition_column
user = "my_user"
password = "my_hana_pwd"
url = "my_bw_url"
predicates_add_null = True
jdbc_util = JDBCExtractionUtils(
JDBCExtraction(
user=user,
password=password,
url=url,
dbtable=dbtable,
        partition_column=partition_column,
        predicates_add_null=predicates_add_null,
)
)
predicates = jdbc_util.get_predicates(predicates_query)
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_bw",
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"odsobject": "my_ods_object",
"latest_timestamp_data_location": "s3://my_path/my_identifier/",
"extraction_type": extraction_type,
"predicates": predicates,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.6 - Parallel Extraction, Generate Predicates (Recommended)
This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no
numeric, date or timestamp column to parallelize the extraction:
- `partitionColumn` - column used to split the extraction. It can be of any type.
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as
the `partitionColumn`, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise
those would probably be recommended).
When this property is used, the lakehouse engine will generate the predicates to be used to extract data from
the source. What the lakehouse engine does is check, for the init/delta portion of the data,
what the distinct values of the `partitionColumn` serving that data are. Then, these values will be used by
Spark to generate several queries to extract from the source in a parallel fashion.
Each distinct value of the `partitionColumn` will be a query, meaning that you will not have control over the
number of partitions used for the extraction. For example, if you face a scenario in which you
are using a `partitionColumn` `LOAD_DATE` and, for today's delta, all the data (let's suppose 2 million rows) is
served by a single `LOAD_DATE = 20200101`, that would mean Spark would use a single partition
to extract everything. In this extreme case you would probably need to change your `partitionColumn`. **Note:**
these extreme cases are less likely to happen when you use the strategy of scenarios 2.2/2.3.
**Example:** for `"partitionColumn": "record"`
Generate predicates:
- The engine runs `SELECT DISTINCT(RECORD) as RECORD FROM dummy_table`, which returns the distinct values `1`, `2`, `3`, ..., `100`.
- Predicates List: `['RECORD=1','RECORD=2','RECORD=3',...,'RECORD=100']`
Spark will generate 100 queries like this:
- `SELECT * FROM dummy_table WHERE RECORD = 1`
- `SELECT * FROM dummy_table WHERE RECORD = 2`
- `SELECT * FROM dummy_table WHERE RECORD = 3`
- ...
- `SELECT * FROM dummy_table WHERE RECORD = 100`
Generate predicates will also consider null by default:
- `SELECT * FROM dummy_table WHERE RECORD IS NULL`
To disable this behaviour, set `"predicates_add_null": False`.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_bw",
"generate_predicates": True,
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"odsobject": "my_ods_object",
"latest_timestamp_data_location": "s3://my_path/my_identifier/",
"extraction_type": extraction_type,
"partitionColumn": "my_partition_col",
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
### 3 - Extraction from Write Optimized DSOs
This scenario is based on the best practices of the scenario 2.2, but it is ready to extract data from
Write Optimized DSOs, which have the changelog embedded in the active table, instead of having a separate
changelog table. Due to this reason, you need to specify that the `changelog_table` parameter value is equal
to the `dbtable` parameter value.
Moreover, these tables usually already include the changelog technical columns
like `RECORD` and `DATAPAKID`, for example, that the framework adds by default. Thus, you need to specify
`"include_changelog_tech_cols": False` to change this behaviour.
Finally, you also need to specify the name of the column in the table that can be used to join with the
activation requests table to get the timestamp of the several requests/deltas. That column is `"actrequest"`
by default, and here it is overridden with `"request_col_name": "request"`.
```python
from lakehouse_engine.engine import load_data
LOAD_TYPE = "INIT" or "DELTA"
if LOAD_TYPE == "INIT":
extraction_type = "init"
write_type = "overwrite"
else:
extraction_type = "delta"
write_type = "append"
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_bw",
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"changelog_table": "my_database.my_table",
"odsobject": "my_ods_object",
"request_col_name": "request",
"include_changelog_tech_cols": False,
"latest_timestamp_data_location": "s3://my_path/my_identifier/",
"extraction_type": extraction_type,
"numPartitions": 2,
"partitionColumn": "RECORD",
"lowerBound": 1,
"upperBound": 50000,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": write_type,
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 3.1 - Extraction from Write Optimized DSOs, Get ACTREQUEST_TIMESTAMP from Activation Requests Table
By default, the act_request_timestamp has been hardcoded in the init extraction (it either assumes a given
extraction_timestamp or the current timestamp). However, this may cause problems when merging changes in
silver for write optimised DSOs. Therefore, an option was added to retrieve this timestamp from the
activation requests table (act_req_table) instead.
This scenario performs the data extraction from Write Optimized DSOs, forcing the actrequest_timestamp to
assume the value from the activation requests table (timestamp column).
This feature is only available for WODSOs. To use it, you need to specify `"get_timestamp_from_act_request": True`.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "sap_bw",
"options": {
"user": "my_user",
"password": "my_hana_pwd",
"url": "my_sap_bw_url",
"dbtable": "my_database.my_table",
"changelog_table": "my_database.my_table",
"odsobject": "my_ods_object",
"request_col_name": "request",
"include_changelog_tech_cols": False,
"latest_timestamp_data_location": "s3://my_path/my_identifier_ACTREQUEST_TIMESTAMP/",
"extraction_type": "init",
"numPartitions": 2,
"partitionColumn": "RECORD",
"lowerBound": 1,
"upperBound": 50000,
"get_timestamp_from_act_request": True,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": "overwrite",
"data_format": "delta",
"partitions": ["actrequest_timestamp"],
"location": "s3://my_path/my_identifier_ACTREQUEST_TIMESTAMP",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
## How can we decide the partitionColumn?
**Compatible partitionColumn for upperBound/lowerBound Spark options:**
It needs to be **int, date, timestamp** → https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
**If you don't have any column to partition on those formats, you can use predicates to partition the table** → https://docs.databricks.com/en/connect/external-systems/jdbc.html#manage-parallelism
One of the most important parameters to optimise the extraction is the **partitionColumn**, as you can see in the template. Thus, this section helps you figure out if a column is a good candidate or not.
Basically the partition column needs to be a column which is able to adequately split the processing, which means we can use it to "create" different queries with intervals/filters, so that the Spark tasks process similar amounts of rows/volume. Usually a good candidate is an integer auto-increment technical column.
!!! note
Although RECORD is usually a good candidate, it is usually available on the changelog table only. Meaning that you would need to use a different strategy for the init. In case you don't have good candidates for partitionColumn, you can use the sample acon provided in the **scenario 2.1** in the template above. It might make sense to use **scenario 2.1** for the init and then **scenario 2.2 or 2.3** for the subsequent deltas.
**When there is no int, date or timestamp good candidate for partitionColumn:**
In this case you can opt for the **scenario 2.6 - Generate Predicates**, which supports any kind of column to be defined as the **partitionColumn**.
However, you should still analyse whether the column you have in mind is a good candidate or not. In this scenario, Spark will create one query per distinct value of the **partitionColumn**, so you can perform some analysis, as in the sketch below.
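The snippet below is an illustrative, standalone PySpark sketch (not part of the lakehouse engine) for profiling a candidate `partitionColumn`. The database, table and column names are placeholders; the idea is simply to check how many distinct values exist and how evenly the rows are spread across them.
```python
# Illustrative only: profile a candidate partitionColumn before relying on Generate Predicates.
# "my_bronze_database.my_table" and "my_candidate_col" are hypothetical placeholders; any
# readable copy/sample of the source data works for this analysis.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

candidate_col = "my_candidate_col"
df = spark.table("my_bronze_database.my_table")

profile = (
    df.groupBy(candidate_col)
    .agg(F.count(F.lit(1)).alias("rows_per_value"))
    .orderBy(F.col("rows_per_value").desc())
)

# A good candidate has many distinct values with similar row counts per value;
# a single dominant value (heavy skew) means one Spark task would do most of the work.
print("distinct values:", profile.count())
profile.show(20, truncate=False)
```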
================================================
FILE: lakehouse_engine_usage/data_loader/extract_from_sftp/__init__.py
================================================
"""
.. include::extract_from_sftp.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/extract_from_sftp/extract_from_sftp.md
================================================
# Extract from SFTP
Secure File Transfer Protocol (SFTP), also known as SSH File Transfer Protocol, is a protocol for transferring files securely over a network.
This feature is available in the Lakehouse Engine with the purpose of having a mechanism to read data directly from SFTP directories without moving those files manually/physically to an S3 bucket.
The engine uses Pandas to read the files and converts them into a Spark dataframe, which makes the usual resources of an ACON usable, such as `dq_specs`, `output_specs`, `terminator_specs` and `transform_specs`.
Furthermore, this feature provides several filters on the directories that make it easier to control the extractions.
#### **Introductory Notes**:
There are important parameters that must be added to **input specs** in order to make the SFTP extraction work properly:
!!! note "**Read type**"
The engine supports only **BATCH** mode for this feature.
**sftp_files_format** - File format that will be used to read data from SFTP. **The engine supports: CSV, FWF, JSON and XML**.
**location** - The SFTP directory to be extracted. If it is necessary to filter a specific file, it can be made using the `file_name_contains` option.
**options** - Arguments used to set the Paramiko SSH client connection (hostname, username, password, port...), set the filter to retrieve files and set the file parameters (separators, headers, cols...). For more information about the file parameters, please go to the Pandas link in the useful links section.
The options allowed are:
| Property type | Detail | Example | Comment |
|-------------------------------|--------------------------|------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Connection | add_auto_policy (str) | true or false | Indicates whether to allow an SFTP connection using no host key. When a connection attempt is made using no host key, the engine will throw an exception if the add_auto_policy property is false. The purpose of this flag is to make the user consciously choose a less secure connection. |
| Connection | key_type (str) | "Ed25519" or "RSA" | Indicates the key type to be used for the connection (SSH, Ed25519). |
| Connection | key_filename (str) | "/path/to/private_key/private_key.ppk" | The filename, or list of filenames, of optional private key(s) and/or certs to try for authentication. It must be used with a pkey in order to add a policy. If a pkey is not provided, then use `add_auto_policy`. |
| Connection | pkey (str) | "AAAAC3MidD1lVBI1NTE5AAAAIKssLqd6hjahPi9FBH4GPDqMqwxOMsfxTgowqDCQAeX+" | Value to use for the host key when connecting to the remote SFTP server. |
| Filter | date_time_gt (str) | "1900-01-01" or "1900-01-01 08:59:59" | Filter the files greater than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" |
| Filter | date_time_lt (str) | "3999-12-31" or "3999-12-31 20:59:59" | Filter the files lower than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" |
| Filter | earliest_file (bool) | true or false | Filter the earliest dated file in the directory. |
| Filter | file_name_contains (str) | "part_of_filename" | Filter files that match the pattern. |
| Filter | latest_file (bool) | true or false | Filter the most recent dated file in the directory. |
| Read data from subdirectories | sub_dir (bool) | true or false | The engine will search for files in subdirectories of the **location**. It will consider one level below the root location given. When `sub_dir` is used with the **latest_file/earliest_file** argument, the engine will retrieve the latest/earliest file for each subdirectory. |
| Add metadata info | file_metadata (bool) | true or false | When this option is set to True, the dataframe retrieves the **filename with location** and the **modification_time** from the original files in SFTP, attaching these two columns to the respective records. |
**Useful Info & Links**:
1. [Paramiko SSH Client](https://docs.paramiko.org/en/latest/api/client.html)
2. [Pandas documentation](https://pandas.pydata.org/docs/reference/io.html)
## Scenario 1
The scenario below shows the extraction of a CSV file using most of the available filter options. Also, as an example, the column "created_on" is created in the transform_specs in order to store the processing date for every record. As a result, the output table will contain both the original file date (provided by the option `file_metadata`) and the processing date from the engine.
For an incremental load approach, it is advised to use the "modification_time" column created by the option `file_metadata`. Since it holds the original file modification date, it can be used in the logic to control what is new or has changed recently (see the sketch after the example below).
!!! note
The scenario below uses **"add_auto_policy": true**, which is **not recommended**.
```python
from datetime import datetime
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "sftp_source",
"read_type": "batch",
"data_format": "sftp",
"sftp_files_format": "csv",
"location": "my_sftp_data_path",
"options": {
"hostname": "my_sftp_hostname",
"username": "my_sftp_username",
"password": "my_sftp_password",
"port": "my_port",
"add_auto_policy": True,
"file_name_contains": "test_pattern",
"args": {"sep": "|"},
"latest_file": True,
"file_metadata": True
}
},
],
"transform_specs": [
{
"spec_id": "sftp_transformations",
"input_id": "sftp_source",
"transformers": [
{
"function": "with_literals",
"args": {"literals": {"created_on": datetime.now()}},
},
],
},
],
"output_specs": [
{
"spec_id": "sftp_bronze",
"input_id": "sftp_transformations",
"write_type": "append",
"data_format": "delta",
"location": "s3://my_path/dummy_table"
}
]
}
load_data(acon=acon)
```
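Building on the incremental approach mentioned above, the fragment below is a hedged sketch (placeholder spec ids, plus an assumed extra input spec reading the already loaded bronze table) of how the `get_max_value` and `incremental_filter` transformers, shown in the JDBC templates of this documentation, could be applied to the `modification_time` column created by `file_metadata`:
```python
# Sketch only: incremental SFTP load based on the "modification_time" column from file_metadata.
# It reuses the get_max_value/incremental_filter transformers shown elsewhere in this documentation;
# "sftp_bronze_current" is an assumed extra input spec reading the already loaded bronze table.
transform_specs = [
    {
        "spec_id": "max_sftp_bronze_modification_time",
        "input_id": "sftp_bronze_current",
        "transformers": [
            {"function": "get_max_value", "args": {"input_col": "modification_time"}}
        ],
    },
    {
        "spec_id": "new_sftp_files_only",
        "input_id": "sftp_source",
        "transformers": [
            {
                "function": "incremental_filter",
                "args": {
                    "input_col": "modification_time",
                    "increment_df": "max_sftp_bronze_modification_time",
                },
            }
        ],
    },
]
```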
## Scenario 2
The following scenario shows the extraction of a JSON file using RSA pkey authentication instead of `add_auto_policy`. The engine supports Ed25519Key and RSA for pkeys.
For the pkey file location, it is important to have the file in a location accessible by the cluster. This can be achieved either by mounting the location or with volumes.
!!! note
This scenario uses a more secure authentication, thus it is recommended over the previous scenario.
```python
from datetime import datetime
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "sftp_source",
"read_type": "batch",
"data_format": "sftp",
"sftp_files_format": "json",
"location": "my_sftp_data_path",
"options": {
"hostname": "my_sftp_hostname",
"username": "my_sftp_username",
"password": "my_sftp_password",
"port": "my_port",
"key_type": "RSA",
"key_filename": "dbfs_mount_location/my_file_key.ppk",
"pkey": "my_key",
"latest_file": True,
"file_metadata": True,
"args": {"lines": True, "orient": "columns"},
},
},
],
"transform_specs": [
{
"spec_id": "sftp_transformations",
"input_id": "sftp_source",
"transformers": [
{
"function": "with_literals",
"args": {"literals": {"lh_created_on": datetime.now()}},
},
],
},
],
"output_specs": [
{
"spec_id": "sftp_bronze",
"input_id": "sftp_transformations",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my_path/dummy_table"
}
]
}
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/extract_using_jdbc_connection/__init__.py
================================================
"""
.. include::extract_using_jdbc_connection.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/extract_using_jdbc_connection/extract_using_jdbc_connection.md
================================================
# Extract using JDBC connection
!!! danger "**SAP Extraction**"
SAP is only used as an example to demonstrate how we can use a JDBC connection to extract data.
**If you are looking to extract data from SAP, please use our sap_b4 or sap_bw reader.**
You can find the **sap_b4 reader** documentation here: [Extract from SAP B4 ADSOs](../../data_loader/extract_from_sap_b4_adso/extract_from_sap_b4_adso.md) and the **sap_bw reader** documentation here: [Extract from SAP BW DSOs](../../data_loader/extract_from_sap_bw_dso/extract_from_sap_bw_dso.md)
!!! danger "**Parallel Extraction**"
Parallel extractions **can bring a jdbc source down** if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.
## Introduction
Many databases allow a JDBC connection to extract data. Our engine has one reader where you can configure all the necessary definitions to connect to a database using JDBC.
In the next section you will find several examples of how to do it.
## The Simplest Scenario using sqlite
!!! warning "Not parallel"
Recommended for smaller datasets only, or when stressing the source system is a high concern
This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source.
Here we use a sqlite database where any connection is allowed. Due to that, we do not specify any username or password.
As with Spark, we provide two different ways to run the JDBC reader.
1 - We can use the **jdbc() function**, passing all the arguments needed for Spark to work, and we can even combine this with additional options passed through .options().
2 - The other way is using **.format("jdbc")** and passing all necessary arguments through .options(). Note that by choosing jdbc() we can also add options() to the execution.
**You can find and run the following code in our local test for the engine.**
### jdbc() function
As we can see in the next cell, all the arguments necessary to establish the JDBC connection are passed inside the `jdbc_args` object. Here we find the url, the table, and the driver. Besides that, we can add options, such as the number of partitions, which will impact the queries' parallelism.
The code below is an example of how to use the jdbc() function in our ACON.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/jdbc_reader/jdbc_function/correct_arguments/batch_init.json!}
```
This is the same as using the following code in PySpark:
```python
spark.read.option("numPartitions", 1).jdbc(
    url="jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_function/correct_arguments/tests.db",
    table="jdbc_function",
    properties={"driver": "org.sqlite.JDBC"},
)
```
### .format("jdbc")
In this example we do not use the `jdbc_args` object. All the JDBC connection parameters are provided through the options dictionary instead.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/jdbc_reader/jdbc_format/correct_arguments/batch_init.json!}
```
This is the same as using the following code in PySpark:
```python
spark.read.format("jdbc")
.option("url", "jdbc:sqlite:/app/tests/lakehouse/in/feature/jdbc_reader/jdbc_format/correct_arguments/tests.db")
.option("driver", "org.sqlite.JDBC")
.option("dbtable", "jdbc_format")
.option("numPartitions", 1)
```
## Template with more complete and runnable examples
In this template we will use **SAP as an example** to provide more complete and runnable examples.
These definitions can be used with several databases that allow a JDBC connection.
The following scenarios of extractions are covered:
- 1 - The Simplest Scenario (Not parallel - Recommended for smaller datasets only,
or when stressing the source system is a high concern)
- 2 - Parallel extraction
- 2.1 - Simplest Scenario
- 2.2 - Provide upperBound (Recommended)
- 2.3 - Provide predicates (Recommended)
!!! note "Disclaimer"
This template only uses **SAP as demonstration example for JDBC connection.**
**This isn't a SAP template!!!**
**If you are looking to extract data from SAP, please use our sap_b4 reader or the sap_bw reader.**
The JDBC connection has 2 main sections to be filled, the **jdbc_args** and **options**:
- jdbc_args - Here you need to fill in everything related to the JDBC connection itself, like table/query, url, user,
..., password.
- options - This section is more flexible, and you can provide additional options like "fetchSize", "batchSize",
"numPartitions", ..., "upperBound" and "lowerBound".
If you want to know more regarding jdbc spark options you can follow the link below:
- https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html
If you want to have a better understanding about JDBC Spark optimizations, you can find them in the following:
- https://docs.databricks.com/en/connect/external-systems/jdbc.html
- https://stackoverflow.com/questions/41085238/what-is-the-meaning-of-partitioncolumn-lowerbound-upperbound-numpartitions-pa
- https://newbedev.com/how-to-optimize-partitioning-when-migrating-data-from-jdbc-source
### 1 - The Simplest Scenario (Not parallel - Recommended for smaller datasets, or for not stressing the source)
This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques
and using a single connection to retrieve all the data from the source. It should only be used when the data
you want to extract is small and there are no big performance requirements to fulfill.
When extracting from the source, we can have two options:
- **Delta Init** - full extraction of the source. You should use it the first time you extract from the
source or any time you want to re-extract completely. Similar to a so-called full load.
- **Delta** - extracts the portion of the data that is new or has changed in the source since the last
extraction (for that, the logic at the transformation step needs to be applied). In the examples below,
the logic uses the REQTSN column, which means that the maximum value already present on bronze is computed
and used to filter the incoming data from the data source.
##### Init - Load data into the Bronze Bucket
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "my_sap_b4_url",
"table": "my_database.my_table",
"properties": {
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
},
},
"options": {
"fetchSize": 100000,
"compress": True,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": "overwrite",
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
##### Delta - Load data into the Bronze Bucket
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "my_jdbc_url",
"table": "my_database.my_table",
"properties": {
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
},
},
"options": {
"fetchSize": 100000,
"compress": True,
},
},
{
"spec_id": "my_identifier_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",
},
],
"transform_specs": [
{
"spec_id": "max_my_identifier_bronze_date",
"input_id": "my_identifier_bronze",
"transformers": [{"function": "get_max_value", "args": {"input_col": "REQTSN"}}],
},
{
"spec_id": "appended_my_identifier",
"input_id": "my_identifier_source",
"transformers": [
{
"function": "incremental_filter",
"args": {"input_col": "REQTSN", "increment_df": "max_my_identifier_bronze_date"},
}
],
},
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "appended_my_identifier",
"write_type": "append",
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/jdbc_template/no_parallel/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
### 2 - Parallel extraction
In this section we present 3 possible scenarios for parallel extractions from JDBC sources.
!!! note "Disclaimer for parallel extraction"
Parallel extractions can bring a jdbc source down if a lot of stress
is put on the system. **Be careful when choosing the number of partitions.
Spark is a distributed system and can lead to many connections.**
#### 2.1 - Parallel Extraction, Simplest Scenario
This scenario provides the simplest example you can have for a parallel extraction from JDBC sources, only using
the property `numPartitions`. The goal of the scenario is to cover the case in which people do not have
much experience around how to optimize the extraction from JDBC sources or cannot identify a column that can
be used to split the extraction in several tasks. This scenario can also be used if the use case does not
have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of
the extraction to its maximum potential.
In the example below, `"numPartitions": 10` is specified, meaning that Spark will open 10 parallel connections
to the source and automatically decide how to parallelize the extraction upon that requirement. This is the
only change compared to the example provided in scenario 1.
##### Delta Init - Load data into the Bronze Bucket
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "my_sap_b4_url",
"table": "my_database.my_table",
"properties": {
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
},
},
"options": {
"fetchSize": 100000,
"compress": True,
"numPartitions": 10,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": "overwrite",
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
##### Delta - Load data into the Bronze Bucket
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "my_sap_b4_url",
"table": "my_database.my_table",
"properties": {
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
},
},
"options": {
"fetchSize": 100000,
"compress": True,
"numPartitions": 10,
},
},
{
"spec_id": "my_identifier_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",
},
],
"transform_specs": [
{
"spec_id": "max_my_identifier_bronze_date",
"input_id": "my_identifier_bronze",
"transformers": [{"function": "get_max_value", "args": {"input_col": "REQTSN"}}],
},
{
"spec_id": "appended_my_identifier",
"input_id": "my_identifier_source",
"transformers": [
{
"function": "incremental_filter",
"args": {"input_col": "REQTSN", "increment_df": "max_my_identifier_bronze_date"},
}
],
},
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "appended_my_identifier",
"write_type": "append",
"data_format": "delta",
"partitions": ["REQTSN"],
"location": "s3://my_path/jdbc_template/parallel_1/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.2 - Parallel Extraction, Provide upper_bound (Recommended)
This scenario performs the extraction from the JDBC source in parallel, but puts more effort into optimising
and controlling how the extraction is split and performed (compared to the 2.1 example),
using the following options:
- `numPartitions` - number of Spark partitions to split the extraction.
- `partitionColumn` - column used to split the extraction. It must be a numeric, date, or timestamp.
It should be a column that is able to split the extraction evenly in several tasks. An auto-increment
column is usually a very good candidate.
- `lowerBound` - lower bound to decide the partition stride.
- `upperBound` - upper bound to decide the partition stride.
This is an adequate example to be followed if there is a column in the data source that is good to
be used as the `partitionColumn`. Comparing with the previous example,
the `numPartitions` and three additional options to fine tune the extraction (`partitionColumn`, `lowerBound`,
`upperBound`) are provided.
When these 4 properties are used, Spark will use them to build several queries to split the extraction.
**Example:** for `"numPartitions": 10`, `"partitionColumn": "record"`, `"lowerBound": 1`, `"upperBound": 100`,
Spark will generate 10 queries like the ones below (see the sketch after this list for a simplified view of how the boundaries are derived):
- `SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL`
- `SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20`
- `SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30`
- ...
- `SELECT * FROM dummy_table WHERE RECORD >= 100`
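As a hedged illustration (this approximates, rather than reproduces, Spark's internal JDBC stride logic), the standalone Python sketch below shows how `lowerBound`, `upperBound` and `numPartitions` could translate into per-partition WHERE clauses like the ones above:
```python
# Simplified illustration of how lowerBound/upperBound/numPartitions become per-partition
# WHERE clauses. This only approximates Spark's JDBC stride computation; the real logic
# lives inside Spark itself.
def build_partition_filters(column: str, lower: int, upper: int, num_partitions: int) -> list:
    stride = max((upper - lower) // num_partitions, 1)
    filters = []
    current = lower
    for i in range(num_partitions):
        if i == 0:
            # First partition also catches NULLs and everything below the first boundary.
            filters.append(f"{column} < {current + stride} OR {column} IS NULL")
        elif i == num_partitions - 1:
            # Last partition is unbounded above.
            filters.append(f"{column} >= {current}")
        else:
            filters.append(f"{column} >= {current} AND {column} < {current + stride}")
        current += stride
    return filters


for clause in build_partition_filters("RECORD", 1, 100, 10):
    print(f"SELECT * FROM dummy_table WHERE {clause}")
```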
##### Init - Load data into the Bronze Bucket
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "my_sap_b4_url",
"table": "my_database.my_table",
"properties": {
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
},
},
"options": {
"partitionColumn": "RECORD",
"numPartitions": 10,
"lowerBound": 1,
"upperBound": 2000,
"fetchSize": 100000,
"compress": True,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": "overwrite",
"data_format": "delta",
"partitions": ["RECORD"],
"location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
##### Delta - Load data into the Bronze Bucket
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "my_sap_b4_url",
"table": "my_database.my_table",
"properties": {
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
},
},
"options": {
"partitionColumn": "RECORD",
"numPartitions": 10,
"lowerBound": 1,
"upperBound": 2000,
"fetchSize": 100000,
"compress": True,
},
},
{
"spec_id": "my_identifier_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",
},
],
"transform_specs": [
{
"spec_id": "max_my_identifier_bronze_date",
"input_id": "my_identifier_bronze",
"transformers": [{"function": "get_max_value", "args": {"input_col": "RECORD"}}],
},
{
"spec_id": "appended_my_identifier",
"input_id": "my_identifier_source",
"transformers": [
{
"function": "incremental_filter",
"args": {"input_col": "RECORD", "increment_df": "max_my_identifier_bronze_date"},
}
],
},
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "appended_my_identifier",
"write_type": "append",
"data_format": "delta",
"partitions": ["RECORD"],
"location": "s3://my_path/jdbc_template/parallel_2/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
#### 2.3 - Parallel Extraction with Predicates (Recommended)
This scenario performs the extraction from JDBC source in parallel, useful in contexts where there aren't
numeric, date or timestamp columns to parallelize the extraction:
- `partitionColumn` - column used to split the extraction (can be of any type).
- This is an adequate example to be followed if there is a column in the data source that is good to be
used as the `partitionColumn`, especially if that column does not comply with the scenario 2.2.
**When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.**
Below, a lakehouse engine function to generate the predicates list automatically is presented.
**When using this function, one needs to be careful especially with the predicates_query and predicates_add_null variables.**
**predicates_query:** In the sample below the whole table (`select distinct(x) from table`) is considered,
but it is possible to filter the predicates list here, especially if you are applying a filter in the
transformations spec and you know the entire table won't be necessary. In that case, you can change it to
something like: `select distinct(x) from table where x > y`.
**predicates_add_null:** You can decide whether or not to include null in the predicates list. By default, this property is True.
**Example:** for `"partitionColumn": "record"`
##### Init - Load data into the Bronze Bucket
```python
from lakehouse_engine.engine import load_data
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
JDBCExtraction,
JDBCExtractionUtils,
)
ExecEnv.get_or_create()
partitionColumn = "my_partition_col"
dbtable = "my_database.my_table"
predicates_query = f"""(SELECT DISTINCT({partitionColumn}) FROM {dbtable})"""
column_for_predicates = partitionColumn
user = "my_user"
password = "my_b4_hana_pwd"
url = "my_sap_b4_url"
driver = "com.sap.db.jdbc.Driver"
predicates_add_null = True
jdbc_util = JDBCExtractionUtils(
JDBCExtraction(
user=user,
password=password,
url=url,
predicates_add_null=predicates_add_null,
partition_column=partitionColumn,
dbtable=dbtable,
)
)
predicates = jdbc_util.get_predicates(predicates_query)
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "my_sap_b4_url",
"table": "my_database.my_table",
"predicates": predicates,
"properties": {
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
},
},
"options": {
"fetchSize": 100000,
"compress": True,
},
}
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "my_identifier_source",
"write_type": "overwrite",
"data_format": "delta",
"partitions": ["RECORD"],
"location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
##### Delta - Load data into the Bronze Bucket
```python
from lakehouse_engine.engine import load_data
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
JDBCExtraction,
JDBCExtractionUtils,
)
ExecEnv.get_or_create()
partitionColumn = "my_partition_col"
dbtable = "my_database.my_table"
predicates_query = f"""(SELECT DISTINCT({partitionColumn}) FROM {dbtable})"""
column_for_predicates = partitionColumn
user = "my_user"
password = "my_b4_hana_pwd"
url = "my_sap_b4_url"
driver = "com.sap.db.jdbc.Driver"
predicates_add_null = True
jdbc_util = JDBCExtractionUtils(
JDBCExtraction(
user=user,
password=password,
url=url,
predicates_add_null=predicates_add_null,
partition_column=partitionColumn,
dbtable=dbtable,
)
)
predicates = jdbc_util.get_predicates(predicates_query)
acon = {
"input_specs": [
{
"spec_id": "my_identifier_source",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "my_sap_b4_url",
"table": "my_database.my_table",
"predicates": predicates,
"properties": {
"user": "my_user",
"password": "my_b4_hana_pwd",
"driver": "com.sap.db.jdbc.Driver",
},
},
"options": {
"fetchSize": 100000,
"compress": True,
},
},
{
"spec_id": "my_identifier_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",
},
],
"transform_specs": [
{
"spec_id": "max_my_identifier_bronze_date",
"input_id": "my_identifier_bronze",
"transformers": [{"function": "get_max_value", "args": {"input_col": "RECORD"}}],
},
{
"spec_id": "appended_my_identifier",
"input_id": "my_identifier_source",
"transformers": [
{
"function": "incremental_filter",
"args": {"input_col": "RECORD", "increment_df": "max_my_identifier_bronze_date"},
}
],
},
],
"output_specs": [
{
"spec_id": "my_identifier_bronze",
"input_id": "appended_my_identifier",
"write_type": "append",
"data_format": "delta",
"partitions": ["RECORD"],
"location": "s3://my_path/jdbc_template/parallel_3/my_identifier/",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/filtered_full_load/__init__.py
================================================
"""
.. include::filtered_full_load.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/filtered_full_load/filtered_full_load.md
================================================
# Filtered Full Load
This scenario is very similar to the [full load](../full_load/full_load.md), but it filters the data coming from the source, instead of doing a complete full load.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/full_load/with_filter/batch.json!}
```
##### Relevant notes:
* As seen in the ACON, the filtering capabilities are provided by a transformer called `expression_filter`, where you can provide a custom Spark SQL filter.
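As a hedged sketch of such a transformer entry (the transformer name `expression_filter` comes from the note above, but the argument key `exp`, the spec ids and the filter expression are assumptions to be checked against the transformers reference), it could look like:
```python
# Hedged sketch only: "expression_filter" is the transformer mentioned above, but the argument
# key ("exp"), spec ids and filter expression are assumptions, not confirmed engine syntax.
transform_specs = [
    {
        "spec_id": "filtered_data",
        "input_id": "my_source",
        "transformers": [
            {
                "function": "expression_filter",
                "args": {"exp": "year(order_date) = 2023 AND customer = 'my_customer'"},
            }
        ],
    }
]
```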
================================================
FILE: lakehouse_engine_usage/data_loader/filtered_full_load_with_selective_replace/__init__.py
================================================
"""
.. include::filtered_full_load_with_selective_replace.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/filtered_full_load_with_selective_replace/filtered_full_load_with_selective_replace.md
================================================
# Filtered Full Load with Selective Replace
This scenario is very similar to the [Filtered Full Load](../filtered_full_load/filtered_full_load.md), but we only replace a subset of the partitions, leaving the other ones untouched, so we don't replace the entire table. This capability is very useful for backfilling scenarios.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/full_load/with_filter_partition_overwrite/batch.json!}
```
##### Relevant notes:
* The key option for this scenario in the ACON is the `replaceWhere`, which we use to only overwrite a specific period of time, that realistically can match a subset of all the partitions of the table. Therefore, this capability is very useful for backfilling scenarios.
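To make the behaviour more concrete, the snippet below is a plain PySpark sketch of what a selective replace conceptually does under the hood (this is not the engine's ACON syntax; the data, date range and location are placeholders):
```python
# Plain PySpark/Delta sketch (not the ACON syntax) of a selective replace: only the data
# covered by the replaceWhere predicate is overwritten, the rest of the table is untouched.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Placeholder data standing in for the filtered source of the ACON.
df = spark.createDataFrame([("2023-01-15", 10)], "date STRING, amount INT")

(
    df.write.format("delta")
    .mode("overwrite")
    .option("replaceWhere", "date >= '2023-01-01' AND date < '2023-02-01'")
    .save("s3://my_path/my_table/")
)
```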
================================================
FILE: lakehouse_engine_usage/data_loader/flatten_schema_and_explode_columns/__init__.py
================================================
"""
.. include::flatten_schema_and_explode_columns.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/flatten_schema_and_explode_columns/flatten_schema_and_explode_columns.md
================================================
# Flatten Schema and Explode Columns
Related to schema, we can perform two kinds of operations (a plain PySpark sketch of what they conceptually do follows this list):
* **Flatten Schema**: transformer named "flatten_schema", used to flatten the schema of the dataframe.
    * Parameters to be defined:
        * max_level: 2 => sets the level until which you want to flatten the schema.
        * shorten_names: True => flag to shorten the names of the prefixes of the fields.
        * alias: True => flag to define a prefix for the column to be flattened.
        * num_chars: 7 => sets the number of characters to consider when shortening the names of the fields.
        * ignore_cols: list value to specify the columns you don't want to flatten.
* **Explode Columns**: transformer named "explode_columns", used to explode columns with types ArrayType and MapType.
    * Parameters to be defined:
        * explode_arrays: True => flag to explode all array columns present in the dataframe.
        * array_cols_to_explode: ["sample_col"] => list value to specify the array columns you want to explode.
        * explode_maps: True => flag to explode all map columns present in the dataframe.
        * map_cols_to_explode: ["map_col"] => list value to specify the map columns you want to explode.
    * Recommendation: use array_cols_to_explode and map_cols_to_explode to specify the columns you want to explode, instead of exploding all of them.
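The following is a minimal plain-PySpark sketch, not the engine's implementation, illustrating what flattening a struct and exploding an array conceptually do (the data and column names are made up):
```python
# Plain PySpark sketch (not the lakehouse engine implementation) of the effect of
# "flatten_schema" and "explode_columns" on a nested struct with an array field.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(1, ("shirt", ["summer", "sale"]))],
    "id INT, product STRUCT<name: STRING, tags: ARRAY<STRING>>",
)

# "Flatten": promote nested struct fields to top-level columns (one level deep here).
flat = df.select(
    "id",
    F.col("product.name").alias("product_name"),
    F.col("product.tags").alias("product_tags"),
)

# "Explode": one output row per element of the array column.
exploded = flat.withColumn("product_tag", F.explode("product_tags")).drop("product_tags")
exploded.show()
```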
The scenario of **flatten_schema** below transforms one or more columns, splitting the nested content into more columns, as desired. We define the number of levels we want to flatten in the schema, regarding the nested values. In this case, we are just setting a `max_level` of `2`.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json!}
```
The scenario of **explode_arrays** transforms the array columns into one or more rows, depending on the number of elements, i.e., it replicates the row for each array value. In this case we are exploding all array columns, setting `explode_arrays` to `true`.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/transformations/column_reshapers/explode_arrays/batch.json!}
```
The scenario of **flatten_and_explode_arrays_and_maps** uses `flatten_schema` and `explode_columns` to achieve the desired output. In this case, the desired output is to flatten the whole schema and explode maps and arrays, even when an array is inside a struct. Steps:
1. In this case, we have an array column inside a struct column, so first we need to use the `flatten_schema` transformer to extract the columns inside that struct;
2. Then, we are able to explode all the desired array and map columns, using the `explode_columns` transformer;
3. To be able to split the map column into 2 columns, we use the `flatten_schema` transformer again.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/batch.json!}
```
================================================
FILE: lakehouse_engine_usage/data_loader/full_load/__init__.py
================================================
"""
.. include::full_load.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/full_load/full_load.md
================================================
# Full Load
This scenario reads CSV data from a path and writes in full to another path with delta lake files.
##### Relevant notes
- This ACON infers the schema automatically through the option `inferSchema` (we use it for local tests only). This is usually not a best practice when using CSV files, and you should instead provide a schema through the InputSpec variables `schema_path`, `read_schema_from_table` or `schema` (see the sketch after these notes).
- The `transform_specs` in this case are purely optional, and we basically use the repartition transformer to create one partition per combination of date and customer. This does not mean you have to use this in your algorithm.
- A full load is also adequate for an init load (initial load).
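One possible way to obtain such a schema, sketched below under the assumption that a Spark StructType serialized as JSON is an acceptable input for `schema_path` (confirm against the InputSpec reference), is to let Spark infer it once from a sample and persist the result:
```python
# Sketch: derive a schema file once from a sample of the CSV instead of relying on inferSchema
# on every run. Paths are placeholders, and whether schema_path accepts this exact JSON layout
# should be confirmed in the InputSpec reference.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

sample_df = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .csv("s3://my_path/sample_data/")
)

with open("/tmp/my_table_schema.json", "w") as schema_file:
    schema_file.write(sample_df.schema.json())  # Spark StructType serialized as JSON
```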
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/full_load/full_overwrite/batch.json!}
```
================================================
FILE: lakehouse_engine_usage/data_loader/read_from_dataframe/__init__.py
================================================
"""
.. include::read_from_dataframe.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/read_from_dataframe/read_from_dataframe.md
================================================
# Read from Dataframe
!!! danger
Don't use this feature if the Lakehouse Engine already has a supported data format for your use case, as in that case it is preferred to use the dedicated data formats which are more extensively tested and predictable. Check the supported data formats [here](../../../reference/packages/core/definitions.md#packages.core.definitions.InputFormat).
Reading from a Spark DataFrame is very simple using our framework. You just need to define the input_specs as follows:
```python
{
"input_spec": {
"spec_id": "my_df",
"read_type": "batch",
"data_format": "dataframe",
"df_name": df,
}
}
```
!!! note "**Why is it relevant?**"
With this capability of reading a dataframe you can deal with sources that do not yet officially have a reader (e.g., REST API, XML files, etc.).
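For instance, the sketch below (placeholder API URL, payload shape and output location; assuming the response is a flat list of records) builds a DataFrame from a REST API response and pushes it through a regular ACON using the dataframe input spec shown above:
```python
# Sketch: reading from an in-memory DataFrame built from a REST API response.
# The API URL, payload shape and output location are placeholders.
import requests
from pyspark.sql import SparkSession

from lakehouse_engine.engine import load_data

spark = SparkSession.builder.getOrCreate()

payload = requests.get("https://my_api/my_endpoint").json()  # assumed: a list of flat records
df = spark.createDataFrame(payload)

acon = {
    "input_specs": [
        {
            "spec_id": "api_df",
            "read_type": "batch",
            "data_format": "dataframe",
            "df_name": df,
        }
    ],
    "output_specs": [
        {
            "spec_id": "api_bronze",
            "input_id": "api_df",
            "write_type": "append",
            "data_format": "delta",
            "location": "s3://my_path/api_data/",
        }
    ],
}
load_data(acon=acon)
```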
================================================
FILE: lakehouse_engine_usage/data_loader/read_from_sharepoint/__init__.py
================================================
"""
.. include::read_from_sharepoint.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/read_from_sharepoint/read_from_sharepoint.md
================================================
# Read from Sharepoint
There may be scenarios where data products must ingest curated datasets that business teams maintain directly in Sharepoint, for example exports from external systems or manually maintained reference files.
The `SharepointReader` is a specialized reader module designed to load one or more files from a Sharepoint document library into the lakehouse. It abstracts away the complexity of accessing Sharepoint by:
* Resolving the configured Sharepoint site, drive, and document path.
* Downloading the target file or all files matching a configured pattern into a temporary local location.
* Reading the downloaded file(s) into a Spark DataFrame using the configured format and options.
* Optionally combining multiple files into a single DataFrame (for example, unioning all matching CSV files in a folder) and optionally archiving processed files back to Sharepoint (success and error folders).
!!! note
📘 Tip: This reader integrates seamlessly into the lakehouse engine’s input step and can be triggered as part of the ACON-based pipeline, just like any other reader module.
!!! warning
When reading from text-based formats such as CSV, complex data types (arrays, maps, structs) are not preserved in the source file. If your downstream tables expect these types, you must reconstruct them from string columns after ingestion (for example using `from_json` or explicit casts).
### Usage Scenarios
The examples below show how to read data from Sharepoint, ranging from simple single-file reads to more advanced multi-file and large-file scenarios.
1. [Configuration parameters](#1-configuration-parameters)
2. [**Simple:** Read one file from Sharepoint](#2-simple-read-one-file-from-sharepoint)
1. [Minimal configuration](#i-minimal-configuration)
2. [With optional configurations](#ii-with-optional-configurations)
3. [**Complex:** Read multiple files from Sharepoint](#3-complex-read-multiple-files-from-sharepoint)
1. [Read multiple files (standard size)](#i-read-multiple-files-standard-size)
2. [Read multiple large files with `chunk_size` and CSV options](#ii-read-multiple-large-files-with-chunk_size-and-csv-options)
4. [Delimiter handling](#4-delimiter-handling)
5. [Orchestrating multiple Sharepoint reads (loop pattern)](#5-orchestrating-multiple-sharepoint-reads-loop-pattern)
## 1. Configuration parameters
### The mandatory configuration parameters are:
- **client_id** (str): azure client ID application, available at the
Azure Portal -> Azure Active Directory.
- **tenant_id** (str): tenant ID associated with the Sharepoint site, available at the
Azure Portal -> Azure Active Directory.
- **site_name** (str): name of the Sharepoint site where the document library resides.
Sharepoint URL naming convention is: **https://your_company_name.Sharepoint.com/sites/site_name**
- **drive_name** (str): name of the document library where the file will be uploaded.
Sharepoint URL naming convention is: **https://your_company_name.Sharepoint.com/sites/site_name/drive_name**
- **file_name** (str): name of the file to be read from Sharepoint when
performing a **single-file** read.
- In multi-file scenarios, `file_pattern` is typically used instead
(see examples below).
- **secret** (str): client secret for authentication, available at the
Azure Portal -> Azure Active Directory.
- **local_path** (str): temporary local storage path (Volume) where files are
downloaded before being read.
- Ensure the **path ends with "/"**.
- The **specified sub-folder may be deleted during processing** (for example when
cleaning up temporary files); it does not perform a recursive delete on parent
directories.
- **Avoid using a critical sub-folder.**
- **api_version** (str): version of the Graph Sharepoint API to be used for operations.
This defaults to "v1.0".
> 🔐 Authentication details (`client_id`, `secret`, etc.) should be handled
> securely via lakehouse configuration or secret management tools, rather than
> hard-coded in notebooks.
### The optional parameters are:
- **folder_relative_path** (Optional[str]): relative folder path within the
document library where the file(s) are located (for example,
`"incoming/daily_exports"`).
- **chunk_size** (Optional[int]): size (in bytes) of the file chunks used when
downloading and archiving files.
**Default is `5 * 1024 * 1024` (5 MB).**
Useful when working with large files to avoid memory pressure.
- **local_options** (Optional[dict]): additional options for customizing the
**Spark read** from the temporary local file(s) (for example CSV options such as
`header`, `delimiter`, `encoding`, etc.). See the Spark CSV options link below.
- **conflict_behaviour** (Optional[str]): behavior to adopt when archiving files
and a file with the same name already exists in the target location
(for example, `"replace"`, `"fail"`).
- **file_pattern** (Optional[str]): pattern to match **multiple files** in
Sharepoint (for example, `"export_*.csv"`).
Used by the multi-file reader flow to download and union all matching files.
- **file_type** (Optional[str]): type of the files to be read from Sharepoint
(for example, `"csv"`). The reader uses this to decide which Spark data source
to use when reading from `local_path`.
!!! note
For more details about the Sharepoint framework, refer to Microsoft's official documentation:
> 📖[ Microsoft Graph API - Sharepoint](https://learn.microsoft.com/en-us/graph/api/resources/sharepoint?view=graph-rest-1.0)
> 🛠️ [Graph Explorer Tool](https://developer.microsoft.com/en-us/graph/graph-explorer) - this tool helps you explore available Sharepoint Graph API functionalities.
> 📑 [Spark CSV options](https://spark.apache.org/docs/3.5.3/sql-data-sources-csv.html)
## 2. Simple: Read one file from Sharepoint
This section demonstrates both minimal configuration and extended configurations
when using the Sharepoint Reader.
### i. Minimal Configuration
This approach uses only the mandatory parameters needed to connect to Sharepoint
and read a single CSV file into the lakehouse.
**Note:** In this minimal configuration:
- The file is read from the configured `drive_name` (optionally under `folder_relative_path`).
- No explicit archiving or custom CSV options are configured; those are covered in later sections.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "csv_read",
"data_format": "sharepoint",
"read_type": "batch",
"sharepoint_opts": {
"client_id": "dummy_client_id",
"tenant_id": "dummy_tenant_id",
"secret": "dummy_secret",
"site_name": "dummy_site_name",
"drive_name": "dummy_drive_name",
"local_path": "/Volumes/my_volume/sharepoint_tmp/", # must end with "/"
"folder_relative_path": "dummy_folder", # optional
"file_name": "dummy_sales.csv",
"file_type": "csv",
},
},
],
"output_specs": [
{
"spec_id": "dummy_output",
"input_id": "csv_read",
"data_format": "delta",
"db_table": "dummy_sales",
"write_type": "overwrite",
"location": "s3://my_data_product_bucket/silver/dummy_sales/"
},
],
}
load_data(acon=acon)
```
### ii. With optional configurations
For more control over the read process, additional parameters can be specified on
top of the minimal configuration:
> **archive_enabled (Optional):** Enables archiving of the processed file in
> Sharepoint.
>
> * If `True`, the reader moves the file out of the input folder after the read.
> * Successful reads go to the *success* subfolder; failures go to the *error*
> subfolder.
> **archive_success_subfolder (Optional):** Name of the subfolder used to store
> successfully processed files (default is `"done"`).
> The folder is created under the same `folder_relative_path` and `drive_name`.
> **archive_error_subfolder (Optional):** Name of the subfolder used to store
> files that failed to be processed (default is `"error"`).
> **local_options (Optional):** Additional options passed to Spark when reading
> the downloaded CSV file(s) from `local_path` (for example `header`, `delimiter`,
> `encoding`, etc.).
> These options can be used in both **single-file** and **multi-file** read modes.
>
> * For available options, refer to:
> [Apache Spark CSV Options](https://spark.apache.org/docs/3.5.4/sql-data-sources-csv.html).
> **chunk_size (Optional):** Size (in bytes) of the chunks used when
> downloading files.
>
> * Default: `5 * 1024 * 1024` (5 MB).
> * Smaller chunks are safer for very large files or memory-constrained clusters.
```python
from lakehouse_engine.engine import load_data
# Optional CSV options for the local read
LOCAL_OPTIONS = {
"header": "true",
"delimiter": ";",
}
acon = {
"input_specs": [
{
"spec_id": "csv_read",
"data_format": "sharepoint",
"read_type": "batch",
"sharepoint_opts": {
"client_id": "dummy_client_id",
"tenant_id": "dummy_tenant_id",
"secret": "dummy_secret",
"site_name": "dummy_site_name",
"drive_name": "dummy_drive_name",
"local_path": "/Volumes/my_volume/sharepoint_tmp/",
"folder_relative_path": "dummy_simple",
"file_name": "dummy_sales.csv",
"file_type": "csv",
"archive_enabled": True,
"archive_success_subfolder": "successful",
"archive_error_subfolder": "with_error",
"local_options": LOCAL_OPTIONS,
"chunk_size": 5 * 1024 * 1024,
},
},
],
"output_specs": [
{
"spec_id": "dummy_output",
"input_id": "csv_read",
"data_format": "delta",
"db_table": "dummy_sales",
"write_type": "overwrite",
"location": "s3://my_data_product_bucket/silver/dummy_sales/"
},
],
}
load_data(acon=acon)
```
## 3. Complex: Read multiple files from Sharepoint
In many cases, data in Sharepoint is split across multiple files within a folder or
exported periodically.
The `SharepointReader` can automatically locate and read all matching files based
on a configured pattern, merging them into a single DataFrame.
### i. Read multiple files (standard size)
Use `file_pattern` to match and load multiple files within the same folder.
The reader downloads all matching files into the temporary local folder and
performs a union of their contents before returning the DataFrame.
⚠️ **Schema consistency check:**
All matched files must share the same schema.
If a file with a different schema is encountered, the reader stops the ingestion,
moves that file to the configured *error archive* folder, and logs the event.
> **file_pattern (Optional):** Glob-style pattern for matching files, such as `"export_*.csv"`.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "csv_read_multi",
"data_format": "sharepoint",
"read_type": "batch",
"sharepoint_opts": {
"client_id": "dummy_client_id",
"tenant_id": "dummy_tenant_id",
"secret": "dummy_secret",
"site_name": "dummy_site_name",
"drive_name": "dummy_drive_name",
"local_path": "/Volumes/my_volume/sharepoint_tmp/",
"folder_relative_path": "dummy_sales/daily_exports",
"file_pattern": "export_*.csv",
"file_type": "csv",
},
},
],
"output_specs": [
{
"spec_id": "dummy_output",
"input_id": "csv_read_multi",
"data_format": "delta",
"db_table": "dummy_sales_daily_exports",
"write_type": "overwrite",
"location": "s3://my_data_product_bucket/silver/dummy_sales/"
},
],
}
load_data(acon=acon)
```
### ii. Read multiple large files with `chunk_size` and CSV options
When reading multiple large CSV files, the reader can:
- Download each file in chunks (to avoid memory pressure).
- Apply custom CSV read options (delimiter, header, encoding, etc.) before unioning the data.
> **chunk_size (Optional):**
> Size (in bytes) of the chunks used when downloading and archiving files.
> Default is `5 * 1024 * 1024` (5 MB). Increase this for very large files to reduce the number of download operations.
> **local_options (Optional):**
> Spark CSV options used when reading the downloaded files from `local_path`
> (for example `header`, `delimiter`, `encoding`, `quote`, etc.).
```python
from lakehouse_engine.engine import load_data
LOCAL_OPTIONS = {
"header": "true",
"delimiter": ";",
"encoding": "utf-8",
}
acon = {
"input_specs": [
{
"spec_id": "csv_read_multi_large",
"data_format": "sharepoint",
"read_type": "batch",
"sharepoint_opts": {
"client_id": "dummy_client_id",
"tenant_id": "dummy_tenant_id",
"secret": "dummy_secret",
"site_name": "dummy_site_name",
"drive_name": "dummy_drive_name",
"local_path": "/Volumes/my_volume/sharepoint_tmp/",
"folder_relative_path": "dummy_sales/big_daily_exports/",
"file_pattern": "big_export_*.csv",
"file_type": "csv",
"chunk_size": 50 * 1024 * 1024, # 50 MB per chunk
"local_options": LOCAL_OPTIONS,
},
},
],
"output_specs": [
{
"spec_id": "dummy_output",
"input_id": "csv_read_multi_large",
"data_format": "delta",
"db_table": "dummy_sales_daily_exports",
"write_type": "overwrite",
},
],
}
load_data(acon=acon)
```
## 4. Delimiter handling
When reading CSV files (single-file or multi-file), the Sharepoint Reader:
- Uses `sep` or `delimiter` from `local_options` as-is if provided
(no auto-detection in this case).
- If no delimiter is provided, it:
- Tries to auto-detect one from `; , | \t` using `csv.Sniffer`.
- Optionally compares the resulting column count with `expected_columns`
(if set) and logs a warning if they do not match.
- Falls back to comma (`,`) if detection fails.
Internally, the final delimiter is always passed to Spark as `sep`
(`delimiter` is mapped to `sep` and then removed).
> 💡 Tip: You can use `local_options` (including `sep` / `delimiter`) in both
> single-file and multi-file read modes. When in doubt, set `sep` explicitly.
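The snippet below is a minimal sketch of this kind of delimiter sniffing, shown only to illustrate the behaviour described above (it is **not** the engine's internal implementation); `sample_csv_text` and `expected_columns` are hypothetical inputs.
```python
import csv
import logging

def detect_delimiter(sample_csv_text: str, expected_columns: int = None) -> str:
    """Illustrative sketch: auto-detect a CSV delimiter, falling back to a comma."""
    try:
        dialect = csv.Sniffer().sniff(sample_csv_text, delimiters=";,|\t")
        delimiter = dialect.delimiter
    except csv.Error:
        # Detection failed: fall back to a comma.
        return ","
    if expected_columns is not None:
        detected_columns = len(sample_csv_text.splitlines()[0].split(delimiter))
        if detected_columns != expected_columns:
            # Only warn; the detected delimiter is still used.
            logging.warning(
                "Detected %s columns with delimiter %r, expected %s",
                detected_columns,
                delimiter,
                expected_columns,
            )
    return delimiter

# Example: detect_delimiter("a;b;c\n1;2;3\n", expected_columns=3) returns ";"
```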
## 5. Orchestrating multiple Sharepoint reads (loop pattern)
If you need to read from multiple independent Sharepoint locations
(different folders, drives, or file patterns), you can orchestrate a loop in your
notebook and call `load_data` once per configuration.
```python
from lakehouse_engine.engine import load_data
sharepoint_sources = [
{"folder_relative_path": "dummy_sales/big_daily_exports", "file_pattern": "big_export_*.csv"},
{"folder_relative_path": "dummy_sales/daily_exports", "file_pattern": "export_*.csv.csv"},
]
for src in sharepoint_sources:
acon = {
"input_specs": [
{
"spec_id": "csv_read",
"data_format": "sharepoint",
"read_type": "batch",
"sharepoint_opts": {
"client_id": "...",
"tenant_id": "...",
"secret": "...",
"site_name": "...",
"drive_name": "...",
"local_path": "/Volumes/my_volume/sharepoint_tmp/",
"folder_relative_path": src["folder_relative_path"],
"file_pattern": src["file_pattern"],
"file_type": "csv",
},
},
],
"output_specs": [
{
"spec_id": "output",
"input_id": "csv_read",
"data_format": "delta",
"db_table": "dummy_sales_daily_exports",
"write_type": "append",
},
],
}
load_data(acon=acon)
```
‼️ Caution: excessive parallelism
- Running too many Sharepoint reads in parallel can trigger MS Graph API
throttling (for example 429 or 503 responses).
- Prefer a controlled level of parallelism when orchestrating multiple
pipelines or loops that read from Sharepoint.
- Monitor logs and retries to ensure stable performance, especially when
working with large files or many files at once.
The Lakehouse Engine framework uses retry logic with backoff to mitigate
throttling, but it cannot fully replace sensible limits on concurrency.
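If you do want some parallelism, one possible pattern (a sketch, not a lakehouse-engine feature) is to cap the number of concurrent `load_data` calls with a small thread pool. The `build_acon` helper and the `max_workers=2` limit below are illustrative assumptions, reusing the acon structure from the loop example above.
```python
from concurrent.futures import ThreadPoolExecutor

from lakehouse_engine.engine import load_data

def build_acon(src: dict) -> dict:
    """Build the same acon as in the loop example above for a single source."""
    return {
        "input_specs": [
            {
                "spec_id": "csv_read",
                "data_format": "sharepoint",
                "read_type": "batch",
                "sharepoint_opts": {
                    "client_id": "...",
                    "tenant_id": "...",
                    "secret": "...",
                    "site_name": "...",
                    "drive_name": "...",
                    "local_path": "/Volumes/my_volume/sharepoint_tmp/",
                    "folder_relative_path": src["folder_relative_path"],
                    "file_pattern": src["file_pattern"],
                    "file_type": "csv",
                },
            },
        ],
        "output_specs": [
            {
                "spec_id": "output",
                "input_id": "csv_read",
                "data_format": "delta",
                "db_table": "dummy_sales_daily_exports",
                "write_type": "append",
            },
        ],
    }

sharepoint_sources = [
    {"folder_relative_path": "dummy_sales/big_daily_exports", "file_pattern": "big_export_*.csv"},
    {"folder_relative_path": "dummy_sales/daily_exports", "file_pattern": "export_*.csv"},
]

# Cap concurrency at a small number of simultaneous Sharepoint reads
# to stay well below MS Graph throttling limits.
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(load_data, acon=build_acon(src)) for src in sharepoint_sources]
    for future in futures:
        future.result()  # surface any failure instead of silently swallowing it
```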
================================================
FILE: lakehouse_engine_usage/data_loader/streaming_append_load_with_malformed/__init__.py
================================================
"""
.. include::streaming_append_load_with_malformed.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/streaming_append_load_with_malformed/streaming_append_load_with_malformed.md
================================================
# Streaming Append Load with DROPMALFORMED
This scenario illustrates an append load done via streaming instead of batch, providing an efficient way of picking up new files from an S3 folder instead of relying on the incremental filtering from the source that a batch-based process needs (see append loads in batch from a JDBC source to understand the differences between streaming and batch append loads). However, not all sources (e.g., JDBC) allow streaming.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/append_load/streaming_dropmalformed/streaming.json!}
```
##### Relevant notes:
* In this scenario, we use DROPMALFORMED read mode, which drops rows that do not comply with the provided schema;
* In this scenario, the schema is provided through the `input_spec` "schema" variable. This removes the need for a separate JSON Spark schema file, which may be more convenient in certain cases.
* As can be seen, we use the `output_spec` Spark option `checkpointLocation` to specify where to save the checkpoints indicating what we have already consumed from the input data. This allows fault-tolerance if the streaming job fails, but more importantly, it allows us to run a streaming job using [AvailableNow](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) and the next job automatically picks up the stream state since the last checkpoint, allowing us to do efficient append loads without having to manually specify incremental filters as we do for batch append loads.
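For illustration only, a sketch of what such an acon could look like in Python follows. The paths, schema and options are hypothetical, and the exact shape of the inline `schema` value is an assumption here; the authoritative example is the JSON file referenced above.
```python
from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "sales_source",
            "read_type": "streaming",
            "data_format": "csv",
            # Inline Spark JSON schema (assumed format), so no separate schema file is needed.
            "schema": {
                "type": "struct",
                "fields": [
                    {"name": "article_id", "type": "string", "nullable": True, "metadata": {}},
                    {"name": "amount", "type": "integer", "nullable": True, "metadata": {}},
                ],
            },
            # DROPMALFORMED drops rows that do not comply with the provided schema.
            "options": {"mode": "DROPMALFORMED", "header": True},
            "location": "s3://my_data_product_bucket/landing/dummy_sales/",
        }
    ],
    "output_specs": [
        {
            "spec_id": "sales_bronze",
            "input_id": "sales_source",
            "write_type": "append",
            "data_format": "delta",
            # Checkpoint so the next AvailableNow run resumes from the last consumed files.
            "options": {"checkpointLocation": "s3://my_data_product_bucket/checkpoints/dummy_sales/"},
            "location": "s3://my_data_product_bucket/bronze/dummy_sales/",
        }
    ],
}
load_data(acon=acon)
```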
================================================
FILE: lakehouse_engine_usage/data_loader/streaming_append_load_with_terminator/__init__.py
================================================
"""
.. include::streaming_append_load_with_terminator.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/streaming_append_load_with_terminator/streaming_append_load_with_terminator.md
================================================
# Streaming Append Load with Optimize Dataset Terminator
This scenario includes a terminator which optimizes a dataset (table), being capable of vacuuming the table, optimizing it with z-order, computing table statistics, and more. You can find more details on the Terminator [here](../../../reference/packages/terminators/dataset_optimizer.md).
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/append_load/streaming_with_terminators/streaming.json!}
```
================================================
FILE: lakehouse_engine_usage/data_loader/streaming_delta_load_with_group_and_rank_condensation/__init__.py
================================================
"""
.. include::streaming_delta_load_with_group_and_rank_condensation.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/streaming_delta_load_with_group_and_rank_condensation/streaming_delta_load_with_group_and_rank_condensation.md
================================================
# Streaming Delta Load with Group and Rank Condensation
This scenario is useful when we want to do delta loads based on changelogs that first need to be condensed using a group by and then a rank, instead of the record mode logic used in record mode based change data capture.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming_delta.json!}
```
##### Relevant notes:
* This type of delta load with this type of condensation is useful when the source changelog can be condensed based on dates, instead of technical fields like `datapakid`, `record`, `record_mode`, etc., as we see in SAP BW DSOs. An example of such a system is the Omnihub Tibco orders and deliveries files.
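To illustrate the condensation concept in plain PySpark (a conceptual sketch only, not the engine's implementation; column names are hypothetical): group the changelog by its business key and keep only the latest change per key based on the rank over the change date.
```python
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()

# Hypothetical changelog with several changes for the same key in the same file.
changelog = spark.createDataFrame(
    [
        ("order-1", "2024-01-01", 10),
        ("order-1", "2024-01-03", 12),  # latest change for order-1
        ("order-2", "2024-01-02", 7),
    ],
    ["order_id", "changed_on", "quantity"],
)

# Group by the business key and rank by the change date, keeping only the latest row.
latest_per_key = Window.partitionBy("order_id").orderBy(col("changed_on").desc())
condensed = (
    changelog.withColumn("rn", row_number().over(latest_per_key))
    .filter(col("rn") == 1)
    .drop("rn")
)
condensed.show()
```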
================================================
FILE: lakehouse_engine_usage/data_loader/streaming_delta_with_late_arriving_and_out_of_order_events/__init__.py
================================================
"""
.. include::streaming_delta_with_late_arriving_and_out_of_order_events.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/streaming_delta_with_late_arriving_and_out_of_order_events/streaming_delta_with_late_arriving_and_out_of_order_events.md
================================================
# Streaming Delta Load with Late Arriving and Out of Order Events (with and without watermarking)
## How to Deal with Late Arriving Data without using Watermark
This scenario covers a delta load in streaming mode that is able to deal with late arriving and out of order events.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```json
{!../../../../tests/resources/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming_delta.json!}
```
##### Relevant notes:
* The first question we might pose is: do we need such a complicated update predicate to handle late arriving and out of order events? The simple answer is no, because we expect that the latest event (e.g., latest status of a record in the source) will eventually arrive, and therefore the target delta lake table will eventually be consistent. However, when will that happen? Do we want our target table to be inconsistent until the next update comes along? This is of course only relevant when your source cannot ensure the order of the changes and cannot avoid late arriving changes (e.g., some changes that should have come in this changelog extraction will only arrive in the next changelog extraction). From previous experience, this is not the case with SAP BW, for example (as SAP BW is ACID compliant, it will extract data from an SAP source and only have the updated changelog available when the extraction goes through, so theoretically we should not be able to extract data from the SAP BW changelog while SAP BW is still extracting data).
* However, when the source cannot fully ensure ordering (e.g., Kafka) and we want to make sure we don't load temporarily inconsistent data into the target table, we can pay extra special attention, as we do here, to our update and insert predicates, that will enable us to only insert or update data if the new event meets the respective predicates:
* In this scenario, we will only update if the `update_predicate` is true, and that long predicate we have here ensures that the change that we are receiving is likely the latest one;
* In this scenario, we will only insert the record if it is not marked for deletion. This matters when the new event is a record that is marked for deletion, but the record was not in the target table (late arriving changes where the delete came before the insert); without the `insert_predicate`, the algorithm would still try to insert the row, even if the `record_mode` indicates that the row is for deletion. By using the `insert_predicate` above we avoid that. However, even in such a scenario, to prevent the algorithm from inserting the data that comes later (which is old, as we said: the delete came before the insert and was actually the latest status), we would need an even more complex predicate based on your data's nature. Therefore, please read the disclaimer below.
!!! note "**Disclaimer**!" The scenario illustrated in this page is purely fictional, designed for the Lakehouse Engine local tests specifically. Your data source changelogs may be different and the scenario and predicates discussed here may not make sense to you. Consequently, the data product team should reason about the adequate merge predicate and insert, update and delete predicates, that better reflect how they want to handle the delta loads for their data.
* We use spark.sql.streaming.schemaInference in our local tests only. We don't encourage you to use it in your data product.
!!! note "**Documentation**"
[Feature Deep Dive: Watermarking in Apache Spark Structured Streaming - The Databricks Blog](https://www.databricks.com/blog/feature-deep-dive-watermarking-apache-spark-structured-streaming)
[Structured Streaming Programming Guide - Spark 3.4.0 Documentation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)
## How to Deal with Late Arriving Data using Watermark
When building real-time pipelines, one of the realities that teams have to work with is that distributed data ingestion is inherently unordered. Additionally, in the context of stateful streaming operations, teams need to be able to properly track event time progress in the stream of data they are ingesting for the proper calculation of time-window aggregations and other stateful operations. While working with real-time streaming data there will be delays between event time and processing time due to how data is ingested and whether the overall application experiences issues like downtime. Due to these potential variable delays, the engine that you use to process this data needs to have some mechanism to decide when to close the aggregate windows and produce the aggregate result.
Imagine a scenario where we will need to perform stateful aggregations on the streaming data to understand and identify problems in the machines. **This is where we need to leverage Structured Streaming and Watermarking to produce the necessary stateful aggregations.**
##### Approach 1 - Use a pre-defined fixed window (Bad)
Credits: [Image source](https://www.databricks.com/blog/feature-deep-dive-watermarking-apache-spark-structured-streaming)
To explain this visually let’s take a scenario where we are receiving data at various times from around 10:50 AM → 11:20 AM. We are creating 10-minute tumbling windows that calculate the average of the temperature and pressure readings that came in during the windowed period.
In this first picture, the tumbling windows trigger at 11:00 AM, 11:10 AM and 11:20 AM, leading to the result tables shown at the respective times. When the second batch of data arrives around 11:10 AM with data that has an event time of 10:53 AM, it gets incorporated into the temperature and pressure averages calculated for the 11:00 AM → 11:10 AM window that closes at 11:10 AM, which does not give the correct result.
##### Approach 2 - Watermark
We can define a **watermark** that will allow Spark to understand when to close the aggregate window and produce the correct aggregate result. In Structured Streaming applications, we can ensure that all relevant data for the aggregations we want to calculate is collected by using a feature called **watermarking**. In the most basic sense, by defining a **watermark** Spark Structured Streaming then knows when it has ingested all data up to some time, **T**, (based on a set lateness expectation) so that it can close and produce windowed aggregates up to timestamp **T**.
Credits: [Image source](https://www.databricks.com/blog/feature-deep-dive-watermarking-apache-spark-structured-streaming)
Unlike the first scenario where Spark will emit the windowed aggregation for the previous ten minutes every ten minutes (i.e. emit the 11:00 AM → 11:10 AM window at 11:10 AM), Spark now waits to close and output the windowed aggregation once **the max event time seen minus the specified watermark is greater than the upper bound of the window**.
In other words, Spark needed to wait until it saw data points where the latest event time seen minus 10 minutes was greater than 11:00 AM to emit the 10:50 AM → 11:00 AM aggregate window. At 11:00 AM, it does not see this, so it only initialises the aggregate calculation in Spark’s internal state store. At 11:10 AM, this condition is still not met, but we have a new data point for 10:53 AM so the internal state gets updated, just **not emitted**. Then finally by 11:20 AM Spark has seen a data point with an event time of 11:15 AM and since 11:15 AM minus 10 minutes is 11:05 AM which is later than 11:00 AM the 10:50 AM → 11:00 AM window can be emitted to the result table.
This produces the correct result by properly incorporating the data based on the expected lateness defined by the watermark. Once the results are emitted the corresponding state is removed from the state store.
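As a plain Structured Streaming sketch of the scenario above (not a lakehouse-engine acon; the source location and column names are hypothetical), the 10-minute tumbling windows with a 10-minute watermark could be expressed as:
```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, window

spark = SparkSession.builder.getOrCreate()

# Hypothetical streaming source with event_time, temperature and pressure columns.
readings = spark.readStream.format("delta").load(
    "s3://my_data_product_bucket/bronze/machine_readings"
)

# A window is only closed and emitted once the watermark
# (max event time seen minus 10 minutes) passes the window's upper bound.
aggregated = (
    readings.withWatermark("event_time", "10 minutes")
    .groupBy(window(col("event_time"), "10 minutes"))
    .agg(
        avg("temperature").alias("avg_temperature"),
        avg("pressure").alias("avg_pressure"),
    )
)

query = (
    aggregated.writeStream.outputMode("append")
    .format("delta")
    .option("checkpointLocation", "s3://my_data_product_bucket/checkpoints/machine_readings_agg")
    .start("s3://my_data_product_bucket/silver/machine_readings_agg")
)
```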
###### Watermarking and Different Output Modes
It is important to understand how state, late-arriving records, and the different output modes could lead to different behaviours of your application running on Spark. The main takeaway here is that in both append and update modes, once the watermark indicates that all data is received for an aggregate time window, the engine can trim the window state. In append mode the aggregate is produced only at the closing of the time window plus the watermark delay while in update mode it is produced on every update to the window.
Lastly, by increasing your watermark delay window you will cause the pipeline to wait longer for data and potentially drop less data – higher precision, but also higher latency to produce the aggregates. On the flip side, smaller watermark delay leads to lower precision but also lower latency to produce the aggregates.
Watermarks can only be used when you are running your streaming application in **append** or **update** output modes. There is a third output mode, complete mode, in which the entire result table is written to storage. This mode cannot be used because it requires all aggregate data to be preserved, and hence cannot use watermarking to drop intermediate state.
###### Joins With Watermark
There are three types of stream-stream joins that can be implemented in Structured Streaming: **inner, outer, and semi joins**. The main problem with doing joins in streaming applications is that you may have an incomplete picture of one side of the join. Giving Spark an understanding of when there are no future matches to expect is similar to the earlier problem with aggregations where Spark needed to understand when there were no new rows to incorporate into the calculation for the aggregation before emitting it.
To allow Spark to handle this, we can leverage a combination of watermarks and event-time constraints within the join condition of the stream-stream join. This combination allows Spark to filter out late records and trim the state for the join operation through a time range condition on the join.
Spark has a policy for handling multiple watermark definitions. Spark maintains **one global watermark** that is based on the slowest stream to ensure the highest amount of safety when it comes to not missing data.
We can change this behaviour by changing *spark.sql.streaming.multipleWatermarkPolicy* to max; however, this means that data from the slower stream will be dropped.
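A sketch of such a stream-stream join (standard Spark API; the sources and column names are hypothetical), combining watermarks on both sides with an event-time range condition in the join:
```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()

# Hypothetical streams: ad impressions and the clicks they may generate.
impressions = (
    spark.readStream.format("delta")
    .load("s3://my_data_product_bucket/bronze/impressions")
    .withWatermark("impression_time", "2 hours")
)
clicks = (
    spark.readStream.format("delta")
    .load("s3://my_data_product_bucket/bronze/clicks")
    .withWatermark("click_time", "3 hours")
)

# The watermarks plus the event-time range condition let Spark trim the join state:
# a click is only matched within 1 hour after its impression.
joined = impressions.join(
    clicks,
    expr(
        """
        click_ad_id = impression_ad_id AND
        click_time >= impression_time AND
        click_time <= impression_time + interval 1 hour
        """
    ),
)
```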
###### State Store Performance Considerations
As of Spark 3.2, Spark offers the RocksDB state store provider.
If you have stateful operations in your streaming query (for example, streaming aggregation, streaming dropDuplicates, stream-stream joins, mapGroupsWithState, or flatMapGroupsWithState) and you want to maintain millions of keys in the state, then you may face issues related to large JVM garbage collection (GC) pauses causing high variations in the micro-batch processing times. This occurs because, with the HDFSBackedStateStore implementation, the state data is maintained in the JVM memory of the executors, and a large number of state objects puts memory pressure on the JVM, causing high GC pauses.
In such cases, you can choose to use a more optimized state management solution based on RocksDB. Rather than keeping the state in the JVM memory, this solution uses RocksDB to efficiently manage the state in the native memory and the local disk. Furthermore, any changes to this state are automatically saved by Structured Streaming to the checkpoint location you have provided, thus providing full fault-tolerance guarantees (the same as default state management).
To enable the new built-in state store implementation, set `spark.sql.streaming.stateStore.providerClass` to `org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider`.
For more details, please visit the Spark documentation: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation
You can enable this in your acons by specifying it as part of the `exec_env` properties, as shown below:
```json
"exec_env": {
"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider"
}
```
================================================
FILE: lakehouse_engine_usage/data_loader/write_and_read_dataframe/__init__.py
================================================
"""
.. include::write_and_read_dataframe.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/write_and_read_dataframe/write_and_read_dataframe.md
================================================
# Write and Read Dataframe
The DataFrame writer can give us some advantages by returning a dictionary containing the `spec_id` and the computed DataFrame.
In these examples we will cover the following scenarios of using the output `dataframe` format:
1. [**Write to dataframe**: Consuming the output spec as DataFrame;](#1-write-to-dataframe-consuming-the-output-spec-as-dataframe)
2. [**Write all dataframes**: Consuming all DataFrames generated per specs;](#2-write-all-dataframes-consuming-all-dataframes-generated-per-specs)
3. [**Read from and Write to dataframe**: Making use of the DataFrame output spec to compose silver data.](#3-read-from-and-write-to-dataframe-making-use-of-the-dataframe-output-spec-to-compose-silver-data)
#### Main advantages of using this output writer:
- **Debugging purposes**: as we can access any dataframe used in any part of our ACON
we can observe what is happening with the computation and identify what might be wrong
or can be improved.
- **Flexibility**: in case we have some very specific need not covered yet by the lakehouse
engine capabilities, for example: returning the DataFrame for further processing, like using a machine
learning model/prediction.
- **Simplify ACONs**: instead of developing a single complex ACON, with the DataFrame writer
we can compose our ACON from the output of another ACON. This allows us to identify
and split the notebook logic across ACONs.
If you want/need, you can add as many dataframes as you want in the output spec
referencing the spec_id you want to add.
!!! warning
    **This is not intended to replace the other capabilities offered by the lakehouse-engine** and in case **another feature can cover your use case**, you should **use it instead of the Dataframe writer**, as the other features are much **more extensively tested on different types of operations**.
*Additionally, please always introspect if the problem that you are trying to resolve and for which no lakehouse-engine feature is available, could be a common problem and thus deserve a common solution and feature.*
Moreover, **Dataframe writer is not supported for the streaming trigger
types `processing time` and `continuous`.**
## 1. Write to dataframe: Consuming the output spec as DataFrame
### Silver Dummy Sales Write to DataFrame
In this example we will cover the Dummy Sales write to a result containing the output DataFrame.
- An ACON is used to read from bronze, apply silver transformations and write to a dictionary
containing the output spec as key and the dataframe as value through the following steps:
- 1 - Definition of how to read data (input data location, read type and data format);
- 2 - Transformation of data (rename relevant columns);
- 3 - Write the data to dict containing the dataframe;
!!! note
    If you try to retrieve the same data more than once using a checkpoint, an empty dataframe with an empty schema will be returned, as there is no new data to read.
```python
from lakehouse_engine.engine import load_data
cols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}
acon = {
"input_specs": [
{
"spec_id": "dummy_sales_bronze",
"read_type": "streaming",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_sales",
}
],
"transform_specs": [
{
"spec_id": "dummy_sales_transform",
"input_id": "dummy_sales_bronze",
"transformers": [
{
"function": "rename",
"args": {
"cols": cols_to_rename,
},
},
],
}
],
"output_specs": [
{
"spec_id": "dummy_sales_silver",
"input_id": "dummy_sales_transform",
"data_format": "dataframe",
"options": {
"checkpointLocation": "s3://my_data_product_bucket/checkpoints/bronze/dummy_sales",
},
}
],
}
```
### Run the Load and Return the Dictionary with the DataFrames by OutputSpec
This exploratory test will return a dictionary with the output spec and the dataframe
that will be stored after transformations.
```python
output = load_data(acon=acon)
display(output.keys())
display(output.get("dummy_sales_silver"))
```
## 2. Write all dataframes: Consuming all DataFrames generated per specs
### Silver Dummy Sales Write to DataFrame
In this example we will cover the Dummy Sales write to a result containing the specs and related DataFrame.
- An ACON is used to read from bronze, apply silver transformations and write to a dictionary
containing the spec id as key and the DataFrames as value through the following steps:
- Definition of how to read data (input data location, read type and data format);
- Transformation of data (rename relevant columns);
- Write the data to a dictionary containing all the spec ids and DataFrames computed per step;
```python
from lakehouse_engine.engine import load_data
cols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}
acon = {
"input_specs": [
{
"spec_id": "dummy_sales_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_sales",
}
],
"transform_specs": [
{
"spec_id": "dummy_sales_transform",
"input_id": "dummy_sales_bronze",
"transformers": [
{
"function": "rename",
"args": {
"cols": cols_to_rename,
},
},
],
}
],
"output_specs": [
{
"spec_id": "sales_bronze",
"input_id": "dummy_sales_bronze",
"data_format": "dataframe",
},
{
"spec_id": "sales_silver",
"input_id": "dummy_sales_transform",
"data_format": "dataframe",
},
],
}
```
### Run the Load and Return the Dictionary with the related DataFrames by Spec
This exploratory test will return a dictionary with all specs and the related dataframe.
You can access the DataFrame you need by `output.get()` for future developments and tests.
```python
output = load_data(acon=acon)
display(output.keys())
display(output.get("sales_bronze"))
display(output.get("sales_silver"))
```
## 3. Read from and Write to dataframe: Making use of the DataFrame output spec to compose silver data
### Silver Load Dummy Deliveries
In this example we will cover the Dummy Deliveries table read and incremental load to silver composing the silver data to write using the DataFrame output spec:
- The first ACON is used to get the latest data from bronze; in this step we use more than one output because we will need both the bronze data and the latest data in the next step.
- The second ACON is used to consume the bronze data and the latest data to perform the silver transformation; in this ACON we use as **input the two dataframes computed by the first ACON.**
- The third ACON is used to write the silver computed data from the previous ACON to the target.
!!! note
    This example is not a recommendation on how to deal with incremental loads; the ACON was split into 3 for demo purposes.
Consume bronze data, generate the latest data and return a dictionary with bronze and transformed dataframes:
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_sales",
},
{
"spec_id": "dummy_deliveries_silver_source",
"read_type": "batch",
"data_format": "delta",
"db_table": "my_database.dummy_deliveries",
},
],
"transform_specs": [
{
"spec_id": "dummy_deliveries_table_max_value",
"input_id": "dummy_deliveries_silver_source",
"transformers": [
{
"function": "get_max_value",
"args": {"input_col": "delivery_date", "output_col": "latest"},
},
{
"function": "with_expressions",
"args": {
"cols_and_exprs": {"latest": "CASE WHEN latest IS NULL THEN 0 ELSE latest END"},
},
},
],
}
],
"output_specs": [
{
"spec_id": "deliveries_bronze",
"input_id": "dummy_deliveries_bronze",
"data_format": "dataframe",
},
{
"spec_id": "dummy_deliveries_transformed",
"input_id": "dummy_deliveries_table_max_value",
"data_format": "dataframe",
},
],
}
dummy_deliveries_transformed = load_data(acon=acon)
dummy_deliveries_transformed_df = dummy_deliveries_transformed.get("dummy_deliveries_transformed")
dummy_deliveries_bronze_df = dummy_deliveries_transformed.get("deliveries_bronze")
```
Consume the previous dataframes generated by the first ACON (bronze and latest bronze data) to generate the silver data. In this acon we are using **just one output** because we only need the dataframe from that output for the next step.
```python
from lakehouse_engine.engine import load_data
cols_to_rename = {"delivery_note_header": "delivery_note", "article": "article_id"}
acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"read_type": "batch",
"data_format": "dataframe",
"df_name": dummy_deliveries_bronze_df,
},
{
"spec_id": "dummy_deliveries_table_max_value",
"read_type": "batch",
"data_format": "dataframe",
"df_name": dummy_deliveries_transformed_df,
},
],
"transform_specs": [
{
"spec_id": "dummy_deliveries_transform",
"input_id": "dummy_deliveries_bronze",
"transformers": [
{
"function": "rename",
"args": {
"cols": cols_to_rename,
},
},
{
"function": "incremental_filter",
"args": {
"input_col": "delivery_date",
"increment_df": "dummy_deliveries_table_max_value",
"increment_col": "latest",
"greater_or_equal": False,
},
},
],
}
],
"output_specs": [
{
"spec_id": "dummy_deliveries_silver",
"input_id": "dummy_deliveries_transform",
"data_format": "dataframe",
}
],
}
dummy_deliveries_silver = load_data(acon=acon)
dummy_deliveries_silver_df = dummy_deliveries_silver.get("dummy_deliveries_silver")
```
Write the silver data generated by the previous ACON into the target:
```python
from lakehouse_engine.engine import load_data
write_silver_acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_silver",
"read_type": "batch",
"data_format": "dataframe",
"df_name": dummy_deliveries_silver_df,
},
],
"dq_specs": [
{
"spec_id": "dummy_deliveries_quality",
"input_id": "dummy_deliveries_silver",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"expectations_store_prefix": "dq/expectations/",
"validations_store_prefix": "dq/validations/",
"checkpoint_store_prefix": "dq/checkpoints/",
"result_sink_db_table": "my_database.dummy_deliveries_dq",
"result_sink_location": "my_data_product_bucket/dq/dummy_deliveries",
"fail_on_error": False,
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"dq_functions": [
{
"function": "expect_column_values_to_not_be_null",
"args": {"column": "delivery_note"},
},
{
"function": "expect_table_row_count_to_be_between",
"args": {"min_value": 19},
},
{
"function": "expect_column_max_to_be_between",
"args": {"column": "delivery_item", "min_value": 2},
},
],
},
],
"output_specs": [
{
"spec_id": "dummy_deliveries_silver",
"input_id": "dummy_deliveries_quality",
"write_type": "append",
"location": "s3://my_data_product_bucket/silver/dummy_deliveries_df_writer",
"data_format": "delta",
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=write_silver_acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/write_to_console/__init__.py
================================================
"""
.. include::write_to_console.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/write_to_console/write_to_console.md
================================================
# Write to Console
The console writer is an interesting feature to debug/validate what has been done in the lakehouse engine. Before moving forward and storing data somewhere, it is possible to show/print the final dataframe to the console, which means you can transform the data as many times as you want and display the final result to validate that it is as expected.
## Silver Dummy Sales Write to Console Example
In this template we will cover the Dummy Sales write to console. An ACON is used to read from bronze, apply silver transformations and write on console through the following steps:
1. Definition of how to read data (input data location, read type and data format);
2. Transformation of data (rename relevant columns);
3. Definition of how to print to console (limit, truncate, vertical options);
For this, the ACON specs are:
- **input_specs** (MANDATORY): specify how to read data;
- **transform specs** (OPTIONAL): specify how to transform data;
- **output_specs** (MANDATORY): specify how to write data to the target.
!!! note
    The console writer **is a wrapper for the `DataFrame.show()` function**; if you want to know more about the function itself or the available options, [please check the spark documentation here](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.show.html).
```python
from lakehouse_engine.engine import load_data
cols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}
acon = {
"input_specs": [
{
"spec_id": "dummy_sales_bronze",
"read_type": "streaming",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_sales",
}
],
"transform_specs": [
{
"spec_id": "dummy_sales_transform",
"input_id": "dummy_sales_bronze",
"transformers": [
{
"function": "rename",
"args": {
"cols": cols_to_rename,
},
},
],
}
],
"output_specs": [
{
"spec_id": "dummy_sales_silver",
"input_id": "dummy_sales_transform",
"data_format": "console",
"options": {"limit": 8, "truncate": False, "vertical": False},
}
],
}
```
And then, **Run the Load and Exit the Notebook**: This exploratory test will write to the console, which means the final
dataframe will be displayed.
```python
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/write_to_rest_api/__init__.py
================================================
"""
.. include::write_to_rest_api.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/write_to_rest_api/write_to_rest_api.md
================================================
# Write to REST API
The REST API writer is an interesting feature to send data from Spark to a REST API within the data pipeline context. It uses the Python requests library to execute the REST calls.
It is possible to configure a few aspects of the writer, such as whether the payload should be sent via JSON body or via file, or additional JSON body parameters to add to the payload generated via Spark.
In the current implementation of the writer, each row will generate a request to the API, so it is important that you prepare your dataframe accordingly (check example below).
## Silver Dummy Sales Write to REST API Example
In this template we will cover the Dummy Sales write to a REST API. An ACON is used to read from bronze, apply silver transformations to prepare the REST api payload and write to the API through the following steps:
1. Definition of how to read data (input data location, read type and data format);
2. Transformation of the data so that we form a payload column per each row.
**Important Note:** In the current implementation of the writer, each row will generate a request to the API, so `create_payload` is a lakehouse engine custom transformer function that creates a JSON string with the **payload** to be sent to the API. The column name should be exactly **"payload"**, so that the lakehouse engine further processes that column accordingly, in order to correctly write the data to the REST API.
3. Definition of how to write to a REST api (url, authentication, payload format configuration, ...);
For this, the ACON specs are:
- **input_specs** (MANDATORY): specify how to read data;
- **transform specs** (MANDATORY): specify how to transform data to prepare the payload;
- **output_specs** (MANDATORY): specify how to write data to the target.
```python
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit

from lakehouse_engine.engine import load_data

def create_payload(df: DataFrame) -> DataFrame:
    # Create the mandatory "payload" column with the JSON string to send per row.
    payload_df = df.withColumn(
        "payload",
        lit('{"just a dummy key": "just a dummy value"}'),
    )
    return payload_df
acon = {
"input_specs": [
{
"spec_id": "dummy_sales_bronze",
"read_type": "streaming",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_sales",
}
],
"transform_specs": [
{
"spec_id": "dummy_sales_transform",
"input_id": "dummy_sales_bronze",
"transformers": [
{
"function": "custom_transformation",
"args": {
"custom_transformer": create_payload,
},
}
],
},
],
"output_specs": [
{
"spec_id": "data_to_send_to_api",
"input_id": "dummy_sales_transform",
"data_format": "rest_api",
"options": {
"rest_api_url": "https://foo.bar.com",
"rest_api_method": "post",
"rest_api_basic_auth": {
"username": "...",
"password": "...",
},
"rest_api_is_file_payload": False, # True if payload is to be sent via JSON file instead of JSON body (application/json)
"rest_api_file_payload_name": "custom_file", # this is the name of the file to be sent in cases where the payload uses file uploads rather than JSON body.
"rest_api_extra_json_payload": {"x": "y"}
}
}
],
}
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_loader/write_to_sharepoint/__init__.py
================================================
"""
.. include::write_to_sharepoint.md
"""
================================================
FILE: lakehouse_engine_usage/data_loader/write_to_sharepoint/write_to_sharepoint.md
================================================
# Write to Sharepoint
There may be scenarios where data products must deliver curated datasets to external platforms like Sharepoint,
often to serve business users or reporting tools outside the lakehouse environment.
The SharePointWriter is a specialized writer module designed to export a single file from the lakehouse to a Sharepoint document library.
It handles the complexities of the export by:
* Writing the dataset to a temporary local file.
* Uploading that file to the configured Sharepoint location using authenticated APIs.
Since it is scoped to handle only a single file per execution, any logic for splitting or generating multiple files must be implemented within your notebook prior to invoking the writer.
!!! note
📘 Tip: This writer integrates seamlessly into the lakehouse engine's output step and can be triggered as part of the ACON-based pipeline, just like any other writer module.
!!! warning
**CSV files do not support complex data types such as array, map, or struct.**
If these fields exist in the dataset, they must be converted to string (e.g., via to_json(), cast, or similar) before using the Sharepoint Writer, as **these types will cause the export to fail.**
### Usage Scenarios
The examples below show how to write data to Sharepoint, ranging from simple single-DataFrame writes to more complex multi-DataFrame workflows.
1. [Configuration parameters](#1-configuration-parameters)
2. [**Simple:** Write one Dataframe to Sharepoint](#2-simple-write-one-dataframe-to-sharepoint)
1. [Minimal configuration](#i-minimal-configuration)
2. [With optional configurations](#ii-with-optional-configurations)
3. [**Complex:** Write multiple Dataframes to Sharepoint](#3-complex-write-multiple-dataframes-to-sharepoint)
1. [Example: Partitioning function](#i-example-partitioning-function)
    2. [Example: Detect Unsupported Column Types](#ii-detect-unsupported-column-types)
    3. [Without parallelism (sequential processing)](#iii-without-parallelism-sequential-processing)
    4. [With parallelism (optimized for efficiency)](#iv-complex---with-parallelism-optimized-for-efficiency)
## 1. Configuration parameters
### The mandatory configuration parameters are:
- **client_id** (str): azure client ID application, available at the
Azure Portal -> Azure Active Directory.
- **tenant_id** (str): tenant ID associated with the Sharepoint site, available at the
Azure Portal -> Azure Active Directory.
- **site_name** (str): name of the Sharepoint site where the document library resides.
Sharepoint URL naming convention is: **https://your_company_name.sharepoint.com/sites/site_name**
- **drive_name** (str): name of the document library where the file will be uploaded.
Sharepoint URL naming convention is: **https://your_company_name.sharepoint.com/sites/site_name/drive_name**
- **file_name** (str): name of the file to be uploaded to local path and to Sharepoint.
- **secret** (str): client secret for authentication, available at the
Azure Portal -> Azure Active Directory.
- **local_path** (str): Temporary local storage path for the file before uploading.
- Ensure the **path ends with "/"**.
- Note: The **specified sub-folder is deleted during the process**; it does not perform a recursive
delete on parent directories.
- **Avoid using a critical sub-folder.**
- **api_version** (str): version of the Graph Sharepoint API to be used for operations.
This defaults to "v1.0".
### The optional parameters are:
- **folder_relative_path** (Optional[str]): relative folder path within the document
library to upload the file.
- **chunk_size** (Optional[int]): Optional; size (in bytes) of the file chunks for
uploading to Sharepoint. **Default is 100 MB.**
- **local_options** (Optional[dict]): Optional; additional options for customizing the
write-to-CSV action to the local path. You can check the available options
below.
- **conflict_behaviour** (Optional[str]): Optional; behavior to adopt in case
of a conflict (e.g., 'replace', 'fail').
!!! note
For more details about the Sharepoint framework, refer to Microsoft's official documentation:
> 📖[ Microsoft Graph API - Sharepoint](https://learn.microsoft.com/en-us/graph/api/resources/sharepoint?view=graph-rest-1.0)
> 🛠️ [Graph Explorer Tool](https://developer.microsoft.com/en-us/graph/graph-explorer) - this tool helps you explore available Sharepoint Graph API functionalities.
> 📑 [Spark CSV options](https://spark.apache.org/docs/3.5.3/sql-data-sources-csv.html)
## 2. Simple: Write one Dataframe to Sharepoint
This section demonstrates both minimal configuration and extended configurations
when using the Sharepoint Writer.
### i. Minimal Configuration
This approach uses only the mandatory parameters, making it the quickest way to write a DataFrame to Sharepoint.
**Note:** With minimal configurations, not even the header is written to the file. Furthermore, the file is
written to the Sharepoint drive root folder.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "dummy_input",
"read_type": "batch",
"data_format": "delta",
"db_table": "dummy_sales",
},
],
"output_specs": [
{
"spec_id": "dummy_output",
"input_id": "dummy_input",
"data_format": "sharepoint",
"sharepoint_opts": {
"client_id": "dummy_client_id",
"tenant_id": "dummy_tenant_id",
"secret": "dummy_secret",
"site_name": "dummy_site_name",
"drive_name": "dummy_drive_name",
"local_path": "s3://my_data_product_bucket/silver/dummy_sales/", # this path must end with an "/"
"file_name": "dummy_sales",
},
},
],
}
load_data(acon=acon)
```
### ii. With Optional Configurations
For more control over the upload process, additional parameters can be specified:
>**folder_relative_path (Optional):** Defines the subfolder inside the Sharepoint drive
where the file should be stored.
>
> ‼️ **Important:** The drive within the site acts as the root.
>
> **Example:**
>
> * Site Name: "dummy_sharepoint"
> * Drive Name: "dummy_drive"
> * Folder Path: "dummy/test/"
> * File Name: "test.csv"
> * Final Destination: "dummy_sharepoint/dummy_drive/dummy/test/test.csv"
> **chunk_size (Optional):** Defines the file chunk size (in bytes) for uploading.
>
> * Default: 100 MB (Recommended unless handling large files).
> * Larger chunk sizes can improve performance but may increase memory usage.
> **local_options (Optional):** Additional options for writing the DataFrame to a CSV file before upload.
>
> * For available options, refer to: [Apache Spark CSV Options](https://spark.apache.org/docs/3.5.4/sql-data-sources-csv.html).
> **conflict_behaviour (Optional):** Determines the action taken if a file with the same name already exists.
>
> * Possible values: "replace", "fail", "rename", etc.
> * Refer to Microsoft’s documentation: [Drive Item Conflict Behavior](https://learn.microsoft.com/en-us/dynamics365/business-central/application/system-application/enum/system.integration.graph.graph-conflictbehavior).
```python
from lakehouse_engine.engine import load_data
# Set the optional parameters
LOCAL_OPTIONS = {"mode": "overwrite", "header": "true"}
acon = {
"input_specs": [
{
"spec_id": "dummy_input",
"read_type": "batch",
"data_format": "delta",
"db_table": "dummy_sales",
},
],
"transform_specs": [
{
"spec_id": "dummy_transform",
"input_id": "dummy_input",
"transformers": [
{
"function": "add_current_date",
"args": {"output_col": "extraction_timestamp"},
}, # Add a new column with the current date if needed
{
"function": "expression_filter",
"args": {"exp": "customer = 'customer 1'"},
}, # Filter the data if needed
],
},
],
"output_specs": [
{
"spec_id": "dummy_output",
"input_id": "dummy_transform",
"data_format": "sharepoint",
"sharepoint_opts": {
"client_id": "dummy_client_id",
"tenant_id": "dummy_tenant_id",
"secret": "dummy_secret",
"site_name": "dummy_site_name",
"drive_name": "dummy_drive_name",
"local_path": "s3://my_data_product_bucket/silver/dummy_sales/", # this path must end with an "/"
"file_name": "dummy_sales",
"folder_relative_path": "dummy_simple", # writes file in the folder ./dummy_simple
"local_options": LOCAL_OPTIONS,
"chunk_size": 300 * 1024 * 1024, # 300 MB
},
},
],
}
load_data(acon=acon)
```
## 3. Complex: Write multiple Dataframes to Sharepoint
This scenario illustrates how to write multiple files to Sharepoint within a loop.
Some use cases may require uploading files categorized by season, customer type, product category, etc.,
depending on the business needs.
Partitioning the data ensures better organization and optimized file management in Sharepoint.
!!!warning
‼️ **Caution: Excessive Parallelism!**
* Too many simultaneous uploads can trigger Graph API throttling, leading to 503 (Service Unavailable) errors.
* Use a controlled level of parallelism (limit concurrent uploads) **if necessary**.
* [Coalesce](https://spark.apache.org/docs/3.5.3/sql-performance-tuning.html#coalesce-hints-for-sql-queries) allows you to control Spark's parallelism.
* **As the size of the files increases so does this concern,** so it’s important to test and monitor upload
processes to avoid service disruptions and ensure smooth performance.
    **Nevertheless, a stress test with over 50 partition files of > 4 GB each** was performed and parallelism
    issues were not detected.
    The Lakehouse Engine Framework uses an **exponential backoff retry logic to avoid throttling** issues.
### i. Example: Partitioning function
This function is merely an example of how to fetch the distinct values of a column from a given table.\
It is not part of the lakehouse_engine framework.
```python
from typing import Dict, List, Optional

def get_partitions(
partition: str, bucket: Optional[str] = None, table: Optional[str] = None, filter_expression: Optional[str] = None
) -> List[Dict[str, str]]:
"""Fetch distinct values from a given partition column in a table or bucket.
Parameters
----------
partition : str
The name of the partition column.
bucket : Optional[str], default=None
The path to the S3 bucket (if applicable).
table : Optional[str], default=None
The name of the table (if applicable).
filter_expression : Optional[str], default=None
A filter condition to apply.
Returns
-------
List[Dict[str, str]]
A list of dictionaries with unique partition values.
"""
if not bucket and not table:
raise ValueError("Either 'bucket' or 'table' must be provided")
df = spark.read.format("delta").load(bucket) if bucket else spark.table(table)
partitions = df.select(partition).distinct()
if filter_expression:
partitions = partitions.filter(filter_expression)
return [{partition: row[partition]} for row in partitions.collect()]
```
### ii. Detect unsupported column types
This function exemplifies how to detect column types that are not supported in .csv files.
It is not part of the lakehouse_engine framework.
```python
from typing import Dict

from pyspark.sql import DataFrame
from pyspark.sql.types import ArrayType, MapType, StructType

def detect_array_or_struct_fields(df: DataFrame) -> Dict[str, str]:
    """Detect fields in a DataFrame that are arrays, structs, or maps.

    Args:
        df (DataFrame): The input DataFrame.

    Returns:
        Dict[str, str]: A dictionary with the unsupported field names as keys and the
            target type to cast them to ("StringType") as values, ready to be passed
            to the `cast` transformer.
    """
    field_types = {}
    # Map each unsupported complex type to the string type it should be cast to.
    type_mapping = {ArrayType: "StringType", StructType: "StringType", MapType: "StringType"}
    for field in df.schema.fields:
        for data_type, type_name in type_mapping.items():
            if isinstance(field.dataType, data_type):
                field_types[field.name] = type_name
                break
    return field_types
```
### iii. Without parallelism (sequential processing)
```python
from lakehouse_engine.engine import load_data
# Set the optional parameters
LOCAL_OPTIONS = {"mode": "overwrite", "header": "true"}
# Set the partition column
PARTITION = "customer"
# Fetch distinct values from the partition column
partitions = get_partitions(partition=PARTITION, table="dummy_sales")
# Sort the distinct values to ensure the correct order of the files
# Note:
# - If an error occurs during the process, by sorting beforehand, you guarantee the correct order of the files.
# - It may come in handy if you want to restart the process (starting on a given file).
partitions.sort(key=lambda x: x["customer"])
for partition in partitions:
acon = {
"input_specs": [
{
"spec_id": "dummy_input",
"read_type": "batch",
"data_format": "delta",
"db_table": "dummy_sales",
},
],
"transform_specs": [
{
"spec_id": "dummy_transform",
"input_id": "dummy_input",
"transformers": [
{"function": "add_current_date", "args": {"output_col": "extraction_timestamp"}},
{"function": "expression_filter", "args": {"exp": f"customer = '{partition['customer']}'"}},
{
"function": "coalesce",
"args": {"num_partitions": 1},
}, # Enforce that only 1 file is written - eliminating the parallelism
],
},
],
"output_specs": [
{
"spec_id": "dummy_output",
"input_id": "dummy_transform",
"data_format": "sharepoint",
"sharepoint_opts": {
"client_id": "dummy_client_id",
"tenant_id": "dummy_tenant_id",
"secret": "dummy_secret",
"site_name": "dummy_site_name",
"drive_name": "dummy_drive_name",
"local_path": "s3://my_data_product_bucket/silver/dummy_sales/", # this path must end with an "/"
"folder_relative_path": "dummy_complex/wo_parallelism",
"file_name": f"dummy_sales_{partition['customer']}",
"local_options": LOCAL_OPTIONS,
"chunk_size": 200 * 1024 * 1024, # 200 MB
},
},
],
}
load_data(acon=acon)
```
### iv. Complex - With parallelism (optimized for efficiency)
```python
from lakehouse_engine.engine import load_data
# Set the optional parameters
LOCAL_OPTIONS = {"mode": "overwrite", "header": "true"}
# Set the partition column
PARTITION = "customer"
# Fetch distinct values from the partition column
partitions = get_partitions(partition=PARTITION, table="dummy_sales")
# Detect array, struct or map fields which cannot be written to .csv files
columns_to_cast = detect_array_or_struct_fields(spark.table("dummy_sales"))
# Sort the distinct values to ensure the correct order of the files
# Note:
# - If an error occurs during the process, by sorting beforehand, you guarantee the correct order of the files.
# - It may come in handy if you want to restart the process (starting on a given file).
partitions.sort(key=lambda x: x["customer"])
for partition in partitions:
acon = {
"input_specs": [
{
"spec_id": "dummy_input",
"read_type": "batch",
"data_format": "delta",
"db_table": "dummy_sales",
},
],
"transform_specs": [
{
"spec_id": "dummy_transform",
"input_id": "dummy_input",
"transformers": [
{"function": "add_current_date", "args": {"output_col": "extraction_timestamp"}},
{"function": "expression_filter", "args": {"exp": f"customer = '{partition['customer']}'"}},
# Coalesce removed guaranteeing maximum parallelism
{"function": "cast", "args": {"cols": columns_to_cast}}, # Cast unsupported column types
],
},
],
"output_specs": [
{
"spec_id": "dummy_output",
"input_id": "dummy_transform",
"data_format": "sharepoint",
"sharepoint_opts": {
"client_id": "dummy_client_id",
"tenant_id": "dummy_tenant_id",
"secret": "dummy_secret",
"site_name": "dummy_site_name",
"drive_name": "dummy_drive_name",
"local_path": "s3://my_data_product_bucket/silver/dummy_sales/", # this path must end with an "/"
"folder_relative_path": "dummy_complex/with_parallelism",
"file_name": f"dummy_sales_{partition['customer']}",
"local_options": LOCAL_OPTIONS,
"chunk_size": 200 * 1024 * 1024, # 200 MB
},
},
],
}
load_data(acon=acon)
```
### Relevant Notes
- Multi-file export is not supported. For such use cases, loop through files manually and invoke SharePointWriter per file.
- Authentication details should be handled securely via lakehouse configuration or secret management tools.
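For example, on Databricks the client secret can be fetched from a secret scope at runtime instead of being hardcoded in the notebook. The scope and key names below are hypothetical, and `dbutils` is available as a notebook global on Databricks.
```python
# Hypothetical secret scope and key names; adapt them to your workspace setup.
sharepoint_secret = dbutils.secrets.get(
    scope="my_data_product_scope", key="sharepoint_client_secret"
)

sharepoint_opts = {
    "client_id": "dummy_client_id",
    "tenant_id": "dummy_tenant_id",
    "secret": sharepoint_secret,  # resolved at runtime, never stored in the notebook
    "site_name": "dummy_site_name",
    "drive_name": "dummy_drive_name",
    "local_path": "s3://my_data_product_bucket/silver/dummy_sales/",
    "file_name": "dummy_sales",
}
```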
================================================
FILE: lakehouse_engine_usage/data_quality/__init__.py
================================================
"""
.. include::data_quality.md
"""
================================================
FILE: lakehouse_engine_usage/data_quality/custom_expectations/__init__.py
================================================
"""
.. include::custom_expectations.md
"""
================================================
FILE: lakehouse_engine_usage/data_quality/custom_expectations/custom_expectations.md
================================================
# Custom Expectations
## Defining Custom Expectations
Custom expectations are defined in Python and need to follow a structure to correctly integrate with Great Expectations.
Follow the [documentation of GX on Creating Custom Expectations](https://docs.greatexpectations.io/docs/oss/guides/expectations/custom_expectations_lp/)
and find information about [the existing types of expectations](https://docs.greatexpectations.io/docs/conceptual_guides/expectation_classes).
Here is an example of a custom expectation.
As for other cases, the acon configuration should be executed with `load_data` using:
```python
from lakehouse_engine.engine import load_data
acon = {...}
load_data(acon=acon)
```
Example of ACON configuration:
```python
{!../../../../lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b.py!}
```
### Naming Conventions
Your expectation's name **should** start with `expect`.
The name of the file **must** be the name of the expectation written in snake case. Ex: `expect_column_length_match_input_length`
The name of the class **must** be the name of the expectation written in camel case. Ex: `ExpectColumnLengthMatchInputLength`
### File Structure
The file contains two main sections:
- the definition of the metric that we are tracking (where we define the logic of the expectation);
- the definition of the expectation
### Metric Definition
In this section we define the logic of the expectation. This needs to follow a certain structure:
#### Code Structure
1) The class you define needs to extend one of the Metric Providers defined by Great Expectations that corresponds
to your expectation's type. More info on the [metric providers](https://docs.greatexpectations.io/docs/conceptual_guides/metricproviders).
2) You need to define the name of your metric. This name **must** be unique and **must** follow the structure
`<type of expectation>.<name of metric>`. Ex.: `column_pair_values.a_smaller_or_equal_than_b`
**Types of expectations:** `column_values`, `multicolumn_values`, `column_pair_values`, `table_rows`, `table_columns`.
3) Any [GX default parameters](#parameters) that are necessary to calculate your metric **must** be defined as "condition_domain_keys".
4) Any [additional parameters](#parameters) that are necessary to calculate your metric **must** be defined as "condition_value_keys".
5) The logic of your expectation **must** be defined for the SparkDFExecutionEngine in order to be run on the Lakehouse.
```python
1) class ColumnPairMetric(ColumnPairMapMetricProvider):
"""Asserts that values in column A are smaller or equal than values in column B."""
2) condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"
3) condition_domain_keys = (
"batch_id",
"table",
"column_A",
"column_B",
"ignore_row_if",
)
4) condition_value_keys = ("margin",)
5) @column_pair_condition_partial(engine=SparkDFExecutionEngine)
def _spark(
self: ColumnPairMapMetricProvider,
column_A: Any,
column_B: Any,
margin: Any,
**kwargs: dict,
) -> Any:
"""Implementation of the expectation's logic.
Args:
column_A: Value of the row of column_A.
column_B: Value of the row of column_B.
margin: margin value to be added to column_B.
kwargs: dict with additional parameters.
Returns:
If the condition is met.
"""
if margin is None:
approx = 0
elif not isinstance(margin, (int, float, complex)):
raise TypeError(
f"margin must be one of int, float, complex."
f" Found: {margin} as {type(margin)}"
)
else:
approx = margin # type: ignore
return column_A <= column_B + approx # type: ignore
```
### Expectation Definition
In this section we define the expectation. This needs to follow a certain structure:
#### Code Structure
1) The class you define needs to extend one of the Expectations defined by Great Expectations that corresponds to your expectation's type.
2) You must define an "examples" object where you define at least one success and one failure of your expectation to
demonstrate its logic. The result format must be set to complete, and you must set the [unexpected_index_name](#result-format) variable.
!!! note
For any examples where you will have unexpected results you must define unexpected_index_list in your "out" element.
This will be validated during the testing phase.
3) The metric **must** be the same you defined in the metric definition.
4) You **must** define all [additional parameters](#parameters) that the user has to/should provide to the expectation.
5) You **should** define any default values for your expectations parameters.
6) You **must** define the `_validate` method as shown in the example. You **must** call the `validate_result` function
inside your `_validate` method; this process adds a validation of the unexpected index list in the examples.
!!! note
If your custom expectation requires any extra validations, or you require additional fields to be returned on
the final dataframe, you can add them in this function.
The `validate_result` method has two optional parameters (`partial_success` and `partial_result`) that can be used to
pass the result of additional validations and to add more information to the result key of the returned dict, respectively.
```python
1) class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):
"""Expect values in column A to be lower or equal than column B.
Args:
column_A: The first column name.
column_B: The second column name.
margin: additional approximation to column B value.
Keyword Args:
allow_cross_type_comparisons: If True, allow
comparisons between types (e.g. integer and string).
Otherwise, attempting such comparisons will raise an exception.
ignore_row_if: "both_values_are_missing",
"either_value_is_missing", "neither" (default).
result_format: Which output mode to use:
`BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
include_config: If True (default), then include the expectation config
as part of the result object.
catch_exceptions: If True, then catch exceptions and
include them as part of the result object. Default: False.
meta: A JSON-serializable dictionary (nesting allowed)
that will be included in the output without modification.
Returns:
An ExpectationSuiteValidationResult.
"""
2) examples = [
{
"dataset_name": "Test Dataset",
"data": {
"a": [11, 22, 50],
"b": [10, 21, 100],
"c": [9, 21, 30],
},
"schemas": {
"spark": {"a": "IntegerType", "b": "IntegerType", "c": "IntegerType"}
},
"tests": [
{
"title": "negative_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_A": "a",
"column_B": "c",
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["c"],
"include_unexpected_rows": True,
},
},
"out": {
"success": False,
"unexpected_index_list": [
{"c": 9, "a": 11},
{"c": 21, "a": 22},
{"c": 30, "a": 50},
],
},
},
{
"title": "positive_test",
"exact_match_out": False,
"include_in_gallery": True,
"in": {
"column_A": "a",
"column_B": "b",
"margin": 1,
"result_format": {
"result_format": "COMPLETE",
"unexpected_index_column_names": ["a"],
},
},
"out": {"success": True},
},
],
},
]
3) map_metric = "column_pair_values.a_smaller_or_equal_than_b"
4) success_keys = (
"margin",
"mostly",
)
5) default_kwarg_values = {
"ignore_row_if": "never",
"result_format": "BASIC",
"include_config": True,
"catch_exceptions": False,
"mostly": 1,
}
6) def _validate(
self,
configuration: ExpectationConfiguration,
metrics: Dict,
runtime_configuration: Optional[dict] = None,
execution_engine: Optional[ExecutionEngine] = None,
) -> dict:
"""Custom implementation of the GX _validate method.
This method is used on the tests to validate both the result
of the tests themselves and if the unexpected index list
is correctly generated.
The GX test logic does not do this validation, and thus
we need to make it manually.
Args:
configuration: Configuration used in the test.
metrics: Test result metrics.
runtime_configuration: Configuration used when running the expectation.
execution_engine: Execution Engine where the expectation was run.
Returns:
Dictionary with the result of the validation.
"""
return validate_result(self, configuration, metrics)
```
### Printing the Expectation Diagnostics
Your expectations **must** include the ability to call the Great Expectations diagnostic function in order to be validated.
In order to do this, the following code **must** be present.
```python
"""Mandatory block of code. If it is removed the expectation will not be available."""
if __name__ == "__main__":
# test the custom expectation with the function `print_diagnostic_checklist()`
ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()
```
## Creation Process
1) Create a branch from lakehouse engine.
2) Create a custom expectation with your specific logic:
1. All new expectations must be placed inside folder `/lakehouse_engine/dq_processors/custom_expectations`.
2. The name of the expectation must be added to the file `/lakehouse_engine/core/definitions.py`, to the variable: `CUSTOM_EXPECTATION_LIST` (see the sketch after this list).
3. All new expectations must be tested on `/tests/feature/custom_expectations/test_custom_expectations.py`.
In order to create a new test for your custom expectation it is necessary to:
- Copy one of the expectation folders in `tests/resources/feature/custom_expectations` renaming it to your custom expectation.
- Make any necessary changes on the data/schema file present.
- On `/tests/feature/custom_expectations/test_custom_expectations.py` add a scenario to test your expectation, all expectations
must be tested on batch and streaming. The test is implemented to generate an acon based on each scenario data.
- Test your developments to check that everything is working as intended.
3) When the development is completed, create a pull request with your changes.
4) Your expectation will be available with the next release of the lakehouse engine that happens after your pull request is approved.
This means that you need to upgrade your version of the lakehouse engine in order to use it.
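As a purely hypothetical illustration of step 2.2 above (the actual structure of `definitions.py` may differ), registering a new expectation could look like this:
```python
# Hypothetical excerpt of /lakehouse_engine/core/definitions.py (structure assumed):
# the module name of the new custom expectation is appended to the existing list.
CUSTOM_EXPECTATION_LIST = [
    "expect_column_pair_a_to_be_smaller_or_equal_than_b",
    "expect_column_length_match_input_length",  # newly added custom expectation
]
```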
## Usage
Custom Expectations are available to use like any other expectations provided by Great Expectations.
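For instance, a sketch of a `dq_functions` entry using the custom expectation defined above (column names are borrowed from the dummy_deliveries examples elsewhere in this documentation; the `margin` value is illustrative):
```python
# Sketch: calling the custom expectation inside a dq_spec, like any other GX expectation.
dq_functions = [
    {
        "function": "expect_column_pair_a_to_be_smaller_or_equal_than_b",
        "args": {"column_A": "salesorder", "column_B": "delivery_item", "margin": 0},
    },
]
```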
## Parameters
Depending on the type of expectation you are defining some parameters are expected by default.
Ex: A ColumnMapExpectation has a default "column" parameter.
### Mostly
[Mostly](https://docs.greatexpectations.io/docs/reference/learn/expectations/standard_arguments/#mostly) is a standard
parameter for a subset of expectations that is used to define a threshold for the failure of an expectation.
Ex: A mostly value of 0.7 makes it so that the expectation only fails if more than 30% of the records have
a negative result (i.e., fewer than 70% of the records pass).
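For example, a hedged sketch of a dq_function entry using mostly (function and column names are taken from other examples in this documentation):
```python
# Sketch: the standard "mostly" argument added to a row-level expectation.
# With mostly=0.7 the expectation only fails if fewer than 70% of the records pass it.
dq_function = {
    "function": "expect_column_values_to_not_be_null",
    "args": {"column": "delivery_date", "mostly": 0.7},
}
```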
## Result Format
Great Expectations has several different types of [result formats](https://docs.greatexpectations.io/docs/reference/learn/expectations/result_format/)
for the expectations results. The lakehouse engine requires the result format to be set to "COMPLETE" in order to tag
the lines where the expectations failed.
### `unexpected_index_column_names`
Inside this key you must define what columns are used as an index inside your data. If this is set and the result
format is set to "COMPLETE" a list with the indexes of the lines that failed the validation will be returned by
Great Expectations.
This information is used by the Lakehouse Engine to tag the lines in error after the fact. The additional tests
inside the `_validate` method verify that the custom expectation is tagging these lines correctly.
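A small sketch of how these two settings typically appear together (the primary key column name is a placeholder):
```python
# Sketch: COMPLETE result format plus the index columns used to identify failing rows,
# as in the custom expectation examples above.
result_format = {
    "result_format": "COMPLETE",
    "unexpected_index_column_names": ["my_pk_column"],
    "include_unexpected_rows": True,
}
```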
================================================
FILE: lakehouse_engine_usage/data_quality/data_quality.md
================================================
# Data Quality
The Data Quality framework is based on [Great Expectations (GX)](https://greatexpectations.io/) and other custom-made
developments, providing a very light abstraction on top of the GX open source framework and the Spark framework.
## How to use Data Quality?
### Data Loader
You can define data quality rules inside the DataLoader algorithm that you use to load data.
!!! note
The DataLoader algorithm allows you to store the results of the data quality checks inside your custom location
using the **result_sink** options (e.g., a delta table on your data product). Using result sink unlocks the
capability to store DQ results having history over all the DQ executions, which can be used for debugging,
to create **DQ dashboards** on top of the data, and much more.
**Examples**:
In these examples, dummy sales local data is used to cover a few example usages of the DQ Framework
(based on Great Expectations).
The main difference between the sample acons is on the usage of `dq_specs`.
- 1 - [Minimal Example applying DQ with the Required Parameters](minimal_example/minimal_example.md)
- 2 - [Configure Result Sink](result_sink/result_sink.md)
- 3 - [Validations Failing](validations_failing/validations_failing.md)
- 4 - [Row Tagging](row_tagging/row_tagging.md)
**Disclaimer:** even though the `"dq_type": "validator"` is still supported (as presented in this template),
our recommendation is to use `"dq_type": "prisma"`, which offers many more features end to end (from DQ rule
creation and execution to results analysis) and configurable central observability
with a standard dashboarding offering on top. The validator DQ type and the result_sink are still
supported for very specific use cases that might still exist and for which it might make sense to keep using
this approach. In case of doubt between the offerings, please feel free to reach out to us.
### Data Quality Validator
The DQValidator algorithm focuses on validating data (e.g., spark DataFrames, Files or Tables).
In contrast to the `dq_specs` inside the DataLoader algorithm, the DQValidator focuses on **validating data at rest
(post-mortem)** instead of validating data in-transit (before it is loaded to the destination).
!!! note
The DQValidator algorithm allows you to store the results of the data quality checks inside your custom location
using the **result_sink** options (e.g., a delta table on your data product). Using result sink unlocks the
capability to store DQ results having history over all the DQ executions, which can be used for debugging,
to create **DQ dashboards** on top of the data, and much more.
[Here you can find more information regarding DQValidator and examples](data_quality_validator/data_quality_validator.md).
### Reconciliator
Similarly to the [Data Quality Validator](#data-quality-validator) algorithm, the Reconciliator algorithm focuses on
validating data at rest (post-mortem). In contrast to the DQValidator algorithm, the Reconciliator always compares a
truth dataset (e.g., spark DataFrames, Files or Tables) with the current dataset (e.g., spark DataFrames, Files or
Tables), instead of executing DQ rules defined by the teams.
[Here you can find more information regarding reconciliator and examples](../reconciliator/reconciliator.md).
!!! note
Reconciliator does not use Great Expectations, therefore Data Docs, Result Sink, and other native methods are not available.
### Custom Expectations
If your data has a data quality check that cannot be done with the expectations provided by Great Expectations you
can create a custom expectation to make this verification.
!!! note
Before creating a custom expectation check if there is an expectation already created to address your needs,
both in Great Expectations and the Lakehouse Engine.
Any Custom Expectation that is too specific (using hardcoded table/column names) will be rejected.
**Expectations should be generic by definition.**
[Here you can find more information regarding custom expectations and examples](custom_expectations/custom_expectations.md).
### Row Tagging
The row tagging strategy allows users to tag the rows that failed, making it easier to identify the problems
in the validations. [Here you can find all the details and examples](row_tagging/row_tagging.md).
### Prisma
Prisma is part of the Lakehouse Engine DQ Framework, and it allows users to read DQ functions dynamically from a table instead of writing them explicitly in the Acons.
[Here you can find more information regarding Prisma](prisma/prisma.md).
## How to check the results of the Data Quality Process?
### 1. Table/location analysis
The possibility to configure a **Result Sink** allows you to store the history of executions of the DQ process.
You can query the table or the location to search through data and analyse history.
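For example (table and path names are placeholders, consistent with the Result Sink examples in this documentation):
```python
# Sketch: querying the result sink to inspect the history of DQ executions.
display(spark.table("my_database.dq_result_sink"))
display(spark.read.format("delta").load("my_dq_path/dq_result_sink/"))
```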
### 2. Power BI Dashboard
With the information expanded, interactive analysis can be built on top of the history of the DQ process.
A dashboard can be created with the results that we have in `dq_specs`. To be able to have this information you
need to use arguments `result_sink_db_table` and/or `result_sink_location`.
With a dashboard, the runs and expectations can be analysed and filtered by year, month, source and
run name, and you will have information about the number of runs, some statistics, the status of expectations and more.
Analysis such as biggest failures per expectation type, biggest failures by columns, biggest failures per source,
and others can be made, using the information in the `result_sink_db_table`/`result_sink_location`.
!!! note
The recommendation is to use the same result sink table/location for all your dq_specs and
in the dashboard you will get a preview of the status of all of them.
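As a hedged sketch of the kind of analysis mentioned above (column names such as `source`, `expectation_type` and `expectation_success` come from the exploded result sink schema shown in the Result Sink documentation; column types are assumed):
```python
# Sketch: biggest failures per source and expectation type, computed over the result sink.
from pyspark.sql import functions as F

result_sink_df = spark.table("my_database.dq_result_sink")
display(
    result_sink_df.filter(F.col("expectation_success") == F.lit(False))
    .groupBy("source", "expectation_type")
    .count()
    .orderBy(F.col("count").desc())
)
```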
================================================
FILE: lakehouse_engine_usage/data_quality/data_quality_validator/__init__.py
================================================
"""
.. include::data_quality_validator.md
"""
================================================
FILE: lakehouse_engine_usage/data_quality/data_quality_validator/data_quality_validator.md
================================================
# Data Quality Validator
The DQValidator algorithm allows DQ validations isolated from the data load (it only reads data and applies data quality validations).
With this algorithm you have the capacity to apply the Lakehouse-Engine Data Quality Process,
using [Great Expectations](https://greatexpectations.io/expectations/) functions directly on a specific dataset, while also
making use of all the [InputSpecs](../../../reference/packages/core/definitions.md#packages.core.definitions.InputSpec) available in the engine.
Validating data quality with this algorithm is a matter of defining the data you want to read and the validations you want to apply to your data, detailing the Great Expectations functions you want to use to assess its quality.
!!! warning
**This algorithm also gives the possibility to restore a previous version of a delta table or delta files in case the DQ
process raises any exception. Please use it carefully!!** You may lose important commits and data. Moreover, this will
highly depend on the frequency that you run your Data Quality validations. If you run your data loads daily and Data
Quality validations weekly, and you define the restore_prev_version to true, this means that the table will be restored
to the previous version, but the error could have happened 4 or 5 versions before.
## When to use?
- **Post-Load validation**: check quality of data already loaded to a table/location
- **Pre-Load validation**: check quality of the data you want to load (check DQ by reading a set of files in a specific
location...)
- **Validation of a DataFrame computed in the notebook itself** (e.g. check data quality after joining or filtering
datasets, using the computed DataFrame as input for the validation)
This algorithm also gives teams some freedom to:
- **Schedule isolated DQ Validations to run periodically**, with the frequency they need;
- Define a DQ Validation process **as an end-to-end test** of the respective data product.
## How to use?
All of these configurations are passed via the ACON to instantiate
a [DQValidatorSpec object](../../../reference/packages/core/definitions.md#packages.core.definitions.DQValidatorSpec). The DQValidator algorithm uses an
ACON to configure its execution. In [DQValidatorSpec](../../../reference/packages/core/definitions.md#packages.core.definitions.DQValidatorSpec) you can
find the meaning of each ACON property.
Here is an example of ACON configuration:
```python
from lakehouse_engine.engine import execute_dq_validation
acon = {
"input_spec": {
"spec_id": "sales_source",
"read_type": "batch",
"data_format": "table",
"db_table": "my_database.my_table"
},
"dq_spec": {
"spec_id": "dq_sales",
"input_id": "sales_source",
"dq_type": "validator",
"store_backend": "file_system",
"local_fs_root_dir": "/app/tests/lakehouse/in/feature/dq_validator/dq",
"result_sink_db_table": "my_database.dq_validator",
"result_sink_format": "json",
"fail_on_error": False,
"dq_functions": [
{"function": "expect_column_to_exist", "args": {"column": "article"}},
{
"function": "expect_table_row_count_to_be_between",
"args": {"min_value": 3, "max_value": 11},
},
],
},
"restore_prev_version": True,
}
execute_dq_validation(acon=acon)
```
On this page you will also find the following examples of usage:
1. Dataframe as input & Success on the DQ Validation
2. Table as input & Failure on DQ Validation & Restore previous version
3. Files as input & Failure on DQ Validation & Fail_on_error disabled
4. Files as input & Failure on DQ Validation & Critical functions defined
5. Files as input & Failure on DQ Validation & Max failure percentage defined
### Example 1 : Dataframe as input & Success on the DQ Validation
This example focuses on using a dataframe, computed in this notebook, directly in the input spec. First, a new
DataFrame is generated as a result of the join of data from two tables (dummy_deliveries and dummy_pd_article) and
some DQ Validations are applied on top of this dataframe.
```python
from lakehouse_engine.engine import execute_dq_validation
input_df = spark.sql("""
SELECT a.*, b.article_category, b.article_color
FROM my_database.dummy_deliveries a
JOIN my_database.dummy_pd_article b
ON a.article_id = b.article_id
"""
)
acon = {
"input_spec": {
"spec_id": "deliveries_article_input",
"read_type": "batch",
"data_format": "dataframe",
"df_name": input_df,
},
"dq_spec": {
"spec_id": "deliveries_article_dq",
"input_id": "deliveries_article_input",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"result_sink_db_table": "my_database.dq_validator_deliveries",
"result_sink_location": "my_dq_path/dq_validator/dq_validator_deliveries/",
"expectations_store_prefix": "dq/dq_validator/expectations/",
"validations_store_prefix": "dq/dq_validator/validations/",
"checkpoint_store_prefix": "dq/dq_validator/checkpoints/",
"unexpected_rows_pk": ["salesorder", "delivery_item", "article_id"],
"dq_functions": [{"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}}],
},
"restore_prev_version": False,
}
execute_dq_validation(acon=acon)
```
### Example 2: Table as input & Failure on DQ Validation & Restore previous version
In this example we are using a table as input to validate the data that was loaded. Here, we are forcing the DQ Validations to fail in order to show the possibility of restoring the table to the previous version.
!!! warning
**Be careful when using the feature of restoring a previous version of a delta table or delta files.** You may
lose important commits and data. Moreover, this will highly depend on the frequency that you run your Data Quality
validations. If you run your data loads daily and Data Quality validations weekly, and you define the
restore_prev_version to true, this means that the table will be restored to the previous version, but the error
could have happened 4 or 5 versions before (because loads are daily, validations are weekly).
Steps followed in this example to show how the restore_prev_version feature works.
1. **Insert rows into the dummy_deliveries table** to adjust the total numbers of rows and **make the DQ process fail**.
2. **Use the "DESCRIBE HISTORY" statement to check the number of versions available on the table** and check the version
number resulting from the insertion to the table.
3. **Execute the DQ Validation**, using the configured acon (based on reading the dummy_deliveries table and setting the
`restore_prev_version` to `true`). Checking the logs of the process, you can see that the data did not pass all the
expectations defined and that the table version restore process was triggered.
4. **Re-run a "DESCRIBE HISTORY" statement to check that the previous version of the table was restored** and thus, the row inserted in the beginning of the process is no longer present in the table.
```python
from lakehouse_engine.engine import execute_dq_validation
# Force failure of data quality by adding new row
spark.sql("""INSERT INTO my_database.dummy_deliveries VALUES (7, 1, 20180601, 71, "article1", "delivered")""")
# Check history of the table
spark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")
acon = {
"input_spec": {
"spec_id": "deliveries_input",
"read_type": "batch",
"db_table": "my_database.dummy_deliveries",
},
"dq_spec": {
"spec_id": "dq_deliveries",
"input_id": "deliveries_input",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"dq_functions": [
{"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 19}},
],
},
"restore_prev_version": True,
}
execute_dq_validation(acon=acon)
# Check that the previous version of the table was restored
spark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")
```
### Example 3: Files as input & Failure on DQ Validation & Fail_on_error disabled
In this example we are using a location as input to validate the files in a specific folder.
Here, we are forcing the DQ Validations to fail, however disabling the "fail_on_error" configuration,
so the algorithm warns about the expectations that failed but the process/the execution of the algorithm doesn't fail.
```python
from lakehouse_engine.engine import execute_dq_validation
acon = {
"input_spec": {
"spec_id": "deliveries_input",
"data_format": "delta",
"read_type": "streaming",
"location": "s3://my_data_product_bucket/silver/dummy_deliveries/",
},
"dq_spec": {
"spec_id": "dq_deliveries",
"input_id": "deliveries_input",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"fail_on_error": False,
"dq_functions": [
{"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},
],
},
"restore_prev_version": False,
}
execute_dq_validation(acon=acon)
```
### Example 4: Files as input & Failure on DQ Validation & Critical functions defined
In this example we are using a location as input to validate the files in a specific folder.
Here, we are forcing the DQ Validations to fail by using the critical functions feature, which will throw an error
if any of the functions fails.
```python
from lakehouse_engine.engine import execute_dq_validation
acon = {
"input_spec": {
"spec_id": "deliveries_input",
"data_format": "delta",
"read_type": "streaming",
"location": "s3://my_data_product_bucket/silver/dummy_deliveries/",
},
"dq_spec": {
"spec_id": "dq_deliveries",
"input_id": "deliveries_input",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"fail_on_error": True,
"dq_functions": [
{"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
],
"critical_functions": [
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},
],
},
"restore_prev_version": False,
}
execute_dq_validation(acon=acon)
```
### Example 5: Files as input & Failure on DQ Validation & Max failure percentage defined
In this example we are using a location as input to validate the files in a specific folder.
Here, we are forcing the DQ Validations to fail by using the max_percentage_failure,
which will throw an error if the percentage of failures surpasses the defined maximum threshold.
```python
from lakehouse_engine.engine import execute_dq_validation
acon = {
"input_spec": {
"spec_id": "deliveries_input",
"data_format": "delta",
"read_type": "streaming",
"location": "s3://my_data_product_bucket/silver/dummy_deliveries/",
},
"dq_spec": {
"spec_id": "dq_deliveries",
"input_id": "deliveries_input",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"fail_on_error": True,
"dq_functions": [
{"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 17}},
],
"max_percentage_failure": 0.2,
},
"restore_prev_version": False,
}
execute_dq_validation(acon=acon)
```
## Limitations
Unlike the DataLoader, the DQValidator algorithm only allows, for now, one input_spec (instead of a list of input_specs) and one dq_spec (instead of a list of dq_specs). There are plans and efforts already initiated to support lists of input_specs and dq_specs. However, you can prepare a DataFrame which joins more than one source and use it as input, in case you need to assess the Data Quality of different sources at the same time. Alternatively, you can also show interest in any enhancement of this feature, as well as contributing yourself.
================================================
FILE: lakehouse_engine_usage/data_quality/minimal_example/__init__.py
================================================
"""
.. include::minimal_example.md
"""
================================================
FILE: lakehouse_engine_usage/data_quality/minimal_example/minimal_example.md
================================================
# Minimal Example
This scenario illustrates the minimal configuration that you can have to use `dq_specs`, in which
it uses the required parameters: `spec_id`, `input_id`, `dq_type`, `bucket`, `dq_functions`.
Regarding the dq_functions, it uses 3 functions (retrieved from the expectations supported by GX), which check:
- **expect_column_to_exist** - if a column exists in the data;
- **expect_table_row_count_to_be_between** - if the row count of the data is between the defined interval;
- **expect_table_column_count_to_be_between** - if the number of columns in the data is below the defined maximum value.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True,
},
"location": "s3://my_data_product_bucket/dummy_deliveries/",
}
],
"dq_specs": [
{
"spec_id": "dq_validator",
"input_id": "dummy_deliveries_source",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"dq_functions": [
{"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},
{"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},
],
}
],
"output_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"input_id": "dq_validator",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",
}
],
}
load_data(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/data_quality/prisma/__init__.py
================================================
"""
.. include::prisma.md
"""
================================================
FILE: lakehouse_engine_usage/data_quality/prisma/prisma.md
================================================
# Prisma
Prisma is part of the Lakehouse Engine DQ Framework, and it allows users to read DQ functions dynamically from a table instead of writing them explicitly in the Acons.
## How to use Prisma?
- Use the Lakehouse Engine version: 1.22.0 or later;
- Use DBR 13.3 or later. If you are not using Databricks, ensure a similar environment with Spark 3.4.1 and Delta 2.4.0.
- Create the DQ Checks in a table in your Data Product:
- Each data quality check conducted in Prisma will be hosted within the bucket defined in the engine config file (lakehouse_engine/configs/engine.yaml). Consequently, the result sink location will receive the results of their assessments at the granularity of each "run", capturing all records generated during every operation. The DQ Checks table is located in the demanding data product and can have any name (e.g., data_quality_checks).
- The idea is for it to be a central bucket for all DPs to ensure easier and better observability and unlock offering of easier insights over the Data Quality of the Lakehouse.
Below you find a DDL example with the expected schema and description for the fields:
```sql
DROP TABLE IF EXISTS my_database.data_quality_checks;
CREATE EXTERNAL TABLE my_database.data_quality_checks (
dq_rule_id STRING COMMENT 'DQ Rule ID.',
dq_tech_function STRING COMMENT 'Great Expectations function type to apply according to the DQ rules type. Example: expect_column_to_exist.',
execution_point STRING COMMENT 'In motion/At rest.',
schema STRING COMMENT 'The database schema on which the check is to be applied.',
table STRING COMMENT 'The table on which the check is to be applied.',
column STRING COMMENT 'The column (either on Lakehouse or in other accessible source systems, such as FDP or SAP BW) on which the check is to be applied.',
filters STRING COMMENT 'General filters to the data set (where part of the statement). Note: this is purely descriptive at this point as there is no automated action/filtering of the Lakehouse Engine or PRISMA upon it.',
arguments STRING COMMENT 'Additional arguments to run the Great Expectation Function in the same order as they appear in the function. Example: {"column": "amount", "min_value": 0}.',
dimension STRING COMMENT 'Data Quality dimension.'
)
USING DELTA
LOCATION 's3://my-data-product-bucket/inbound/data_quality_checks'
COMMENT 'Table with dummy data mapping DQ Checks.'
TBLPROPERTIES(
'lakehouse.primary_key'='dq_rule_id',
'delta.enableChangeDataFeed'='true'
)
```
**Data sample:**
| dq_rule_id | dq_tech_function | execution_point | schema | table | column | filters | arguments | dimension |
|------------|:------------------------------------------|:----------------|:-------------------|:------------|:-------------|:--------|:--------------------------------------------------|--------------|
| 1 | expect_column_values_to_not_be_null | at_rest | my_database_schema | dummy_sales | ordered_item | | {"column": "ordered_item"} | Completeness |
| 2 | expect_column_min_to_be_between | in_motion | my_database_schema | dummy_sales | ordered_item | | {"column": "amount", "min_value": 0} | Completeness |
| 3 | expect_column_values_to_not_be_in_set | in_motion | my_database_schema | dummy_sales | ordered_item | | {"column": "amount", "value_set": [1,2,3]} | Completeness |
| 4 | expect_column_pair_a_to_be_not_equal_to_b | at_rest | my_database_schema | dummy_sales | ordered_item | | {"column_A": "amount","column_B": "ordered_item"} | Completeness |
| 5 | expect_table_row_count_to_be_between | at_rest | my_database_schema | dummy_sales | ordered_item | | {"min_value": 1, "max_value": 10} | Completeness |
**Table definition:**
| Column Name | Definition |
|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| dq_rule_id | The identifier of a data quality rule. |
| dq_tech_function | Type of Great Expectations function to apply according to the DQ rules type. See the values here: [Gallery of Expectations and Packages](https://greatexpectations.io/legacy/v1/expectations/?filterType=Backend+support&viewType=Summary&showFilters=true&subFilterValues=spark) |
| execution_point  | The way validations will be performed on top of the data set. List of values: at_rest, in_motion. |
| schema | The schema on which the check is to be applied. |
| table | The table on which the check is to be applied. |
| column | The column on which the check is to be applied. |
| filters | General filters to the data set (where part of the statement). **Note**: this is purely descriptive at this point as there is no automated action/filtering of the Lakehouse Engine or PRISMA upon it. |
| arguments | Additional arguments to run the Great Expectation Function in the same order as they appear in the function. |
| dimension | Categorisation of a DQ rule related to one of the dimensions. List of values: Completeness, Uniqueness, Timeliness, Validity, Consistency, Accuracy. **Note**: these values are purely descriptive. |
**Execution behaviour** - The value of the **execution_point** column determines the type of Acon execution:
- **For records at_rest**, they will only be processed when the Lakehouse engine is called by the execute_dq_validation() function.
- **For records in_motion**, they will only be processed when the Lakehouse engine is called by load_data() function.
## What are the main changes on my ACON if I already implemented DQ?
The following configurations represent the minimum requirements to make Prisma DQ work.
- **dq_type:** "prisma" - the value must be set in order for the engine to process the DQ with Prisma;
- **store_backend:** "file_system" or "s3" - which store backend to use;
- **bucket** - the bucket name to consider for the store_backend (store DQ artefacts). **Note**: only applicable and mandatory for store_backend s3.
- **local_fs_root_dir:** path of the root directory. **Note**: only applicable for store_backend file_system;
- **dq_db_table:** the DQ Check table that is located in the demanding data product;
- **dq_table_table_filter:** name of the table whose rules are to be applied in the validations. The table name must match the values inserted in the column "table" of dq_db_table;
- **data_product_name:** the name of the data product;
- **tbl_to_derive_pk or unexpected_rows_pk:**
- tbl_to_derive_pk - automatically derive the primary keys from a given database table. **Note**: the primary keys are derived from the **lakehouse.primary_key** property of a table.
- unexpected_rows_pk - the list of columns composing the primary key of the source data to identify the rows failing the DQ validations.
**DQ Prisma Acon example**
```python
"dq_specs": [
{
"spec_id": "dq_validator_in_motion",
"input_id": "dummy_sales_transform",
"dq_type": "prisma",
"store_backend": "file_system",
"local_fs_root_dir": "/my-data-product/artefacts/dq",
"dq_db_table": DQ_DB_TABLE,
"dq_table_table_filter": "dummy_sales",
"data_product_name": DATA_PRODUCT_NAME,
"tbl_to_derive_pk": DB_TABLE,
}
],
```
!!! note
Available extra parameters to use in the DQ Specs for Prisma:
- **data_docs_local_fs** - the path for data docs. The parameter is useful in case you want your DQ Results to be reflected on the automatic Data Docs site;
- **data_docs_prefix** - prefix where to store data_docs' data. This parameter must be used together with `data_docs_local_fs`;
- **dq_table_extra_filters** - extra filters to be used when deriving DQ functions. This is an SQL expression to be applied to `dq_db_table` which means that the statements must use one of the available columns in the table. For example: dq_rule_id in ('rule1','rule2');
- **data_docs_bucket** - the bucket name for data docs only. When defined, it will supersede bucket parameter. **Note:** only applicable for store_backend s3;
- **expectations_store_prefix** - prefix where to store expectations' data. **Note:** only applicable for store_backend s3;
- **validations_store_prefix** - prefix where to store validations' data. **Note:** only applicable for store_backend s3;
- **checkpoint_store_prefix** - prefix where to store checkpoints' data. **Note:** only applicable for store_backend s3;
## End2End Example
Below you can also find an End2End and detailed example of loading data into the DQ Checks table and then using PRISMA both with load_data() and execute_dq_validation().
??? example "**1 - Load the DQ Checks Table**"
This example shows how to insert data into the data_quality_checks table using an Acon with a csv file as a source.
The location provided is just an example of a place to store the csv. It is also important that the source file contains the **data_quality_checks** schema.
```python
acon = {
"input_specs": [
{
"spec_id": "read_dq_checks",
"read_type": "batch",
"data_format": "csv",
"options": {"header": True, "delimiter": ";"},
"location": "s3://my-data-product/local_data/data_quality_checks/",
}
],
"output_specs": [
{
"spec_id": "write_dq_checks",
"input_id": "read_dq_checks",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my-data-product-bucket/inbound/data_quality_checks",
}
],
}
load_data(acon=acon)
```
??? example "**2 - PRISMA - IN MOTION (load_data)**"
```python
cols_to_rename = {"item": "ordered_item", "date": "order_date", "article": "article_id"}
acon = {
"input_specs": [
{
"spec_id": "dummy_sales_bronze",
"read_type": "batch",
"data_format": "delta",
"location": "s3://my-data-product-bucket/bronze/dummy_sales",
}
],
"transform_specs": [
{
"spec_id": "dummy_sales_transform",
"input_id": "dummy_sales_bronze",
"transformers": [
{
"function": "rename",
"args": {
"cols": cols_to_rename,
},
},
],
}
],
"dq_specs": [
{
"spec_id": "dq_validator_in_motion",
"input_id": "dummy_sales_transform",
"dq_type": "prisma",
"store_backend": "file_system",
"local_fs_root_dir": "/my-data-product/artefacts/dq",
"dq_db_table": DQ_DB_TABLE,
"dq_table_table_filter": "dummy_sales",
"dq_table_extra_filters": "1 = 1",
"data_docs_local_fs": "my-data-product/my-data-product-dq-site",
"data_docs_prefix": "{}/my-data-product-bucket/data_docs/site/".format(DQ_PREFIX),
"data_product_name": DATA_PRODUCT_NAME,
"tbl_to_derive_pk": DB_TABLE,
}
],
"output_specs": [
{
"spec_id": "dummy_sales_silver",
"input_id": "dq_validator_in_motion",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my-data-product-bucket/silver/dummy_sales_dq_template_in_motion",
}
],
}
load_data(acon=acon)
```
??? example "**3 - PRISMA - AT REST (execute_dq_validation)**"
```python
acon = {
"input_spec": {
"spec_id": "dummy_sales_source",
"read_type": "batch",
"db_table": DB_TABLE,
},
"dq_spec": {
"spec_id": "dq_validator_at_rest",
"input_id": "sales_input",
"dq_type": "prisma",
"store_backend": "file_system",
"local_fs_root_dir": "/my-data-product/artefacts/dq",
"dq_db_table": DQ_DB_TABLE,
"dq_table_table_filter": "dummy_sales",
"data_docs_local_fs": "my-data-product/my-data-product-dq-site",
"data_docs_prefix": "{}/my-data-product-bucket/data_docs/site/".format(DQ_PREFIX),
"data_product_name": DATA_PRODUCT_NAME,
"tbl_to_derive_pk": DB_TABLE,
},
}
execute_dq_validation(acon=acon)
```
## Troubleshooting/Common issues
This section provides a summary of common issues and resolutions.
??? warning "**Error type: filter does not get rules from DQ Checks table.**"
**Solution**: make sure the records in your DQ Checks table are well-defined. In the Acon, ensure that you have the dq_table_table_filter with the correct table name.
??? warning "**Error type: missing expectation.**"
**Solution**: make sure that you are using a valid expectation. See the valid ones on: [Gallery of Expectations and Packages](https://greatexpectations.io/legacy/v1/expectations/?filterType=Backend+support&viewType=Summary&showFilters=true&subFilterValues=spark)
??? warning "**Error type: missing expectation parameters.**"
**Solution**: make sure that your "arguments" column in the DQ CHECKS table has all necessary parameters for the expectation. For example, the expectation [expect_column_values_to_not_be_null](https://greatexpectations.io/legacy/v1/expectations/expect_column_values_to_not_be_null?filterType=Backend%20support&gotoPage=1&showFilters=true&viewType=Summary&subFilterValues=spark) needs one argument (column (str): The column name).
================================================
FILE: lakehouse_engine_usage/data_quality/result_sink/__init__.py
================================================
"""
.. include::result_sink.md
"""
================================================
FILE: lakehouse_engine_usage/data_quality/result_sink/result_sink.md
================================================
# Result Sink
These scenarios store the results of the dq_specs into a result sink. For that, both scenarios include parameters defining
the specific table and location (`result_sink_db_table` and `result_sink_location`) where the results
are expected to be stored. With this configuration, people can, later on, check the history of the DQ
executions using the configured table/location, as shown below. You can configure saving the output of the
results in the result sink following two approaches:
- [**Denormalized/exploded Data Model (recommended)**](#1-result-sink-exploded-recommended) - the results are stored in a detailed format in which
people are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments.
| ... | source | column | max_value | min_value | expectation_type | expectation_success | observed_value | run_time_year | ... |
|-----------------------------|------------|------------|-----------|-----------|-----------------------------------------|---------------------|----------------|---------------|-----|
| all columns from raw + more | deliveries | salesorder | null | null | expect_column_to_exist | TRUE | null | 2023 | ... |
| all columns from raw + more | deliveries | null | null | null | expect_table_row_count_to_be_between | TRUE | 23 | 2023 | ... |
| all columns from raw + more | deliveries | null | null | null | expect_table_column_count_to_be_between | TRUE | 6 | 2023 | ... |
- [**Raw Format Data Model (not recommended)**](#2-raw-result-sink) - the results are stored in the raw format that Great
Expectations outputs. This is not recommended as the data will be highly nested and in a
string format (to prevent problems with schema changes), which makes analysis and the creation of a dashboard on top way
harder.
| checkpoint_config | run_name | run_time | run_results | success | validation_result_identifier | spec_id | input_id |
|----------------------|----------------------------|----------------------------------|-------------------------------|------------------------|------------------------------|---------|----------|
| entire configuration | 20230323-...-dq_validation | 2023-03-23T15:11:32.225354+00:00 | results of the 3 expectations | true/false for the run | identifier | spec_id | input_id |
!!! note
- More configurations can be applied in the result sink, such as the file format and partitions.
- It is recommended to:
- Use the same result sink table/location for all dq_specs across different data loads, from different
sources, in the same Data Product.
- Use the parameter `source` (only available with `"result_sink_explode": True`) in the dq_specs, as
used in both scenarios, with the name of the data source, to make it easier to distinguish sources in the
analysis. If not specified, the `input_id` of the dq_spec will be considered as the `source`.
- These recommendations will enable more rich analysis/dashboard at Data Product level, considering
all the different sources and data loads that the Data Product is having.
## 1. Result Sink Exploded (Recommended)
This scenario stores DQ Results (results produced by the execution of the dq_specs) in the Result Sink,
in a detailed format, in which people are able to analyse them by Data Quality Run, by expectation_type and
by keyword arguments. This is the recommended approach since it makes the analysis on top of the result
sink way easier and faster.
For achieving the exploded data model, this scenario introduces the parameter `result_sink_explode`, which
is a flag to determine if the output table/location should have the columns exploded (as `True`) or
not (as `False`). **Default:** `True`, but it is still provided explicitly in this scenario for demo purposes.
The table/location will include a schema which contains general columns, statistic columns, arguments of
expectations, and others; thus, part of the schema will always be populated, while the other part will depend on
the expectations chosen.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True,
},
"location": "s3://my_data_product_bucket/dummy_deliveries/",
}
],
"dq_specs": [
{
"spec_id": "dq_validator",
"input_id": "dummy_deliveries_source",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"result_sink_db_table": "my_database.dq_result_sink",
"result_sink_location": "my_dq_path/dq_result_sink/",
"result_sink_explode": True,
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"source": "deliveries_success",
"dq_functions": [
{"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},
{"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},
],
}
],
"output_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"input_id": "dq_validator",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",
}
],
}
load_data(acon=acon)
```
To check the history of the DQ results, you can run commands like:
- the table: `display(spark.table("my_database.dq_result_sink"))`
- the location: `display(spark.read.format("delta").load("my_dq_path/dq_result_sink/"))`
## 2. Raw Result Sink
This scenario is very similar to the previous one, but it changes the parameter `result_sink_explode` to `False` so that
it produces a raw result sink output containing only one row representing the full run of `dq_specs` (no
matter the amount of expectations/dq_functions defined there). Being a raw output, **it is not a
recommended approach**, as it will be more complicated to analyse and make queries on top of it.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True,
},
"location": "s3://my_data_product_bucket/dummy_deliveries/",
}
],
"dq_specs": [
{
"spec_id": "dq_validator",
"input_id": "dummy_deliveries_source",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"result_sink_db_table": "my_database.dq_result_sink_raw",
"result_sink_location": "my_dq_path/dq_result_sink_raw/",
"result_sink_explode": False,
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"source": "deliveries_success_raw",
"dq_functions": [
{"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},
{"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},
],
}
],
"output_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"input_id": "dq_validator",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",
}
],
}
load_data(acon=acon)
```
To check the history of the DQ results, you can run commands like:
- the table: `display(spark.table("my_database.dq_result_sink_raw"))`
- the location: `display(spark.read.format("delta").load("my_dq_path/dq_result_sink_raw/"))`
================================================
FILE: lakehouse_engine_usage/data_quality/row_tagging/__init__.py
================================================
"""
.. include::row_tagging.md
"""
================================================
FILE: lakehouse_engine_usage/data_quality/row_tagging/row_tagging.md
================================================
# Row Tagging
Data quality is essential for any organisation that relies on data to make informed decisions.
High-quality data provides accurate, reliable, and timely information that enables organisations to identify
opportunities, mitigate risks, and optimize their operations. In contrast, low-quality data can lead to incorrect
conclusions, faulty decisions, and wasted resources.
There are several common issues that can compromise data quality, such as:
- data entry errors;
- data duplication;
- incomplete / inconsistent data;
- changes where data is collected (e.g. sources);
- faulty data processing, such as inaccurate data cleansing or transformations.
Therefore, implementing data quality controls, such as data validation rules, and regularly monitoring data for
accuracy and completeness is key for any organisation.
One of these controls that can be applied is the **DQ Row Tagging Strategy** so that you not only apply validations on
your data to ensure Data Quality, but you also tag your data with the results of the Data Quality validations
providing advantages like:
- Transparency for downstream and upstream consumers;
- Data Observability and Reliability;
- More trust over the data;
- Anomaly Detection;
- Easier and faster discovery of Data Quality problems and, consequently, faster resolution;
- Makes it easier to deal with integrations with other systems and migrations (you can have validations capturing that a column was changed or simply disappeared);
!!! note
When using the DQ Row Tagging approach data availability will take precedence over Data Quality, meaning
that all the data will be introduced into the final target (e.g. table or location) no matter what Data Quality
issues it is having.
Different Types of Expectations:
- Table Level
- Column Aggregated Level
- Query Level
- Column Values (**row level**)
- Column Pair Value (**row level**)
- Multicolumn Values (**row level**)
The expectations highlighted as **row level** are the ones that enable tagging failures on specific rows and adding
the details about each failure (they affect the field **run_row_result** inside **dq_validations**). The expectations
with other levels (not row level) influence the overall result of the Data Quality execution, but won't be used to tag
specific rows (they affect the field **run_success** only, so you can even have situations for which you get
**run_success False** and **run_row_success True** for all rows).
## How does the Strategy work?
The strategy relies mostly on the 6 arguments below.
!!! note
When you specify `"tag_source_data": True` the arguments **fail_on_error**, **gx_result_format** and
**result_sink_explode** are set to the expected values.
- **unexpected_rows_pk** - the list of columns composing the primary key of the source data, used to identify the rows
failing the DQ validations.
- **tbl_to_derive_pk** - `db.table` to automatically derive the unexpected_rows_pk from.
- **gx_result_format** - great expectations result format. Default: `COMPLETE`.
- **tag_source_data** - flag to enable the tagging strategy in the source data, adding the information of
the DQ results in a column `dq_validations`. This column makes it possible to identify if the DQ run was
successful in general and, if not, it unlocks the insights to know what specific rows have made the DQ validations
fail and why. Default: `False`.
!!! note
It only works if result_sink_explode is `True`, result_format is `COMPLETE` and
fail_on_error is `False`.
- **fail_on_error** - whether to fail the algorithm if the validations of your data in the DQ process failed.
- **result_sink_explode** - flag to determine if the output table/location should have the columns exploded (as `True`)
or not (as `False`). Default: `True`.
!!! note
It is mandatory to provide one of the arguments (**unexpected_rows_pk** or **tbl_to_derive_pk**) when using
**tag_source_data** as **True**.
When **tag_source_data** is **False**, this is not mandatory, but **still recommended**.
!!! note
The tagging strategy only works when `tag_source_data` is `True`, which automatically
assigns the expected values for the parameters `result_sink_explode` (True), `fail_on_error` (False)
and `gx_result_format` ("COMPLETE").
!!! note
For the DQ Row Tagging to work, in addition to configuring the aforementioned arguments in the dq_specs,
you will also need to add the **dq_validations** field into your table (your DDL statements, **recommended**) or
enable schema evolution.
!!! note
The kwargs field is a string because it can assume different schemas for different expectations and runs.
It is useful to provide the complete picture of the **row level failure** and to allow filtering/joining with
the result sink table, when there is one. Some examples of kwargs below:
- `{"column": "country", "min_value": 1, "max_value": 2, "batch_id": "o723491yyr507ho4nf3"}` → example for
expectations starting with `expect_column_values` (they always make use of "column", the other arguments vary).
- `{"column_A: "country", "column_B": "city", "batch_id": "o723491yyr507ho4nf3"}` → example for expectations
starting with `expect_column_pair` (they make use of "column_A" and "column_B", the other arguments vary).
- `{"column_list": ["col1", "col2", "col3"], "batch_id": "o723491yyr507ho4nf3"}` → example for expectations
starting with `expect_multicolumn` (they make use of "column_list", the other arguments vary).
`batch_id` is common to all expectations, and it is an identifier for the batch of data being validated by
Great Expectations.
### Example
This scenario uses the row tagging strategy, which allows users to tag the rows that failed, making it easier to
identify the problems in the validations.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True,
},
"location": "s3://my_data_product_bucket/dummy_deliveries/",
}
],
"dq_specs": [
{
"spec_id": "dq_validator",
"input_id": "dummy_deliveries_source",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"result_sink_db_table": "my_database.dq_result_sink",
"result_sink_location": "my_dq_path/dq_result_sink/",
"tag_source_data": True,
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"source": "deliveries_tag",
"dq_functions": [
{"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},
{
"function": "expect_column_values_to_be_in_set",
"args": {"column": "salesorder", "value_set": ["37"]},
},
{
"function": "expect_column_pair_a_to_be_smaller_or_equal_than_b",
"args": {"column_A": "salesorder", "column_B": "delivery_item"},
},
{
"function": "expect_multicolumn_sum_to_equal",
"args": {"column_list": ["salesorder", "delivery_item"], "sum_total": 100},
},
],
"critical_functions": [
{"function": "expect_table_column_count_to_be_between", "args": {"max_value": 6}},
],
}
],
"output_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"input_id": "dq_validator",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",
}
],
}
load_data(acon=acon)
```
Running the cell below shows the new column created, named `dq_validations`, with information about the DQ validations.
`display(spark.read.format("delta").load("s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/"))`
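If you want to focus on the rows that were tagged as failing, a minimal sketch (assuming the `dq_validations` struct exposes the `run_row_success` flag described above) could be:

```python
from pyspark.sql import functions as F

# Re-read the tagged output and keep only the rows whose row-level validations failed.
# Assumes dq_validations contains the run_row_success flag mentioned in this page.
df = spark.read.format("delta").load(
    "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/"
)
failed_rows = df.filter(F.col("dq_validations.run_row_success") == False)  # noqa: E712
display(failed_rows.select("salesorder", "delivery_item", "dq_validations"))
```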
## Performance and Limitations Trade-offs
When using the DQ Row Tagging Strategy, by default we are using Great Expectations Result Format "Complete" with
Unexpected Index Column Names (a primary key for the failures), meaning that for each failure, we are getting all
the distinct values for the primary key. After getting all the failures, we are applying some needed transformations
and joining them with the source data, so that it can be tagged by filling the "dq_validations" column.
Hence, this can be a heavy and time-consuming operation on your data loads. To reduce this disadvantage
you can cache the dataframe by passing `"cache_df": True` in your DQ Specs. In addition, keep in
mind that each expectation (dq_function) you add to your DQ Specs adds more time to your
data loads, so always balance performance against the amount of validations that you need.
Moreover, Great Expectations is currently relying on the driver node to capture the results of the execution and
return/store them. Thus, in case you have huge amounts of rows failing (let's say 500k or more) Great Expectations
might raise exceptions.
In these situations, the data load will still happen and the data will still be tagged with the Data Quality
validations information; however, you won't have the complete picture of the failures, so the raised_exceptions
field is set to True, so that you can easily notice it and debug it.
Most of the time, such a high number of failing rows will mean that something went wrong and you want to fix it
as soon as possible (tagging specific rows matters less at that point, because you will not want your consumers
to be consuming a million defective rows). However, if you still want to try to make it pass, you can increase
the size of your driver and tune some Spark configurations like:
- `spark.driver.maxResultSize`
- `spark.task.maxFailures`
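These properties cannot be changed on a running session; on Databricks they are typically set in the cluster's Spark configuration. Purely as an illustrative sketch (values are arbitrary), a session could be created with:

```python
from pyspark.sql import SparkSession

# Illustrative only: driver/task resilience settings must be defined at session/cluster
# creation time (e.g., in the cluster's Spark config), not changed afterwards.
spark = (
    SparkSession.builder
    .config("spark.driver.maxResultSize", "8g")
    .config("spark.task.maxFailures", "8")
    .getOrCreate()
)
```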
For debugging purposes, you can also use a different [Great Expectations Result Format](
https://docs.greatexpectations.io/docs/reference/expectations/result_format/) like "SUMMARY" (adding in your DQ Spec
`"gx_result_format": "SUMMARY"`), so that you get only a partial list of the failures, avoiding surpassing the driver
capacity.
!!! note
When using a Result Format different from the default ("COMPLETE"), the flag "tag_source_data" will be
overwritten to `False`, as the results of the tagging wouldn't be complete which could lead to erroneous
conclusions from stakeholders (but you can always get the details about the result of the DQ execution in
the `result_sink_location` or `result_sink_db_table` that you have configured).
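As an illustration, a dq_specs entry switched to the SUMMARY result format for debugging (reusing the placeholder names from the example above) could look like this sketch:

```python
# Illustrative dq_specs entry for debugging with a partial (SUMMARY) result format.
# Remember: with a non-COMPLETE result format, tag_source_data is overridden to False.
debug_dq_spec = {
    "spec_id": "dq_validator",
    "input_id": "dummy_deliveries_source",
    "dq_type": "validator",
    "bucket": "my_data_product_bucket",
    "result_sink_db_table": "my_database.dq_result_sink",
    "result_sink_location": "my_dq_path/dq_result_sink/",
    "source": "deliveries_tag",
    "gx_result_format": "SUMMARY",
    "dq_functions": [
        {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
    ],
}
```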
================================================
FILE: lakehouse_engine_usage/data_quality/validations_failing/__init__.py
================================================
"""
.. include::validations_failing.md
"""
================================================
FILE: lakehouse_engine_usage/data_quality/validations_failing/validations_failing.md
================================================
# Validations Failing
The scenarios presented on this page are similar, but their goal is to show what happens when a DQ expectation fails the validations.
The logs generated by the execution of the code will contain information regarding which expectation(s) have failed and why.
## 1. Fail on Error
In this scenario, two parameters are specified:
- `"fail_on_error": False` - this parameter controls what happens if a DQ expectation fails. If it is set
to `true` (default), your job will fail/be aborted and an exception will be raised.
If it is set to `false`, a log message will be printed about the error (as shown in this
scenario) and the result status will also be available in the result sink (if configured) and in the
[data docs great expectations site](../data_quality.html#3-data-docs-website). In this scenario it is set to `false`
to avoid failing the execution of the notebook.
- the `max_value` of the function `expect_table_column_count_to_be_between` is defined with a specific value so that
this expectation fails the validations.
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True,
},
"location": "s3://my_data_product_bucket/dummy_deliveries/",
}
],
"dq_specs": [
{
"spec_id": "dq_validator",
"input_id": "dummy_deliveries_source",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"result_sink_db_table": "my_database.dq_result_sink",
"result_sink_location": "my_dq_path/dq_result_sink/",
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"source": "deliveries_fail",
"fail_on_error": False,
"dq_functions": [
{"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 20}},
{"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},
{"function": "expect_column_values_to_be_null", "args": {"column": "article"}},
{"function": "expect_column_values_to_be_unique", "args": {"column": "status"}},
{
"function": "expect_column_min_to_be_between",
"args": {"column": "delivery_item", "min_value": 1, "max_value": 15},
},
{
"function": "expect_column_max_to_be_between",
"args": {"column": "delivery_item", "min_value": 15, "max_value": 30},
},
],
}
],
"output_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"input_id": "dq_validator",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",
}
],
}
load_data(acon=acon)
```
If you run the command below, you will be able to see that the `success` column has the value `false`
for the last execution.
`display(spark.table(RENDER_UTILS.render_content("my_database.dq_result_sink")))`
## 2. Critical Functions
In this scenario, alternative parameters to `fail_on_error` are used:
- `critical_functions` - this parameter defaults to `None` if not defined.
It controls which DQ functions are considered a priority and, as such, it stops the validation
and throws an execution error whenever a function defined as critical doesn't pass the test.
If any other function that is not defined in this parameter fails, an error message is printed in the logs.
This parameter has priority over `fail_on_error`.
In this specific example, after defining `expect_table_column_count_to_be_between` as critical,
the execution is guaranteed to stop whenever the conditions for the function are not met.
Additionally, other parameters can be defined, such as:
- `max_percentage_failure` - this parameter defaults to `None` if not defined.
It controls what percentage of the total functions can fail without stopping the execution of the validation.
If the threshold is surpassed, the execution stops and a failure error is thrown.
This parameter has priority over `fail_on_error` and `critical_functions`.
You can also pair `critical_functions` with `max_percentage_failure`, for example by defining a 0.6 maximum
percentage of failure while also defining some critical functions.
In this case, even if the threshold is respected, the list defined in `critical_functions` is still checked
(a sketch combining both parameters is shown after the example below).
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "dummy_deliveries_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True,
},
"location": "s3://my_data_product_bucket/dummy_deliveries/",
}
],
"dq_specs": [
{
"spec_id": "dq_validator",
"input_id": "dummy_deliveries_source",
"dq_type": "validator",
"bucket": "my_data_product_bucket",
"result_sink_db_table": "my_database.dq_result_sink",
"result_sink_location": "my_dq_path/dq_result_sink/",
"source": "deliveries_critical",
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"dq_functions": [
{"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},
],
"critical_functions": [
{"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},
],
}
],
"output_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"input_id": "dq_validator",
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",
}
],
}
load_data(acon=acon)
```
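As a variation of the example above, the following sketch (values are illustrative) pairs `max_percentage_failure` with `critical_functions` in a single dq_specs entry, allowing up to 60% of the regular functions to fail while still failing immediately if the critical one does not pass:

```python
# Illustrative dq_specs entry combining a failure threshold with a critical function.
dq_spec_with_threshold = {
    "spec_id": "dq_validator",
    "input_id": "dummy_deliveries_source",
    "dq_type": "validator",
    "bucket": "my_data_product_bucket",
    "result_sink_db_table": "my_database.dq_result_sink",
    "result_sink_location": "my_dq_path/dq_result_sink/",
    "source": "deliveries_critical",
    "max_percentage_failure": 0.6,
    "dq_functions": [
        {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
        {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},
    ],
    "critical_functions": [
        {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 5}},
    ],
}
```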
================================================
FILE: lakehouse_engine_usage/gab/__init__.py
================================================
"""
.. include::gab.md
"""
================================================
FILE: lakehouse_engine_usage/gab/gab.md
================================================
# GAB - Gold Asset Builder
GAB stands for Gold Asset Builder and, technically, it is a SQL-first transformation workflow that allows teams to quickly and collaboratively deploy aggregate tables on top of base fact tables, which can then be used for empowering analytics over different perspectives on dashboards or exploratory queries.
GAB provides the following benefits:
- **Efficiency and speed**: It reduces the efforts and time to production for new aggregate tables (gold layer assets).
- **Simple operation**: It simplifies the cluster decision by having just 3 cluster types (small, medium, large), so there's no need to create a separate pipeline for each case. These cluster types are tied to the concept of workload priority in GAB (more on that later).
- **Low-code first:** Focus on low-code aggregation configuration with capabilities to also orchestrate complex SQL.
!!! warning
Before deciding whether your use case can be supported by GAB or not, read the instructions in the sections below carefully. If there is any doubt about certain metrics which might deviate from the realm of GAB, reach out to us before starting your development and we will support you. GAB may not be a one-size-fits-all solution, so use it only if it satisfies your requirements.
## Main Advantages over Self-Orchestrated SQL
- More flexibility to define any type of complex SQL queries.
- Only need to touch SQL, GAB takes care of all its orchestration.
- Quick production rollout, adaptability and maintainability, without the need to define any complex aggregation orchestration, rerun logic, monitoring, etc.
- The inner-sourcing model really works, as a data analyst can work on a SQL template and hand it over to the data engineering team, which can then adapt that SQL template and take it to production quickly after the data validation.
- As shown in the image below, it's possible to generate different perspectives (dimensions - D1, D2, D3...) of different metrics (M1, M2, M3) for a specific use case:
1. **Grouping Set (dimensions D1, D2)** - Compute the same metrics at a higher grain from the finest grain.
2. **Grouping Set (dimensions D1, D2, D3)** - Compute the same metrics at the finest grain.
3. **Grouping Set (dimensions D1)** - Compute the same metrics at a higher grain.
| D1 | D2 | D3 | M1 | M2 | M3 |
| :------ | :-----:| :-----:| :-----:| :-----:| :-----:|
| value 1 | value 2| NULL | 22 | 45 | 54 |
| value 1 | value 2| value 3| 89 | 12 | 47 |
| value 1 | NULL | NULL | 45 | 57 | 12 |
## When to use GAB?
- When an aggregate result, constructed using SQL, is to be created for different levels of detail (AKA different grains) supporting analytics on dashboards or exploratory queries with some specific dimensions and metrics.
- When metrics and dimensions are bound to configured *DAY, WEEK, MONTH, QUARTER, YEAR* cadences and you are not calculating the whole universe of data in your SQL query (e.g., you're looking back or forward on a specific time interval).
## When not to use GAB?
- When metrics and dimensions are not bound to *DAY, WEEK, MONTH, QUARTER, YEAR* cadences.
- When your result is not an aggregated result, i.e., the resulting table is at the transaction grain.
- If your start and end dates for the time interval include dates in the future.
- !!! warning
This is currently a limitation in the GAB engine codebase (`if new_end_date >= current_date: new_end_date = current_date`) that would require further testing to ensure it can be relaxed.
- If your metrics are not calculated incrementally, you should consider the tradeoff of using GAB vs just writing very simple "full load" SQL code that computes the whole universe of data every time.
- !!! note
However, if the computation is not very intensive, the orchestration/automation that comes with GAB out of the box can actually provide you value. Moreover, even if the metrics are not computed incrementally, you can collect all the automation benefits from GAB and use a time filter in your SQL statements in GAB. You can take that into consideration for your use case.
## GAB Concepts and Features
### Cadence
The time grain at which you want the data to be aggregated: DAILY, WEEKLY, MONTHLY, QUARTERLY, YEARLY. The internal dynamics of the CADENCE concept in GAB rely heavily on an automatically generated dimension calendar for GAB's internal usage.
```python
{'DAY':{},'WEEK':{},'MONTH':{},'YEAR':{}}
```
### Dimensions & Metrics
#### Dimensions
It's just a regular dimension according to the OLAP concept. It will be used to aggregate the metrics, example: `product_category`. Usually it is directly mapped from the source tables without any transformation.
#### Metrics
Aggregated value at the dimension level. As part of the dimensions, GAB has an automatically generated calendar dimension at different grains (more on that below).
There are some options to compute a metric:
- **Using SQL directly** to query and aggregate a source table column. Example: `sum(product_amount)`
- Compute it in the same cadence, but in **CADENCE - 1 time window**. Example: In a `MONTHLY` cadence it will compute for the previous month.
- Compute it in the same cadence, but using **last year's reference value**. Example: In a `QUARTERLY` cadence it will compute it in the same quarter but from the previous year.
- Compute it in the same cadence, but with a **custom window function**. Example: In a `QUARTER` cadence computing the last 2 quarters.
- Compute it **using any SQL function**, using any of the available columns, deriving a metric from another, etc. Example: compute a metric by multiplying it by 0.56 for the last 6 months of data.
!!! note
Each computation derives a [new column on the output view](step_by_step/step_by_step.md#use-case-configuration-using-the-query_builder_helper).
### Extended Window Calculator, Reconciliation & Snapshotting
#### Extended Window Calculator
This feature calculates the extended window of any cadence, even when the user provides custom dates that are not the exact start and end dates of the cadence.
For example, if the user wants to calculate the `MONTH` cadence but gives a date range of `2023-01-10` to `2023-01-29`, which is not exactly the start and/or end of the month, the computation window will be extended/adjusted to `2023-01-01`-`2023-01-31`, i.e., including the complete month. This ensures that GAB automatically handles any user error to efficiently integrate the complete data of the selected cadence.
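To make the idea concrete, here is a minimal sketch (not the GAB engine code) of how a MONTH window could be snapped to full month boundaries:

```python
import calendar
from datetime import date


def extend_month_window(start: date, end: date) -> tuple[date, date]:
    """Snap an arbitrary date range to the full months that contain it."""
    extended_start = start.replace(day=1)
    extended_end = end.replace(day=calendar.monthrange(end.year, end.month)[1])
    return extended_start, extended_end


# The example from the text: 2023-01-10..2023-01-29 becomes 2023-01-01..2023-01-31.
print(extend_month_window(date(2023, 1, 10), date(2023, 1, 29)))
```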
#### Reconciliation
The concept of the Extended Window Calculator is intertwined with the concept of Reconciliation. Together they enable the user to compute the data aggregated by the specified cadence, while 1) leveraging *"cadence to date"* calculations; or 2) reconciling the data to take late events into account.
##### "*Cadence to Date*" Calculations
For example, there can be a use case where the cadence is `WEEKLY`, but we want the aggregated data with a `DAILY` frequency; configuring the reconciliation window as `DAILY` will compute the data on a `WEEK TO DATE` basis. In a case where the first day of the week is Monday, Monday will have the data just for Monday; Tuesday will have the computation of Monday + Tuesday; Wednesday will have the results for Monday + Tuesday + Wednesday; and so on, until the end of the week. That example would be configured as follows:
```python
{'WEEK': {'recon_window': {'DAY'}}}
```
##### Reconcile the Data to Account for Late Events
Another example is a WEEK cadence with MONTH and QUARTER reconciliation enabled (`{'WEEK': {'recon_window': ['MONTH', 'QUARTER']}}`). What this means is that, at the start of a new month or quarter, all the weeks that still belong to that month or quarter are recalculated to consider late events. For example, `2023-01-01` is the start of a month, a quarter and a year. In this example, since month and quarter are given, and quarter is the higher grain among the two, all the weeks in Q4/22 (using the extended window explained above) are recalculated, i.e. instead of `2022-10-01` to `2022-12-31`, the extended window to consider in the current GAB execution is `2022-09-26` to `2023-01-01`. This is true because the first day of Q1/23 was the Sunday of the last week of Q4/22, and once we execute GAB on 01/01/2023, we are reconciling all the weeks of Q4/22, hence a weekly cadence with quarterly reconciliation.
You can find in the image below other illustrative examples of how the extended window and the reconciliation concept work together. In the first example, GAB will always extend the processing window and reconcile the results for all the weeks (yellow color) involved in that month (green color). In the second example, GAB will always extend the processing window and reconcile the results for all the months (yellow color) involved in the year (note that the green color is the quarter, not the year, but since year is a higher grain than quarter, GAB extends the window and reconciles the results for all the months involved in the year, not only the quarter).
### Snapshot
It creates a snapshot of the data on a specified cadence. For example: in a case where we have a `MONTHLY` cadence and snapshots enabled on a `DAILY` basis, we are going to compute the aggregates for each day in the month:
```python
{'MONTH': {'recon_window': {'DAY': {'snapshot': 'Y'}}}}
```
This is possible with the template column `{{ to_date }}`, which will tell us the end date of the snapshot.
In the version without snapshot, there will be one record for the *MONTH* cadence, but when we enable the above configuration the number of entries for the *MONTH* cadence will be the same as the number of days in the month.
This means there will be a separate entry for each day of the month, which enables comparing the data to the previous year on the same day from the start of the month.
!!! note
The snapshot feature will always write the snapshot entry for the given period (start date and end date), meaning if you have runs that overlap each other but for a different period (e.g., same start date but different end date) it will not rewrite past snapshot entries.
The above configuration is just an example, and the snapshot can be enabled on any combination of cadences:
```python
{'QUARTER': {'recon_window': {'WEEK': {'snapshot': 'Y'}}}}
{'YEAR': {'recon_window': {'MONTH': {'snapshot': 'Y'}}}}
{'MONTH': {'recon_window': {'WEEK': {'snapshot': 'Y'}}}}
```
## Next Steps
If you are interested in using GAB you can check our [step-by-step documentation](step_by_step/step_by_step.md) that aims to help with the use case configuration and make it easier to use GAB.
## FAQ
### Can we ensure past snapshots are not changed?
When we use the snapshots feature, taking monthly cadence with daily reconciliation as example, the number of entries for the *MONTH* cadence will be the same as the number of days in the month, because every day, GAB will generate a snapshot of that month, providing a cumulative picture of the month throughout the several days. In this way, snapshots are immutable.
There may be cases where the date you want to use to control the snapshots is different from the cadence date in GAB; in this case you will have to inject custom snapshot-gathering logic in your GAB SQL templates and potentially play around with GAB's filter date to achieve what you want, because as of now, GAB relies on the cadence date to control the snapshot logic.
### How exactly does `lookback_window` work?
Sometimes, `lookback_days` in the [GAB execution notebook](../../assets/gab/notebooks/gab.py) and `lookback_window` get confused. `lookback_window` is only used when you define derived metrics that use window functions (check the [step-by-step documentation](step_by_step/step_by_step.md)), and it is used to configure the window. On the other hand, `lookback_days` is only part of the [GAB execution notebook](../../assets/gab/notebooks/gab.py) and modifies the provided `start_date` so that it considers `lookback_days` before that.
### Can I use GAB with cadence dates in the future?
As mentioned in the ["When not to use GAB?"](#when-not-to-use-gab) section, this is currently not supported.
### What is the purpose of the `rerun` flag?
If you run GAB for the same start date and end date as a previous run, without the *rerun* flag, GAB will ignore the execution based on the `gab_events_log` table. The *rerun* flag ensures we can force such a re-execution.
### Does my data product need to use a star schema (fact table and dimension tables) to use GAB?
No, GAB can be used regardless of the underlying data model, as you should prepare your data with templated SQL (that can be as simple or as complex as your use case) before feeding it to the GAB execution engine.
================================================
FILE: lakehouse_engine_usage/gab/step_by_step/__init__.py
================================================
"""
.. include::step_by_step.md
"""
================================================
FILE: lakehouse_engine_usage/gab/step_by_step/step_by_step.md
================================================
# GAB Step-by-Step
!!! note
Requirements: Lakehouse engine: 1.20.0+
## 1. Setup Data Product based on Templated Files
- Copy GAB assets from the templated files to your data product:
- GAB Tables:
- [Calendar table - dim_calendar](../../../assets/gab/metadata/tables/dim_calendar.sql)
- [Use case configuration table - lkp_query_builder](../../../assets/gab/metadata/tables/lkp_query_builder.sql)
- [Unified data table - gab_use_case_results](../../../assets/gab/metadata/tables/gab_use_case_results.sql)
- [GAB log events table - gab_log_events](../../../assets/gab/metadata/tables/gab_log_events.sql)
- GAB Notebooks:
- [Feed Calendar table - gab_dim_calendar](../../../assets/gab/notebooks/gab_dim_calendar.py)
- [Use case creation - query_builder_helper](../../../assets/gab/notebooks/query_builder_helper.py)
- [GAB execution - gab](../../../assets/gab/notebooks/gab.py)
- [GAB job manager - gab_job_manager](../../../assets/gab/notebooks/gab_job_manager.py)
## 2. Set up the Use Case
### 2.1. Create the SQL Template Files
Start by writing the SQL code for your use case. Here's an example where you will find several available placeholders (more on that below):
```sql
SELECT
{% if replace_offset_value == 0 %} {{ project_date_column }} {% else %} ({{ project_date_column }} + interval '{{offset_value}}' hour) {% endif %} AS order_date, # date aggregation: computed cadence start date
{{ to_date }} AS to_date, # date aggregation: last day of the cadence or of the snapshot if enabled
b.category_name,
COUNT(a.article_id) qty_articles,
SUM(amount) total_amount
FROM
{{ database }}.dummy_sales_kpi a # source database
{{ joins }} # calendar table join: used to compute the cadence start and end date
LEFT JOIN
article_categories b ON a.article_id = b.article_id
WHERE
{{ partition_filter }} # filter: partition filter
AND
TO_DATE({{ filter_date_column }}, 'yyyyMMdd') >= (
'{{ start_date }}' + interval '{{ offset_value }}' hour
) # filter by date column configured in the use case for this file and timezone shift
AND
TO_DATE({{ filter_date_column }}, 'yyyyMMdd') < (
'{{ end_date }}' + interval '{{ offset_value }}' hour
) # filter by date column configured in the use case for this file and timezone shift
GROUP BY 1,2,3
```
#### Available SQL Template Placeholders
You can use placeholders in your SQL queries to have them replaced at runtime by the GAB engine. There are several available placeholders that will be listed in this section.
!!! warning
The placeholder value will always be [injected as per the configurations of the use cases](#use-case-configuration-using-the-query_builder_helper) in the [lkp_query_builder table](../../../assets/gab/metadata/tables/lkp_query_builder.sql).
##### Reference Dates
- *Start and End Dates*:
- `{{ start_date }}` and `{{ end_date }}` are the dates that control the time window of the current GAB execution. These can be used to execute GAB on a certain schedule and have it incrementally compute the aggregated metrics. These dates are fundamental to control GAB executions and will be provided as arguments in the GAB notebook.
- !!! warning
Currently only past and present dates are supported. Future dates are not supported.
- *Project Date*:
- `{{ project_date_column }}` is the reference date used to compute the cadences and the extended window (together with `{{ start_date }}` and `{{ end_date }}`).
```python
{% if replace_offset_value == 0 %} {{ project_date_column }}
{% else %} ({{ project_date_column }} + interval '{{offset_value}}' hour)
{% endif %}
```
- !!! note
The `replace_offset_value` is a flag that instructs GAB to either directly use the `{{ project_date_column }}` or shift it to the specified timezone according to the provided `offset_value` from the configured use case.
- *To Date*:
- `{{ to_date }}` is the last date of the cadence, if snapshots are disabled, or, if snapshots are enabled, then this date is the snapshot end date.
##### Filter Placeholders
- `{{ partition_filter }}` - the expression to filter the data according to a date partitioning scheme (year/month/day); it replaces the placeholder with a filter like `year = **** and month = ** and day = **`:
- !!! warning
If your table does not have the Year, Month, Day columns, you should not add this placeholder.
- `{{ filter_date_column }}` and `{{offset_value}}` can be used to filter the data to be processed in your use case to the specified time range:
```python
{{ filter_date_column }} >= ('{{ start_date }}' + interval '{{offset_value}}' hour) AND {{ filter_date_column }} < ('{{ end_date}}' + interval '{{offset_value}}' hour)
```
##### Source Database
Where the data comes from: `{{ database }}`.
##### Dim Calendar join
Represented by the `{{ joins }}` placeholder.
!!! warning
It is mandatory! It can be added after any of the table names in the `from` statement. The framework renders these `joins` with an internal calendar join and populates the `to_date` and the `project_date_column` as per the configured cadences.
#### Combining Multiple SQL Template Files for a Use Case
For each use case, you can have just one SQL file or have multiple SQL files that depend on each other and need to be executed in a specific order.
##### If there's just one SQL file for the use case
The file should start with 1_. Example: 1_xxxx.sql.
##### When the use case has several SQL files
The different files represent different intermediate stages/temp tables in the GAB execution of the use case. Create the SQL files according to the sequence order (as shown in the image below), plus a final combined script. Example:
!!! note
We suggest using the folder **metadata/gab** as the SQL use case folder, but this is a parametrized property that you can override with the property [gab_base_path in the GAB notebook](../../../assets/gab/notebooks/gab.py). This property is used in the [GAB Job Manager](../../../assets/gab/notebooks/gab_job_manager.py) as well.
### 2.2. Configure the Use Case using the Query Builder Helper Notebook
GAB will pull information from **`lkp_query_builder`** in order to retrieve the configuration to execute the process. To help you with this task you can use the [query_builder_helper notebook](../../../assets/gab/notebooks/query_builder_helper.py). In this section, we will go step-by-step through the notebook instructions to configure a use case.
#### 2.2.1. General Configuration
| Variable | Default Value | Description |
|-----------------------------|----------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Complexity** | Low | Defines the complexity of your use case. You should mainly consider the volume of the data or the complexity of the SQL potentially generating a high load. Possible values: **Low**, **Medium** and **High**. These values are used in GAB's orchestration, i.e., the [GAB job manager - gab_job_manager](../../../assets/gab/notebooks/gab_job_manager.py) uses them to define the job cluster size/type based on the complexity of the query. |
| **Database Name** | example_database | Refers to the name of the development environment database where the **lkp_query_builder** table resides. This parameter is used at the end of the notebook to insert data into the **lkp_query_builder** table. |
| **How many dimensions** | 1 | Number of dimension columns expected in the use case. **Note: Do not consider the `project_date_column` or metrics**, as they have their own parameters. |
| **How many views** | 1 | Defines how many output views to generate for the use case. It's possible to have as many as the use case needs. All views will have the same structure (dimensions and metrics), the only difference possible to specify between the views is the `view filter`. **Default value is 1.** **Note**: This configuration has a direct impact in the `3. Configure View Name and Filters` configuration. |
| **Is Active** | Y | Flag to make the use case active or not. **Default value is Y**. |
| **Market** | GLOBAL | Used in the **gab_job_manager** to execute the use cases for each **market**. If your business does not have the concept of Market, you can leave the `GLOBAL` default. |
| **SQL File Names** | 1_article_category.sql, 2_f_agg_dummy_sales_kpi.sql | Name of the SQL files used in the use case, according to what you have configured in ***step 2.1***. You can combine different layers of dependencies between them as shown in the example above, where the **2_combined.sql** file depends on **1_product_category.sql** file. The file name should follow the pattern x_file_name (where x is an integer digit) and should be separated by a comma (e.g.: 1_first_query.sql, 2_second_query.sql). |
| **Snapshot End Date** | to_date | This parameter is used in the template; by default its value must be ***to_date***. You can change it if you have managed this in your SQL files. The values stored in this column depend on the use case behavior: if snapshots are enabled, it will contain the snapshot end date; if no snapshot is enabled, it will contain the last date of the cadence. The snapshot behavior is set in the reconciliation steps (more on that later). |
| **Timezone Offset** | 0 | The timezone offset that you want to apply to the date columns (`project_date_column` or `filter_date_column`). It should be a number to decrement or add to the date (e.g., -8 or 8). **The default value is 0**, which means that, by default, no timezone transformation will be applied to the date. |
| **Use Case Name** | f_agg_dummy_sales_kpi | Name of the use case. The suggestion is to use lowercase and underlined alphanumeric characters. |
| **Use Case Reference Date** | order_date | Reference date of the use case, i.e., `project_date_column`. The parameter should be the column name and the selected column should have the date/datetime format. |
| **Week Start** | MONDAY | The start of the business week of the use case. Possible values: **SUNDAY** or **MONDAY**. |
#### 2.2.2. Configure Dimension Names
#### 2.2.3. Configure View Name and Filters
This will be the name of the output view at the end of the process. Filters can be applied at this step, if needed.
| Variable | Default Value | Description |
|-----------------|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **View Filter** | | A SQL *WHERE* clause expression based on the dimensions defined in the previous step. **Example**: if you have set the country as `D1`, the filter here could be **D1 = "Germany"**. The syntax allowed here is the same as the syntax of the *WHERE* clause in SQL. |
| **View Name** | vw_f_agg_dummy_sales_kpi | Name of the view to query the resulting aggregated data. This will contain the results produced by GAB for the configured use case. |
#### 2.2.4. Configure the Cadence, Reconciliation and Snapshot
This step is where we define the cadence that will be displayed in the view.
| Variable | Default Value | Description |
|----------------------------|----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Reconciliation Cadence** | YEAR | Compute the data aggregated by the specified cadence, optionally defined with reconciliation and snapshotting. [Check more about it here](../gab.md#reconciliation). |
#### 2.2.5. Configure METRICS
The first question to ask regarding metrics is how many metrics you have in your SQL use case query. In our template we have two metrics (`qty_articles` and `total_amount`).
Next, we will define if we want GAB to create secondary calculations for us based on the metric name.
!!! warning
Metrics should follow the same order as defined on the SQL use case query.
| Variable | Description |
|---------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [**Calculated Metric**](../gab.md#Metrics) | It's possible to derive (add secondary calculations) 4 new columns based on each metric. Those new columns will be based on cadences like ***last_cadence***, ***last_year_cadence*** and ***window function***. Moreover, you can create a derived column, which is a custom SQL statement that you can write by selecting the ***derived_metric*** option. |
| **Metric Name** | Name of the base metric. Should have the same name as on the SQL use case query in the SQL template files defined previously. |
After that, you configure the secondary calculations.
| Variable | Description |
|-------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|
| **derived_metric.Formula** | Formula to calculate the metric referring any of previous configured metrics by the **Metric Name**. **Example**: `total_amount*0.56` |
| **derived_metric.Label** | Name of the generated metric by ***derived_metric***. |
| **last_cadence.Label** | Name of the generated metric by ***last_cadence***. |
| **last_cadence.Window** | Cadence lookback window, which in this example means a lookback to the previous year (as the use case is on a **YEARLY** cadence) |
| **window_function.Agg Func** | SQL Function to calculate the metric. Possible values: ***sum***, ***avg***, ***max***, ***min***, ***count*** |
| **window_function.Label** | Name of the generated metric by ***window_function***. |
| **window_function.Window Interval** | Window interval to use in the metric generation. |
#### 2.2.6. Configure Stages
Stages are related to each SQL file in the use case.
| Variable | Description |
|--------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Filter Date Column** | It will be used to filter the data of your use case. This information will be replaced in the placeholder of the GAB template `{{ filter_date_column }}`. |
| **Project Date Column** | It will be used as reference date for the given query. This information will be replaced in the placeholder of the GAB template `{{ project_date_column }}`. |
| **Repartition Type** | Type of repartitioning of the data of the query. Possible values: ***Key*** and ***Number***. When you use Key, it expects column names separated by a comma. When you use Number, it expects an integer defining how many partitions you want. |
| **Repartition Value** | This parameter only takes effect when used with the **Repartition Type** parameter. It sets the value for the repartition type selected above. |
| **Storage Level** | Defines the Spark persistence storage level you want (e.g. ***Memory Only***, ***Memory and Disk*** etc). |
| **Table Alias** | The alias of the SQL file that will be executed. This name can be used to consume the output of a SQL stage (corresponding to a SQL file) in the next stage (the next SQL file).
#### 2.2.7. Build and Execute the SQL Commands to populate the lkp_query_builder Table
After configuring the use case, the notebook generates a SQL command to register it in the `lkp_query_builder` table:
```sql
DELETE FROM example_database.lkp_query_builder WHERE QUERY_LABEL = 'f_agg_dummy_sales_kpi';
INSERT INTO example_database.lkp_query_builder VALUES (
1,
'f_agg_dummy_sales_kpi',
'GLOBAL',
"""{
'vw_f_agg_dummy_sales_kpi': {
'dimensions': {
'from_date': 'order_date',
'to_date': 'to_date',
'd1': 'category_name'
},
'metric': {
'm1': {
'metric_name': 'qty_articles',
'calculated_metric': {},
'derived_metric': {}
},
'm2': {
'metric_name': 'total_amount',
'calculated_metric': {
'last_cadence': [
{
'label': 'total_amount_last_year',
'window': '1'
}
],
'window_function': [
{
'label': 'avg_total_amount_last_2_years',
'window': [2, 1],
'agg_func': 'avg'
}
]
},
'derived_metric': [
{
'label': 'discounted_total_amount',
'formula': 'total_amount*0.56'
}
]
}
},
'filter': {}
}
}""",
"""{
'1': {
'file_path': 'f_agg_dummy_sales_kpi/1_article_category.sql',
'table_alias': 'article_categories',
'storage_level': 'MEMORY_ONLY',
'project_date_column': '',
'filter_date_column': '',
'repartition': {}
},
'2': {
'file_path': 'f_agg_dummy_sales_kpi/2_f_agg_dummy_sales_kpi.sql',
'table_alias': 'dummy_sales_kpi',
'storage_level': 'MEMORY_ONLY',
'project_date_column': 'order_date',
'filter_date_column': 'order_date',
'repartition': {}
}
}""",
"""{'YEAR': {}}""",
'0',
'MONDAY',
'Y',
'Low',
current_timestamp()
)
```
## 3. Use case execution
After the initial setup and adding your use case to the ***lkp_query_builder*** table, you can schedule the [gab_job_manager](../../../assets/gab/notebooks/gab_job_manager.py) to manage the use case execution on any schedule you want.
You can repeat these steps for each use case you have.
## 4. Consuming the data
The data is available in the view you specified as output from the use case in ***step 2***, so you can normally consume the view as you would consume any other data asset (e.g., Report, Dashboard, ML model, Data Pipeline).
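For instance, assuming the view lands in the same database used throughout this example (`example_database`) and keeps the configured view name, a quick check from a notebook could be:

```python
# Query the consumption view produced by GAB for the configured use case.
display(spark.table("example_database.vw_f_agg_dummy_sales_kpi"))
```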
================================================
FILE: lakehouse_engine_usage/lakehouse_engine_usage.md
================================================
# How to use the Lakehouse Engine?
Lakehouse engine usage examples for all the algorithms and other core functionalities.
- [Data Loader](data_loader/data_loader.md)
- [Data Quality](data_quality/data_quality.md)
- [Reconciliator](reconciliator/reconciliator.md)
- [Sensors](sensors/sensors.md)
- [GAB](gab/gab.md)
================================================
FILE: lakehouse_engine_usage/managerhelper/managerhelper.md
================================================
# Table and File Manager Operations Generator
Generate JSON configurations for TableManager and FileManager operations with an interactive form.
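For context, the generated JSON is meant to be fed to the engine's table/file manager entry points. Below is a minimal sketch, assuming the engine exposes a `manage_table` function and that the acon uses the field names shown by this form (both assumptions, verify against your engine version):

```python
from lakehouse_engine.engine import manage_table  # assumed entry point

# Illustrative acon for a vacuum operation, using the same fields exposed by the form.
acon = {
    "function": "vacuum",
    "table_or_view": "my_database.my_table",
    "vacuum_hours": 168,
}
manage_table(acon=acon)
```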
================================================
FILE: lakehouse_engine_usage/managerhelper/operations-script.js
================================================
// ============================================================================
// LAKEHOUSE ENGINE OPERATIONS GENERATOR - MAIN JAVASCRIPT
// ============================================================================
// This script manages the interactive UI for generating JSON configurations
// for Lakehouse Engine table and file manager operations.
// ============================================================================
// ============================================================================
// DOM ELEMENT REFERENCES
// ============================================================================
// Cache frequently accessed DOM elements for better performance
/** Tab navigation buttons for switching between table and file managers */
const tabButtons = document.querySelectorAll('.tab-button');
/** Tab content containers for table and file manager sections */
const tabContents = document.querySelectorAll('.tab-content');
/** Dropdown select for choosing table manager operations */
const tableOperationSelect = document.getElementById('table-operation-select');
/** Dropdown select for choosing file manager operations */
const fileOperationSelect = document.getElementById('file-operation-select');
/** Container for dynamically generated table operation parameter fields */
const tableDynamicFields = document.getElementById('table-dynamic-fields');
/** Container for dynamically generated file operation parameter fields */
const fileDynamicFields = document.getElementById('file-dynamic-fields');
/** Button to add the currently configured operation to the list */
const addOperationBtn = document.getElementById('add-operation');
/** Button to clear all operations from the list */
const clearOperationsBtn = document.getElementById('clear-operations');
/** Container displaying the list of added operations */
const operationsList = document.getElementById('operations-list');
/** Button to generate JSON configuration from operations list */
const generateBtn = document.getElementById('generate-json');
/** Button to copy generated JSON to clipboard */
const copyBtn = document.getElementById('copy-json');
/** Button to download generated JSON as a file */
const downloadBtn = document.getElementById('download-json');
/** Button to format the displayed JSON */
const formatBtn = document.getElementById('format-json');
/** Button to validate the generated JSON configuration */
const validateBtn = document.getElementById('validate-json');
/** Pre-formatted text area displaying the generated JSON output */
const jsonOutput = document.getElementById('json-output');
/** Element displaying validation results and messages */
const validationResult = document.getElementById('validation-result');
/** Loading spinner overlay element */
const loading = document.getElementById('loading');
/** Toast notification element for user feedback */
const toast = document.getElementById('toast');
// ============================================================================
// APPLICATION STATE
// ============================================================================
// Global state variables that track the application's current status
/** Current active tab ('table-manager' or 'file-manager') */
let currentTab = 'table-manager';
/** Array of operation objects added by the user */
let operations = [];
/** Generated JSON configuration object */
let generatedConfig = null;
// ============================================================================
// OPERATION DEFINITIONS - TABLE MANAGER
// ============================================================================
// Defines all available table manager operations with their parameters,
// validation rules, and UI presentation details
/**
* Table Manager Operations Configuration
* Each operation includes:
* - name: Display name for the UI
* - icon: FontAwesome icon class
* - fields: Array of field definitions with type, validation, and help text
*/
const TABLE_OPERATIONS = {
'compute_table_statistics': {
name: 'Compute Table Statistics',
icon: 'fas fa-chart-bar',
fields: [
{ name: 'table_or_view', label: 'Table or View Name', type: 'text', required: true, help: 'Name of the table or view to compute statistics for' }
]
},
'create_table': {
name: 'Create Table',
icon: 'fas fa-plus-square',
fields: [
{ name: 'path', label: 'SQL File Path', type: 'text', required: true, help: 'Path to the SQL file containing the CREATE TABLE statement' },
{ name: 'disable_dbfs_retry', label: 'Disable DBFS Retry', type: 'select', options: ['True', 'False'], default: 'False', help: 'Whether to disable DBFS retry mechanism' },
{ name: 'delimiter', label: 'SQL Delimiter', type: 'text', default: ';', help: 'Delimiter to separate SQL commands' },
{ name: 'advanced_parser', label: 'Advanced Parser', type: 'select', options: ['True', 'False'], default: 'False', help: 'Use advanced SQL parser' }
]
},
'create_tables': {
name: 'Create Multiple Tables',
icon: 'fas fa-layer-group',
fields: [
{ name: 'path', label: 'SQL File Paths', type: 'textarea', required: true, help: 'Comma-separated paths to SQL files containing CREATE TABLE statements' },
{ name: 'disable_dbfs_retry', label: 'Disable DBFS Retry', type: 'select', options: ['True', 'False'], default: 'False', help: 'Whether to disable DBFS retry mechanism' },
{ name: 'delimiter', label: 'SQL Delimiter', type: 'text', default: ';', help: 'Delimiter to separate SQL commands' },
{ name: 'advanced_parser', label: 'Advanced Parser', type: 'select', options: ['True', 'False'], default: 'False', help: 'Use advanced SQL parser' }
]
},
'create_view': {
name: 'Create View',
icon: 'fas fa-eye',
fields: [
{ name: 'path', label: 'SQL File Path', type: 'text', required: true, help: 'Path to the SQL file containing the CREATE VIEW statement' },
{ name: 'disable_dbfs_retry', label: 'Disable DBFS Retry', type: 'select', options: ['True', 'False'], default: 'False', help: 'Whether to disable DBFS retry mechanism' },
{ name: 'delimiter', label: 'SQL Delimiter', type: 'text', default: ';', help: 'Delimiter to separate SQL commands' },
{ name: 'advanced_parser', label: 'Advanced Parser', type: 'select', options: ['True', 'False'], default: 'False', help: 'Use advanced SQL parser' }
]
},
'drop_table': {
name: 'Drop Table',
icon: 'fas fa-trash-alt',
fields: [
{ name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to drop' }
]
},
'drop_view': {
name: 'Drop View',
icon: 'fas fa-eye-slash',
fields: [
{ name: 'table_or_view', label: 'View Name', type: 'text', required: true, help: 'Name of the view to drop' }
]
},
'execute_sql': {
name: 'Execute SQL',
icon: 'fas fa-code',
fields: [
{ name: 'sql', label: 'SQL Commands', type: 'textarea', required: true, help: 'SQL commands to execute (separated by delimiter)' },
{ name: 'delimiter', label: 'SQL Delimiter', type: 'text', default: ';', help: 'Delimiter to separate SQL commands' },
{ name: 'advanced_parser', label: 'Advanced Parser', type: 'select', options: ['True', 'False'], default: 'False', help: 'Use advanced SQL parser' }
]
},
'truncate': {
name: 'Truncate Table',
icon: 'fas fa-cut',
fields: [
{ name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to truncate' }
]
},
'vacuum': {
name: 'Vacuum Table',
icon: 'fas fa-broom',
fields: [
{ name: 'table_or_view', label: 'Table Name', type: 'text', help: 'Name of the table to vacuum (leave empty to use path)' },
{ name: 'path', label: 'Table Path', type: 'text', help: 'Path to the Delta table location (use if table_or_view is empty)' },
{ name: 'vacuum_hours', label: 'Retention Hours', type: 'number', default: '168', help: 'Number of hours to retain old versions (default: 168 hours = 7 days)' }
]
},
'describe': {
name: 'Describe Table',
icon: 'fas fa-info-circle',
fields: [
{ name: 'table_or_view', label: 'Table or View Name', type: 'text', required: true, help: 'Name of the table or view to describe' }
]
},
'optimize': {
name: 'Optimize Table',
icon: 'fas fa-tachometer-alt',
fields: [
{ name: 'table_or_view', label: 'Table Name', type: 'text', help: 'Name of the table to optimize (leave empty to use path)' },
{ name: 'path', label: 'Table Path', type: 'text', help: 'Path to the Delta table location (use if table_or_view is empty)' },
{ name: 'where_clause', label: 'Where Clause', type: 'text', help: 'Optional WHERE clause to limit optimization scope' },
{ name: 'optimize_zorder_col_list', label: 'Z-Order Columns', type: 'text', help: 'Comma-separated list of columns for Z-ORDER optimization' }
]
},
'show_tbl_properties': {
name: 'Show Table Properties',
icon: 'fas fa-cogs',
fields: [
{ name: 'table_or_view', label: 'Table or View Name', type: 'text', required: true, help: 'Name of the table or view to show properties for' }
]
},
'get_tbl_pk': {
name: 'Get Table Primary Key',
icon: 'fas fa-key',
fields: [
{ name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to get primary key from' }
]
},
'repair_table': {
name: 'Repair Table',
icon: 'fas fa-wrench',
fields: [
{ name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to repair' },
{ name: 'sync_metadata', label: 'Sync Metadata', type: 'select', options: ['True', 'False'], default: 'False', help: 'Whether to sync metadata during repair' }
]
},
'delete_where': {
name: 'Delete Where',
icon: 'fas fa-eraser',
fields: [
{ name: 'table_or_view', label: 'Table Name', type: 'text', required: true, help: 'Name of the table to delete from' },
{ name: 'where_clause', label: 'Where Clause', type: 'text', required: true, help: 'WHERE condition for deletion (without WHERE keyword)' }
]
}
};
// ============================================================================
// OPERATION DEFINITIONS - FILE MANAGER
// ============================================================================
// Defines all available file manager operations for S3 and DBFS file systems
/**
* File Manager Operations Configuration
* Supports operations for:
* - S3: delete, copy, move, restore from Glacier
* - DBFS: delete, copy, move
*/
const FILE_OPERATIONS = {
'delete_objects': {
name: 'Delete Objects',
icon: 'fas fa-trash',
fields: [
{ name: 'bucket', label: 'Bucket Name', type: 'text', help: 'S3 bucket name (leave empty for DBFS paths)' },
{ name: 'object_paths', label: 'Object Paths', type: 'textarea', required: true, help: 'Comma-separated list of object paths to delete' },
{ name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be deleted without actually deleting' }
]
},
'copy_objects': {
name: 'Copy Objects',
icon: 'fas fa-copy',
fields: [
{ name: 'bucket', label: 'Source Bucket', type: 'text', help: 'Source S3 bucket name (leave empty for DBFS paths)' },
{ name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the source object or directory' },
{ name: 'destination_bucket', label: 'Destination Bucket', type: 'text', help: 'Destination S3 bucket name (leave empty for DBFS paths)' },
{ name: 'destination_object', label: 'Destination Object Path', type: 'text', required: true, help: 'Path of the destination object or directory' },
{ name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be copied without actually copying' }
]
},
'move_objects': {
name: 'Move Objects',
icon: 'fas fa-arrows-alt',
fields: [
{ name: 'bucket', label: 'Source Bucket', type: 'text', help: 'Source S3 bucket name (leave empty for DBFS paths)' },
{ name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the source object or directory' },
{ name: 'destination_bucket', label: 'Destination Bucket', type: 'text', help: 'Destination S3 bucket name (leave empty for DBFS paths)' },
{ name: 'destination_object', label: 'Destination Object Path', type: 'text', required: true, help: 'Path of the destination object or directory' },
{ name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be moved without actually moving' }
]
},
'request_restore': {
name: 'Request Restore (S3)',
icon: 'fas fa-undo',
fields: [
{ name: 'bucket', label: 'S3 Bucket', type: 'text', required: true, help: 'S3 bucket containing archived objects' },
{ name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the archived object to restore' },
{ name: 'restore_expiration', label: 'Restore Expiration (days)', type: 'number', required: true, default: '7', help: 'Number of days to keep restored objects available' },
{ name: 'retrieval_tier', label: 'Retrieval Tier', type: 'select', options: ['Expedited', 'Standard', 'Bulk'], default: 'Standard', help: 'Speed and cost tier for restoration' },
{ name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be restored without actually restoring' }
]
},
'check_restore_status': {
name: 'Check Restore Status (S3)',
icon: 'fas fa-search',
fields: [
{ name: 'bucket', label: 'S3 Bucket', type: 'text', required: true, help: 'S3 bucket containing archived objects' },
{ name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the object to check restore status' }
]
},
'request_restore_to_destination_and_wait': {
name: 'Request Restore and Copy (S3)',
icon: 'fas fa-sync-alt',
fields: [
{ name: 'bucket', label: 'Source S3 Bucket', type: 'text', required: true, help: 'S3 bucket containing archived objects' },
{ name: 'source_object', label: 'Source Object Path', type: 'text', required: true, help: 'Path of the archived object to restore' },
{ name: 'destination_bucket', label: 'Destination S3 Bucket', type: 'text', required: true, help: 'Destination S3 bucket for restored objects' },
{ name: 'destination_object', label: 'Destination Object Path', type: 'text', required: true, help: 'Path of the destination for restored objects' },
{ name: 'restore_expiration', label: 'Restore Expiration (days)', type: 'number', required: true, default: '7', help: 'Number of days to keep restored objects available' },
{ name: 'retrieval_tier', label: 'Retrieval Tier', type: 'select', options: ['Expedited'], default: 'Expedited', help: 'Only Expedited tier supported for this operation' },
{ name: 'dry_run', label: 'Dry Run', type: 'select', options: ['True', 'False'], default: 'False', help: 'Preview what would be restored without actually restoring' }
]
}
};
// ============================================================================
// INITIALIZATION
// ============================================================================
// Set up the application when the DOM is fully loaded
/**
* Initialize the application on page load
* Sets up tabs, event listeners, and loads any saved state
*/
document.addEventListener('DOMContentLoaded', function() {
initializeTabs();
initializeEventListeners();
loadFromLocalStorage();
});
// ============================================================================
// TAB MANAGEMENT
// ============================================================================
/**
* Initialize tab navigation functionality
* Sets up click handlers for switching between table and file manager tabs
*/
function initializeTabs() {
tabButtons.forEach(button => {
button.addEventListener('click', () => {
const tabId = button.getAttribute('data-tab');
switchTab(tabId);
});
});
}
/**
* Switch to a different tab
* @param {string} tabId - The ID of the tab to activate ('table-manager' or 'file-manager')
*/
function switchTab(tabId) {
// Update button active states
tabButtons.forEach(btn => btn.classList.remove('active'));
document.querySelector(`[data-tab="${tabId}"]`).classList.add('active');
// Update content visibility
tabContents.forEach(content => content.classList.remove('active'));
document.getElementById(tabId).classList.add('active');
// Update application state
currentTab = tabId;
updateAddButtonState();
}
// ============================================================================
// EVENT LISTENERS SETUP
// ============================================================================
/**
* Initialize all event listeners for interactive elements
* Connects UI actions to their handler functions
*/
function initializeEventListeners() {
// Operation selection change handlers
tableOperationSelect.addEventListener('change', handleTableOperationChange);
fileOperationSelect.addEventListener('change', handleFileOperationChange);
// Button click handlers
addOperationBtn.addEventListener('click', addCurrentOperation);
clearOperationsBtn.addEventListener('click', clearAllOperations);
generateBtn.addEventListener('click', generateJSON);
copyBtn.addEventListener('click', copyToClipboard);
downloadBtn.addEventListener('click', downloadJSON);
formatBtn.addEventListener('click', formatJSON);
validateBtn.addEventListener('click', validateJSON);
}
// ============================================================================
// DYNAMIC FIELD GENERATION
// ============================================================================
/**
* Handle table operation selection change
* Renders the appropriate parameter fields for the selected table operation
*/
function handleTableOperationChange() {
const operation = tableOperationSelect.value;
if (operation && TABLE_OPERATIONS[operation]) {
renderDynamicFields(tableDynamicFields, TABLE_OPERATIONS[operation], 'table');
updateAddButtonState();
} else {
showNoOperationSelected(tableDynamicFields);
updateAddButtonState();
}
}
/**
* Handle file operation selection change
* Renders the appropriate parameter fields for the selected file operation
*/
function handleFileOperationChange() {
const operation = fileOperationSelect.value;
if (operation && FILE_OPERATIONS[operation]) {
renderDynamicFields(fileDynamicFields, FILE_OPERATIONS[operation], 'file');
updateAddButtonState();
} else {
showNoOperationSelected(fileDynamicFields);
updateAddButtonState();
}
}
/**
* Display a message when no operation is selected
* @param {HTMLElement} container - The container to display the message in
*/
function showNoOperationSelected(container) {
container.innerHTML = `
<div class="no-operation-selected">
<p>Select an operation above to see its configuration options</p>
</div>
`;
}
/**
* Render dynamic parameter fields for the selected operation
* @param {HTMLElement} container - The container to render fields into
* @param {Object} operationDef - The operation definition with field specifications
* @param {string} type - The operation type ('table' or 'file')
*/
function renderDynamicFields(container, operationDef, type) {
// Build one field group containing an input for every field of the selected operation
const html = `
<div class="field-group">
<h4><i class="${operationDef.icon}"></i> ${operationDef.name}</h4>
<div class="field-row">
${operationDef.fields.map(field => renderField(field, type)).join('')}
</div>
</div>
`;
container.innerHTML = html;
// Attach validation event listeners to all input fields
container.querySelectorAll('input, select, textarea').forEach(input => {
input.addEventListener('blur', () => validateField(input));
input.addEventListener('input', () => clearFieldValidation(input));
});
}
/**
* Render a single input field based on its definition
* @param {Object} field - Field definition with name, type, label, etc.
* @param {string} type - The operation type for generating unique field IDs
* @returns {string} HTML string for the field
*/
function renderField(field, type) {
const fieldId = `${type}-${field.name}`;
const required = field.required ? 'required' : '';
const requiredMarker = field.required ? '<span class="field-required">*</span>' : '';
const defaultValue = field.default || '';
let inputHtml = '';
// Generate appropriate input HTML based on field type
switch (field.type) {
case 'text':
case 'number':
inputHtml = `<input type="${field.type}" id="${fieldId}" name="${field.name}" class="form-control" value="${defaultValue}" ${required}>`;
break;
case 'textarea':
inputHtml = `<textarea id="${fieldId}" name="${field.name}" class="form-control" rows="3" ${required}></textarea>`;
break;
case 'select':
const options = field.options.map(option =>
`<option value="${option}" ${option === field.default ? 'selected' : ''}>${option}</option>`
).join('');
inputHtml = `<select id="${fieldId}" name="${field.name}" class="form-control" ${required}>${options}</select>`;
break;
}
// Wrap the input with its label, help text and a validation message placeholder
return `
<div class="field-item">
<label for="${fieldId}">${field.label} ${requiredMarker}</label>
${inputHtml}
<div class="field-help">${field.help}</div>
<div id="${fieldId}-validation" class="validation-message"></div>
</div>
`;
}
// ============================================================================
// FIELD VALIDATION
// ============================================================================
/**
* Validate a single input field
* @param {HTMLInputElement} input - The input element to validate
* @returns {boolean} True if field is valid, false otherwise
*/
function validateField(input) {
const validationDiv = document.getElementById(`${input.id}-validation`);
const isRequired = input.hasAttribute('required');
const value = input.value.trim();
// Clear previous validation state
input.classList.remove('valid', 'invalid');
validationDiv.textContent = '';
validationDiv.className = 'validation-message';
// Check if required field is empty
if (isRequired && !value) {
input.classList.add('invalid');
validationDiv.textContent = 'This field is required';
validationDiv.classList.add('error');
return false;
}
// Type-specific validation for number fields
if (value && input.type === 'number') {
const numValue = parseFloat(value);
if (isNaN(numValue) || numValue < 0) {
input.classList.add('invalid');
validationDiv.textContent = 'Please enter a valid positive number';
validationDiv.classList.add('error');
return false;
}
}
// Mark field as valid if it has a value
if (value) {
input.classList.add('valid');
validationDiv.textContent = '✓ Valid';
validationDiv.classList.add('success');
}
return true;
}
/**
* Clear validation state from an input field
* @param {HTMLInputElement} input - The input element to clear validation from
*/
function clearFieldValidation(input) {
input.classList.remove('valid', 'invalid');
const validationDiv = document.getElementById(`${input.id}-validation`);
if (validationDiv) {
validationDiv.textContent = '';
validationDiv.className = 'validation-message';
}
}
// ============================================================================
// OPERATION MANAGEMENT
// ============================================================================
/**
* Update the enabled/disabled state of the Add Operation button
* Button is only enabled when an operation is selected
*/
function updateAddButtonState() {
const currentSelect = currentTab === 'table-manager' ? tableOperationSelect : fileOperationSelect;
const hasSelection = currentSelect.value !== '';
addOperationBtn.disabled = !hasSelection;
}
/**
* Add the currently configured operation to the operations list
* Validates all fields before adding
*/
function addCurrentOperation() {
const currentSelect = currentTab === 'table-manager' ? tableOperationSelect : fileOperationSelect;
const operationKey = currentSelect.value;
if (!operationKey) return;
const operationDef = currentTab === 'table-manager' ?
TABLE_OPERATIONS[operationKey] : FILE_OPERATIONS[operationKey];
// Collect and validate field values
const config = { function: operationKey };
const container = currentTab === 'table-manager' ? tableDynamicFields : fileDynamicFields;
let isValid = true;
container.querySelectorAll('input, select, textarea').forEach(input => {
if (!validateField(input)) {
isValid = false;
}
const value = input.value.trim();
if (value) {
// Handle different field types and convert values appropriately
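// For example (illustrative values): "path/a, path/b" -> ["path/a", "path/b"],
// "7" -> 7, "True" -> true, "False" -> false; anything else stays a string.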
if (input.name === 'object_paths' && value.includes(',')) {
config[input.name] = value.split(',').map(s => s.trim());
} else if (input.type === 'number') {
config[input.name] = parseInt(value, 10);
} else if (value === 'True') {
config[input.name] = true;
} else if (value === 'False') {
config[input.name] = false;
} else {
config[input.name] = value;
}
}
});
// Abort if validation failed
if (!isValid) {
showToast('Please fix validation errors before adding the operation', 'error');
return;
}
// Create and add operation object
const operation = {
id: Date.now(),
type: currentTab === 'table-manager' ? 'table' : 'file',
manager: currentTab === 'table-manager' ? 'table' : 'file',
functionName: operationKey,
displayName: operationDef.name,
icon: operationDef.icon,
config: config
};
operations.push(operation);
renderOperationsList();
updateGenerateButtonState();
saveToLocalStorage();
showToast(`${operationDef.name} operation added successfully!`, 'success');
}
/**
* Remove an operation from the operations list
* @param {number} id - The unique ID of the operation to remove
*/
function removeOperation(id) {
operations = operations.filter(op => op.id !== id);
renderOperationsList();
updateGenerateButtonState();
saveToLocalStorage();
showToast('Operation removed', 'success');
}
/**
* Clear all operations from the list after confirmation
*/
function clearAllOperations() {
if (operations.length === 0) return;
if (confirm('Are you sure you want to remove all operations?')) {
operations = [];
renderOperationsList();
updateGenerateButtonState();
saveToLocalStorage();
showToast('All operations cleared', 'success');
}
}
/**
* Render the list of added operations in the UI
* Shows empty state if no operations exist
*/
function renderOperationsList() {
if (operations.length === 0) {
operationsList.innerHTML = `
<div class="empty-operations">
<p>No operations added yet. Configure and add operations to build your JSON.</p>
</div>
`;
return;
}
const html = operations.map(operation => `
<div class="operation-item">
<div class="operation-info">
<div class="operation-title">
<span class="operation-badge badge-${operation.type}">${operation.type}</span>
<i class="${operation.icon}"></i> ${operation.displayName}
</div>
<div class="operation-details">Function: ${operation.functionName} | Parameters: ${Object.keys(operation.config).filter(k => k !== 'function').length}</div>
</div>
<div class="operation-actions">
<button class="btn btn-sm btn-remove" onclick="removeOperation(${operation.id})">Remove</button>
</div>
</div>
`).join('');
operationsList.innerHTML = html;
}
/**
* Update the enabled/disabled state of the Generate JSON button
* Button is only enabled when at least one operation exists
*/
function updateGenerateButtonState() {
generateBtn.disabled = operations.length === 0;
}
// ============================================================================
// JSON GENERATION AND OUTPUT
// ============================================================================
/**
* Generate JSON configuration from the operations list
* Creates the final configuration object in Lakehouse Engine format
*/
function generateJSON() {
if (operations.length === 0) {
showToast('No operations to generate. Please add at least one operation.', 'error');
return;
}
showLoading();
// Use setTimeout to show loading animation
setTimeout(() => {
try {
const config = {
operations: operations.map(op => ({
manager: op.manager,
...op.config
}))
};
generatedConfig = config;
displayJSON(config);
enableActionButtons();
showToast('JSON configuration generated successfully!', 'success');
} catch (error) {
console.error('Generation error:', error);
showToast('Error generating JSON: ' + error.message, 'error');
} finally {
hideLoading();
}
}, 500);
}
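// Illustrative example of the shape produced by generateJSON(); the values are
// hypothetical and simply follow the operation definitions above:
//
// {
//   "operations": [
//     {
//       "manager": "table",
//       "function": "delete_where",
//       "table_or_view": "my_database.my_table",
//       "where_clause": "order_date < '2020-01-01'"
//     },
//     {
//       "manager": "file",
//       "function": "delete_objects",
//       "bucket": "my-bucket",
//       "object_paths": ["bronze/orders/part-0001", "bronze/orders/part-0002"],
//       "dry_run": true
//     }
//   ]
// }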
/**
* Display formatted JSON in the output area
* @param {Object} config - The configuration object to display
*/
function displayJSON(config) {
const formattedJSON = JSON.stringify(config, null, 2);
jsonOutput.textContent = formattedJSON;
highlightJSON();
}
/**
* Apply syntax highlighting to the displayed JSON
* Colors different JSON elements (keys, strings, numbers, booleans)
*/
function highlightJSON() {
const content = jsonOutput.textContent;
// Wrap keys, strings, numbers, booleans and nulls in spans so they can be
// colour-coded (the json-* class names here are an assumption; they are expected
// to come from the imported base stylesheet).
const highlighted = content
.replace(/"([^"]+)":/g, '<span class="json-key">"$1"</span>:')
.replace(/: "([^"]+)"/g, ': <span class="json-string">"$1"</span>')
.replace(/: (\d+)/g, ': <span class="json-number">$1</span>')
.replace(/: (true|false)/g, ': <span class="json-boolean">$1</span>')
.replace(/: null/g, ': <span class="json-null">null</span>');
jsonOutput.innerHTML = highlighted;
}
/**
* Format the generated JSON with proper indentation
* Re-formats and re-highlights the JSON output
*/
function formatJSON() {
if (!generatedConfig) {
showToast('No JSON to format. Generate configuration first.', 'error');
return;
}
try {
const formatted = JSON.stringify(generatedConfig, null, 2);
jsonOutput.textContent = formatted;
highlightJSON();
showToast('JSON formatted successfully!', 'success');
} catch (error) {
showToast('Error formatting JSON: ' + error.message, 'error');
}
}
/**
* Validate the generated JSON configuration
* Checks for required fields and proper structure
*/
function validateJSON() {
if (!generatedConfig) {
showValidationResult(false, 'No JSON to validate. Generate configuration first.');
return;
}
try {
const config = generatedConfig;
const errors = [];
// Check for operations array
if (!config.operations || !Array.isArray(config.operations)) {
errors.push('Missing or invalid operations array');
} else {
// Validate each operation
config.operations.forEach((op, index) => {
if (!op.manager) {
errors.push(`Operation ${index + 1}: Missing manager field`);
}
if (!op.function) {
errors.push(`Operation ${index + 1}: Missing function field`);
}
});
}
// Display validation results
if (errors.length === 0) {
showValidationResult(true, `JSON configuration is valid! Contains ${config.operations.length} operation(s).`);
} else {
showValidationResult(false, 'Validation errors: ' + errors.join(', '));
}
} catch (error) {
showValidationResult(false, 'Validation error: ' + error.message);
}
}
/**
* Display validation results to the user
* @param {boolean} isValid - Whether the validation passed
* @param {string} message - The validation message to display
*/
function showValidationResult(isValid, message) {
validationResult.className = `validation-result ${isValid ? 'valid' : 'invalid'}`;
validationResult.textContent = isValid ? '✅ ' + message : '❌ ' + message;
}
/**
* Copy the generated JSON to the clipboard
* Uses modern Clipboard API with fallback for older browsers
*/
async function copyToClipboard() {
if (!generatedConfig) {
showToast('No JSON to copy. Generate configuration first.', 'error');
return;
}
try {
const jsonString = JSON.stringify(generatedConfig, null, 2);
await navigator.clipboard.writeText(jsonString);
showToast('JSON copied to clipboard!', 'success');
} catch (error) {
// Fallback for older browsers
const textArea = document.createElement('textarea');
textArea.value = JSON.stringify(generatedConfig, null, 2);
document.body.appendChild(textArea);
textArea.select();
document.execCommand('copy');
document.body.removeChild(textArea);
showToast('JSON copied to clipboard!', 'success');
}
}
/**
* Download the generated JSON as a file
* Creates a timestamped filename and triggers browser download
*/
function downloadJSON() {
if (!generatedConfig) {
showToast('No JSON to download. Generate configuration first.', 'error');
return;
}
const jsonString = JSON.stringify(generatedConfig, null, 2);
const blob = new Blob([jsonString], { type: 'application/json' });
const url = URL.createObjectURL(blob);
// Generate filename with timestamp
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
const filename = `lakehouse-operations-${timestamp}.json`;
// Trigger download
const a = document.createElement('a');
a.href = url;
a.download = filename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
URL.revokeObjectURL(url);
showToast(`Configuration downloaded as ${filename}`, 'success');
}
// ============================================================================
// UI HELPER FUNCTIONS
// ============================================================================
/**
* Enable the JSON action buttons (copy, download)
* Called after JSON is successfully generated
*/
function enableActionButtons() {
copyBtn.disabled = false;
downloadBtn.disabled = false;
}
/**
* Show the loading spinner overlay
*/
function showLoading() {
loading.style.display = 'flex';
}
/**
* Hide the loading spinner overlay
*/
function hideLoading() {
loading.style.display = 'none';
}
/**
* Display a toast notification message
* @param {string} message - The message to display
* @param {string} type - The toast type ('success' or 'error')
*/
function showToast(message, type = 'success') {
toast.textContent = message;
toast.className = `toast ${type}`;
toast.classList.add('show');
// Auto-hide after 3 seconds
setTimeout(() => {
toast.classList.remove('show');
}, 3000);
}
// ============================================================================
// LOCAL STORAGE PERSISTENCE
// ============================================================================
/**
* Save the current operations and state to localStorage
* Allows users to resume work after page reload
*/
function saveToLocalStorage() {
const data = {
operations: operations,
currentTab: currentTab,
timestamp: Date.now()
};
localStorage.setItem('lakehouse-operations-generator', JSON.stringify(data));
}
/**
* Load previously saved operations and state from localStorage
* Only loads data saved within the last 24 hours
*/
function loadFromLocalStorage() {
try {
const saved = localStorage.getItem('lakehouse-operations-generator');
if (saved) {
const data = JSON.parse(saved);
// Only load if saved within last 24 hours
if (Date.now() - data.timestamp < 24 * 60 * 60 * 1000) {
operations = data.operations || [];
renderOperationsList();
updateGenerateButtonState();
if (data.currentTab) {
switchTab(data.currentTab);
}
}
}
} catch (error) {
console.warn('Could not load saved data:', error);
}
}
// ============================================================================
// KEYBOARD SHORTCUTS
// ============================================================================
/**
* Handle keyboard shortcuts for common actions
* - Ctrl/Cmd + G: Generate JSON
* - Ctrl/Cmd + A: Add operation (when operation selector focused)
* - Ctrl + Delete: Clear all operations
*/
document.addEventListener('keydown', function(event) {
// Ctrl+G or Cmd+G - Generate JSON
if ((event.ctrlKey || event.metaKey) && event.key === 'g') {
event.preventDefault();
generateJSON();
}
// Ctrl+A or Cmd+A when focused on operation selector - Add operation
if ((event.ctrlKey || event.metaKey) && event.key === 'a' &&
(event.target === tableOperationSelect || event.target === fileOperationSelect)) {
event.preventDefault();
addCurrentOperation();
}
// Ctrl + Delete - Clear operations
if (event.key === 'Delete' && event.ctrlKey && operations.length > 0) {
event.preventDefault();
clearAllOperations();
}
});
// ============================================================================
// FINAL INITIALIZATION
// ============================================================================
/**
* Initialize button states when page loads
* Ensures all buttons are in the correct enabled/disabled state
*/
document.addEventListener('DOMContentLoaded', function() {
updateAddButtonState();
updateGenerateButtonState();
});
================================================
FILE: lakehouse_engine_usage/managerhelper/operations-styles-mkdocs.css
================================================
/* Import base styles */
/* Operations-specific styles for MkDocs */
.managerhelper-wrapper .operation-selector {
background: #e3f2fd;
padding: 1.5rem;
border-radius: 4px;
margin-bottom: 2rem;
border-left: 4px solid #2196f3;
}
.managerhelper-wrapper .operation-selector label {
display: block;
margin-bottom: 0.5rem;
font-weight: 500;
color: #1565c0;
}
.managerhelper-wrapper .operation-selector select {
width: 100%;
padding: 12px 15px;
border: 1px solid #90caf9;
border-radius: 4px;
font-size: 0.875rem;
background: white;
color: #1565c0;
transition: all 0.3s ease;
}
.managerhelper-wrapper .operation-selector select:focus {
border-color: #2196f3;
outline: none;
box-shadow: 0 0 0 2px rgba(33, 150, 243, 0.2);
}
/* Operations List */
.managerhelper-wrapper .operations-list-container {
background: #fafafa;
border-radius: 4px;
margin: 0 2rem 2rem;
overflow: hidden;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12);
border: 1px solid #e0e0e0;
}
.managerhelper-wrapper .operations-header {
display: flex;
justify-content: space-between;
align-items: center;
padding: 1.5rem;
background: #2196f3;
color: white;
}
.managerhelper-wrapper .operations-header h3 {
margin: 0;
display: flex;
align-items: center;
gap: 10px;
}
.managerhelper-wrapper .operations-actions {
display: flex;
gap: 10px;
}
.managerhelper-wrapper .operations-list {
padding: 1rem;
max-height: 400px;
overflow-y: auto;
}
.managerhelper-wrapper .empty-operations {
text-align: center;
padding: 2rem;
color: rgba(0, 0, 0, 0.54);
}
.managerhelper-wrapper .empty-operations i {
font-size: 2rem;
margin-bottom: 1rem;
opacity: 0.5;
}
.managerhelper-wrapper .operation-item {
background: white;
border: 1px solid #e0e0e0;
border-radius: 4px;
padding: 1rem;
margin-bottom: 0.5rem;
display: flex;
justify-content: space-between;
align-items: flex-start;
transition: all 0.3s ease;
}
.managerhelper-wrapper .operation-item:hover {
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.15);
transform: translateY(-1px);
}
.managerhelper-wrapper .operation-info {
flex: 1;
}
.managerhelper-wrapper .operation-title {
font-weight: 500;
color: rgba(0, 0, 0, 0.87);
margin-bottom: 0.5rem;
display: flex;
align-items: center;
gap: 8px;
}
.managerhelper-wrapper .operation-title i {
color: #2196f3;
}
.managerhelper-wrapper .operation-details {
font-size: 0.8rem;
color: rgba(0, 0, 0, 0.54);
}
.managerhelper-wrapper .operation-actions {
display: flex;
gap: 8px;
margin-left: 1rem;
}
.managerhelper-wrapper .btn-edit {
background: #ffd54f;
color: rgba(0, 0, 0, 0.87);
border: none;
}
.managerhelper-wrapper .btn-edit:hover {
background: #ffca28;
}
.managerhelper-wrapper .btn-remove {
background: #f44336;
color: white;
border: none;
}
.managerhelper-wrapper .btn-remove:hover {
background: #d32f2f;
}
/* Field Groups */
.managerhelper-wrapper .field-group {
background: #fafafa;
padding: 1.5rem;
border-radius: 4px;
margin-bottom: 1.5rem;
border: 1px solid #e0e0e0;
border-left: 3px solid #4caf50;
}
.managerhelper-wrapper .field-group h4 {
color: rgba(0, 0, 0, 0.87);
margin-bottom: 1rem;
display: flex;
align-items: center;
gap: 8px;
font-weight: 500;
}
.managerhelper-wrapper .field-group h4 i {
color: #4caf50;
}
.managerhelper-wrapper .field-row {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 1rem;
margin-bottom: 1rem;
}
.managerhelper-wrapper .field-item {
display: flex;
flex-direction: column;
}
.managerhelper-wrapper .field-item label {
margin-bottom: 0.5rem;
font-weight: 500;
color: rgba(0, 0, 0, 0.87);
}
.managerhelper-wrapper .field-item input,
.managerhelper-wrapper .field-item select,
.managerhelper-wrapper .field-item textarea {
padding: 10px 12px;
border: 1px solid #bdbdbd;
border-radius: 4px;
font-size: 0.875rem;
transition: all 0.3s ease;
}
.managerhelper-wrapper .field-item input:focus,
.managerhelper-wrapper .field-item select:focus,
.managerhelper-wrapper .field-item textarea:focus {
outline: none;
border-color: #2196f3;
box-shadow: 0 0 0 2px rgba(33, 150, 243, 0.2);
}
.managerhelper-wrapper .field-help {
font-size: 0.8rem;
color: rgba(0, 0, 0, 0.54);
margin-top: 0.25rem;
}
.managerhelper-wrapper .field-required {
color: #f44336;
}
/* Form validation */
.managerhelper-wrapper .form-control.invalid {
border-color: #f44336;
background-color: #ffebee;
}
.managerhelper-wrapper .form-control.valid {
border-color: #4caf50;
background-color: #f1f8e9;
}
.managerhelper-wrapper .validation-message {
font-size: 0.8rem;
margin-top: 0.25rem;
}
.managerhelper-wrapper .validation-message.error {
color: #f44336;
}
.managerhelper-wrapper .validation-message.success {
color: #4caf50;
}
/* Operation Type Badges */
.managerhelper-wrapper .operation-badge {
display: inline-block;
padding: 0.25rem 0.5rem;
font-size: 0.75rem;
font-weight: 500;
border-radius: 0.25rem;
text-transform: uppercase;
margin-right: 0.5rem;
}
.managerhelper-wrapper .badge-table {
background-color: #e3f2fd;
color: #1565c0;
}
.managerhelper-wrapper .badge-file {
background-color: #fff3e0;
color: #ef6c00;
}
/* Responsive Design */
@media (max-width: 768px) {
.managerhelper-wrapper .field-row {
grid-template-columns: 1fr;
}
.managerhelper-wrapper .operations-header {
flex-direction: column;
gap: 1rem;
align-items: stretch;
}
.managerhelper-wrapper .operations-actions {
justify-content: center;
}
.managerhelper-wrapper .operation-item {
flex-direction: column;
gap: 1rem;
}
.managerhelper-wrapper .operation-actions {
margin-left: 0;
justify-content: flex-end;
}
.managerhelper-wrapper .operations-list-container {
margin: 0 1rem 1rem;
}
}
================================================
FILE: lakehouse_engine_usage/managerhelper/styles-mkdocs.css
================================================
/* MkDocs-scoped styles for Manager Helper with Material Design theme */
.managerhelper-wrapper {
font-family: 'Roboto', 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
line-height: 1.6;
color: #333;
margin: 0 -24px;
padding: 0;
}
.managerhelper-wrapper * {
box-sizing: border-box;
}
/* Header */
.managerhelper-wrapper .header {
text-align: center;
margin-bottom: 0;
padding: 2rem 1rem;
background: #2196f3;
color: white;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.managerhelper-wrapper .logo {
display: flex;
align-items: center;
justify-content: center;
gap: 15px;
margin-bottom: 10px;
}
.managerhelper-wrapper .logo i {
font-size: 2.5rem;
}
.managerhelper-wrapper .header h1 {
font-size: 2rem;
font-weight: 500;
margin: 0;
}
.managerhelper-wrapper .subtitle {
font-size: 1rem;
opacity: 0.9;
margin-top: 10px;
}
/* Navigation Tabs */
.managerhelper-wrapper .tabs {
display: flex;
gap: 0;
margin-bottom: 0;
border-bottom: 2px solid #e0e0e0;
overflow-x: auto;
background: #fafafa;
padding: 0 2rem;
}
.managerhelper-wrapper .tab-button {
display: flex;
align-items: center;
gap: 8px;
padding: 14px 24px;
border: none;
background: transparent;
cursor: pointer;
font-size: 0.875rem;
color: rgba(0, 0, 0, 0.6);
border-bottom: 2px solid transparent;
transition: all 0.3s ease;
white-space: nowrap;
font-weight: 500;
}
.managerhelper-wrapper .tab-button:hover {
color: rgba(0, 0, 0, 0.87);
background: rgba(33, 150, 243, 0.08);
}
.managerhelper-wrapper .tab-button.active {
color: #2196f3;
border-bottom-color: #ffd54f;
background: white;
}
.managerhelper-wrapper .tab-button i {
font-size: 1rem;
}
/* Form Container */
.managerhelper-wrapper .operations-container {
padding: 2rem;
}
.managerhelper-wrapper .tab-content {
display: none;
animation: fadeIn 0.3s ease-in;
}
.managerhelper-wrapper .tab-content.active {
display: block;
}
@keyframes fadeIn {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
.managerhelper-wrapper .section {
background: #fafafa;
padding: 2rem;
border-radius: 4px;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12);
border: 1px solid #e0e0e0;
}
.managerhelper-wrapper .section h2 {
display: flex;
align-items: center;
gap: 10px;
color: rgba(0, 0, 0, 0.87);
margin-bottom: 1.5rem;
font-size: 1.5rem;
font-weight: 500;
}
.managerhelper-wrapper .section h2 i {
color: #2196f3;
}
/* Form Groups */
.managerhelper-wrapper .form-group {
margin-bottom: 1.5rem;
}
.managerhelper-wrapper .form-group label {
display: block;
margin-bottom: 0.5rem;
font-weight: 500;
color: rgba(0, 0, 0, 0.87);
}
.managerhelper-wrapper .form-control {
width: 100%;
padding: 12px 15px;
border: 1px solid #bdbdbd;
border-radius: 4px;
font-size: 0.875rem;
transition: all 0.3s ease;
background: white;
}
.managerhelper-wrapper .form-control:focus {
outline: none;
border-color: #2196f3;
box-shadow: 0 0 0 2px rgba(33, 150, 243, 0.2);
}
.managerhelper-wrapper textarea.form-control {
resize: vertical;
font-family: 'Roboto Mono', 'Fira Code', monospace;
line-height: 1.5;
}
.managerhelper-wrapper .help-text {
display: block;
margin-top: 0.25rem;
color: #6c757d;
font-size: 0.8rem;
}
/* Dynamic Fields */
.managerhelper-wrapper .dynamic-fields {
min-height: 200px;
}
.managerhelper-wrapper .no-operation-selected {
text-align: center;
padding: 3rem 2rem;
color: #757575;
}
.managerhelper-wrapper .no-operation-selected i {
font-size: 3rem;
margin-bottom: 1rem;
opacity: 0.5;
}
/* Actions */
.managerhelper-wrapper .actions {
display: flex;
gap: 15px;
margin-bottom: 2rem;
flex-wrap: wrap;
justify-content: center;
padding: 1rem 2rem;
}
.managerhelper-wrapper .btn {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 12px 20px;
border: none;
border-radius: 4px;
font-size: 0.875rem;
cursor: pointer;
transition: all 0.3s ease;
text-decoration: none;
font-weight: 500;
}
.managerhelper-wrapper .btn:disabled {
opacity: 0.6;
cursor: not-allowed;
}
.managerhelper-wrapper .btn-primary {
background: #2196f3;
color: white;
}
.managerhelper-wrapper .btn-primary:hover:not(:disabled) {
background: #1976d2;
transform: translateY(-1px);
box-shadow: 0 2px 8px rgba(33, 150, 243, 0.4);
}
.managerhelper-wrapper .btn-secondary {
background: #757575;
color: white;
}
.managerhelper-wrapper .btn-secondary:hover:not(:disabled) {
background: #616161;
transform: translateY(-1px);
}
.managerhelper-wrapper .btn-outline {
background: transparent;
border: 2px solid #f44336;
color: #f44336;
}
.managerhelper-wrapper .btn-outline:hover {
background: #f44336;
color: white;
}
.managerhelper-wrapper .btn-sm {
padding: 6px 12px;
font-size: 0.8rem;
}
/* Output Container */
.managerhelper-wrapper .output-container {
background: #263238;
border-radius: 4px;
overflow: hidden;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
margin: 0 2rem 2rem;
}
.managerhelper-wrapper .output-header {
display: flex;
justify-content: space-between;
align-items: center;
padding: 1rem 1.5rem;
background: #37474f;
color: white;
border-bottom: 1px solid #455a64;
}
.managerhelper-wrapper .output-header h3 {
display: flex;
align-items: center;
gap: 10px;
margin: 0;
}
.managerhelper-wrapper .output-actions {
display: flex;
gap: 10px;
}
.managerhelper-wrapper .json-output {
background: #263238;
color: #eceff1;
padding: 1.5rem;
margin: 0;
font-family: 'Roboto Mono', 'Fira Code', 'Courier New', monospace;
font-size: 0.8rem;
line-height: 1.5;
overflow-x: auto;
min-height: 200px;
white-space: pre-wrap;
}
.managerhelper-wrapper .json-output:empty::before {
content: 'Generated JSON configuration will appear here...';
color: #90a4ae;
font-style: italic;
}
/* Validation Result */
.managerhelper-wrapper .validation-result {
padding: 1rem 1.5rem;
font-weight: 500;
display: none;
}
.managerhelper-wrapper .validation-result.valid {
background: #1b5e20;
color: #81c784;
display: block;
}
.managerhelper-wrapper .validation-result.invalid {
background: #b71c1c;
color: #ef5350;
display: block;
}
/* Loading Spinner */
.managerhelper-wrapper .loading {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.8);
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
z-index: 2000;
color: white;
}
.managerhelper-wrapper .spinner {
width: 50px;
height: 50px;
border: 5px solid rgba(255, 255, 255, 0.3);
border-top: 5px solid #ffd54f;
border-radius: 50%;
animation: spin 1s linear infinite;
margin-bottom: 1rem;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
/* Toast Notification */
.managerhelper-wrapper .toast {
position: fixed;
top: 80px;
right: 20px;
background: #4caf50;
color: white;
padding: 1rem 1.5rem;
border-radius: 4px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.3);
transform: translateX(400px);
transition: transform 0.3s ease;
z-index: 2001;
}
.managerhelper-wrapper .toast.show {
transform: translateX(0);
}
.managerhelper-wrapper .toast.error {
background: #f44336;
}
/* Responsive Design */
@media (max-width: 768px) {
.managerhelper-wrapper {
margin: 0 -16px;
}
.managerhelper-wrapper .header h1 {
font-size: 1.5rem;
}
.managerhelper-wrapper .tabs {
padding: 0 1rem;
}
.managerhelper-wrapper .operations-container {
padding: 1rem;
}
.managerhelper-wrapper .section {
padding: 1rem;
}
.managerhelper-wrapper .actions {
flex-direction: column;
padding: 1rem;
}
.managerhelper-wrapper .output-container {
margin: 0 1rem 1rem;
}
}
================================================
FILE: lakehouse_engine_usage/reconciliator/__init__.py
================================================
"""
.. include::reconciliator.md
"""
================================================
FILE: lakehouse_engine_usage/reconciliator/reconciliator.md
================================================
# Reconciliator
Checking if data reconciles, using this algorithm, is a matter of reading the **truth** data and the **current** data.
You can use any input specification compatible with the lakehouse engine to read **truth** or **current** data. On top
of that, you can pass a `truth_preprocess_query` and a `current_preprocess_query` so you can preprocess the data before
it goes into the actual reconciliation process. The reconciliation process is focused on joining **truth**
with **current** by all provided columns except the ones passed as `metrics`.
The table below shows what a simple reconciliation looks like:
| current_country | current_count | truth_country | truth_count | absolute_diff | perc_diff | yellow | red | recon_type |
|-----------------|---------------|---------------|-------------|---------------|-----------|--------|-----|------------|
| Sweden | 123 | Sweden | 120 | 3 | 0.025 | 0.1 | 0.2 | percentage |
| Germany | 2946 | Germany | 2946 | 0 | 0 | 0.1 | 0.2 | percentage |
| France | 2901 | France | 2901 | 0 | 0 | 0.1 | 0.2 | percentage |
| Belgium | 426 | Belgium | 425 | 1 | 0.002 | 0.1 | 0.2 | percentage |
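For example, in the first row the absolute difference between **current** (123) and **truth** (120) is 3, and the percentage difference is 3/120 = 0.025, which stays below both the yellow (0.1) and red (0.2) thresholds.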
The Reconciliator algorithm uses an ACON to configure its execution. You can find the meaning of each ACON property
in [ReconciliatorSpec object](../../reference/packages/core/definitions.md#packages.core.definitions.ReconciliatorSpec).
Below is an example of how to use the reconciliator.
```python
from lakehouse_engine.engine import execute_reconciliation
truth_query = """
SELECT
shipping_city,
sum(sales_order_qty) as qty,
order_date_header
FROM (
SELECT
ROW_NUMBER() OVER (
PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city
ORDER BY changed_on desc
) as rank1,
sales_order_header,
sales_order_item,
sales_order_qty,
order_date_header,
shipping_city
FROM truth -- truth is a locally accessible temp view created by the lakehouse engine
WHERE order_date_header = '2021-10-01'
) a
WHERE a.rank1 = 1
GROUP BY a.shipping_city, a.order_date_header
"""
current_query = """
SELECT
shipping_city,
sum(sales_order_qty) as qty,
order_date_header
FROM (
SELECT
ROW_NUMBER() OVER (
PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city
ORDER BY changed_on desc
) as rank1,
sales_order_header,
sales_order_item,
sales_order_qty,
order_date_header,
shipping_city
FROM current -- current is a locally accessible temp view created by the lakehouse engine
WHERE order_date_header = '2021-10-01'
) a
WHERE a.rank1 = 1
GROUP BY a.shipping_city, a.order_date_header
"""
acon = {
"metrics": [{"metric": "qty", "type": "percentage", "aggregation": "avg", "yellow": 0.05, "red": 0.1}],
"truth_input_spec": {
"spec_id": "truth",
"read_type": "batch",
"data_format": "csv",
"schema_path": "s3://my_data_product_bucket/artefacts/metadata/schemas/bronze/orders.json",
"options": {
"delimiter": "^",
"dateFormat": "yyyyMMdd",
},
"location": "s3://my_data_product_bucket/bronze/orders",
},
"truth_preprocess_query": truth_query,
"current_input_spec": {
"spec_id": "current",
"read_type": "batch",
"data_format": "delta",
"db_table": "my_database.orders",
},
"current_preprocess_query": current_query,
}
execute_reconciliation(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensor/__init__.py
================================================
"""
.. include::sensor.md
"""
================================================
FILE: lakehouse_engine_usage/sensor/delta_table/__init__.py
================================================
"""
.. include::delta_table.md
"""
================================================
FILE: lakehouse_engine_usage/sensor/delta_table/delta_table.md
================================================
# Sensor from Delta Table
This shows how to create a **Sensor to detect new data from a Delta Table**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets considered for the sensor, which are considered as available once the
sensor detects new data and status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table as `sensor_new_data`.
If you want to view some examples of usage you can visit the [delta upstream sensor table](../delta_upstream_sensor_table/delta_upstream_sensor_table.md) or the [jdbc sensor](../jdbc_table/jdbc_table.md).
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when
there is no new data detected from upstream.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and **SUGGESTED** behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Data will be consumed from a delta table in streaming mode,
so if there is any new data it will trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "streaming",
"data_format": "delta",
"db_table": "upstream_database.source_delta_table",
"options": {
"readChangeFeed": "true", # to read changes in upstream table
},
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensor/delta_upstream_sensor_table/__init__.py
================================================
"""
.. include::delta_upstream_sensor_table.md
"""
================================================
FILE: lakehouse_engine_usage/sensor/delta_upstream_sensor_table/delta_upstream_sensor_table.md
================================================
# Sensor from other Sensor Delta Table
This shows how to create a **Sensor to detect new data from another Sensor Delta Table**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets considered for the sensor, which are considered as available once the
sensor detects new data and status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table as `sensor_new_data`.
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when
there is no new data detected from upstream.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
It makes use of `generate_sensor_query` to generate the `preprocess_query`,
unlike the [delta_table](../delta_table/delta_table.md) scenario.
Data from the other sensor delta table is consumed in streaming mode. If there is any new data, it will trigger
the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor, generate_sensor_query
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "streaming",
"data_format": "delta",
"db_table": "upstream_database.lakehouse_engine_sensors",
"options": {
"readChangeFeed": "true",
},
},
"preprocess_query": generate_sensor_query("UPSTREAM_SENSOR_ID"),
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensor/file/__init__.py
================================================
"""
.. include::file.md
"""
================================================
FILE: lakehouse_engine_usage/sensor/file/file.md
================================================
# Sensor from Files
This shows how to create a **Sensor to detect new data from a File Location**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets considered for the sensor, which are considered as available once the sensor detects new data and status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table as `sensor_new_data`.
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when
there is no new data detected from upstream.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Using these sensors and consuming the data in streaming mode, if any new file is added to the file location,
it will automatically trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "streaming",
"data_format": "csv", # You can use any of the data formats supported by the lakehouse engine, e.g: "avro|json|parquet|csv|delta|cloudfiles"
"location": "s3://my_data_product_bucket/path",
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensor/jdbc_table/__init__.py
================================================
"""
.. include::jdbc_table.md
"""
================================================
FILE: lakehouse_engine_usage/sensor/jdbc_table/jdbc_table.md
================================================
# Sensor from JDBC
This shows how to create a **Sensor to detect new data from a JDBC table**.
## Configuration required to have a Sensor
- **jdbc_args**: Arguments of the JDBC upstream.
- **generate_sensor_query**: Generates a Sensor query to consume data from the upstream, this function can be used on `preprocess_query` ACON option.
- **sensor_id**: The unique identifier for the Sensor.
- **filter_exp**: Expression to filter incoming new data.
The placeholders `?upstream_key` and `?upstream_value` can be used (e.g., `?upstream_key > ?upstream_value`) so that they are replaced by the respective values from the sensor `control_db_table_name` for this specific sensor_id.
- **control_db_table_name**: Sensor control table name.
- **upstream_key**: the key of custom sensor information to control how to identify new data from the upstream (e.g., a time column in the upstream).
- **upstream_value**: the **first** upstream value to identify new data from the upstream (e.g., the value of a time present in the upstream). ***Note:*** This parameter only has effect in the first run to detect whether the upstream has new data. If it's empty, the default value applied is `-2147483647`.
- **upstream_table_name**: Table name to consume the upstream value. If it's empty the default value applied is `sensor_new_data`.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [Generic JDBC template with `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [Generic JDBC template with `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Data from JDBC is consumed in batch mode. If there is new data, based on the preprocess query over the source table, it will trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor, generate_sensor_query
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "JDBC_URL",
"table": "JDBC_DB_TABLE",
"properties": {
"user": "JDBC_USERNAME",
"password": "JDBC_PWD",
"driver": "JDBC_DRIVER",
},
},
"options": {
"compress": True,
},
},
"preprocess_query": generate_sensor_query(
sensor_id="MY_SENSOR_ID",
filter_exp="?upstream_key > '?upstream_value'",
control_db_table_name="my_database.lakehouse_engine_sensors",
upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",
),
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensor/kafka/__init__.py
================================================
"""
.. include::kafka.md
"""
================================================
FILE: lakehouse_engine_usage/sensor/kafka/kafka.md
================================================
# Sensor from Kafka
This shows how to create a **Sensor to detect new data from Kafka**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets considered for the sensor, which are considered as available once the
sensor detects new data and status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table as `sensor_new_data`.
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when
there is no new data detected from upstream.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Data from Kafka is consumed in streaming mode, so if there is any new data in the Kafka topic it will trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "streaming",
"data_format": "kafka",
"options": {
"kafka.bootstrap.servers": "KAFKA_SERVER",
"subscribe": "KAFKA_TOPIC",
"startingOffsets": "earliest",
"kafka.security.protocol": "SSL",
"kafka.ssl.truststore.location": "TRUSTSTORE_LOCATION",
"kafka.ssl.truststore.password": "TRUSTSTORE_PWD",
"kafka.ssl.keystore.location": "KEYSTORE_LOCATION",
"kafka.ssl.keystore.password": "KEYSTORE_PWD",
},
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensor/sap_bw_b4/__init__.py
================================================
"""
.. include::sap_bw_b4.md
"""
================================================
FILE: lakehouse_engine_usage/sensor/sap_bw_b4/sap_bw_b4.md
================================================
# Sensor from SAP
This shows how to create a **Sensor to detect new data from a SAP LOGCHAIN table**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets considered for the sensor, which are considered as available once the
sensor detects new data and status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom
query should be created with the source table as `sensor_new_data`.
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag representing if it should raise `NoNewDataException` when
there is no new data detected from upstream.
There is specific configuration required to have a Sensor consuming a SAP BW/B4 upstream.
The Lakehouse Engine provides two utility functions to make it easier to consume SAP as an upstream:
`generate_sensor_sap_logchain_query` and `generate_sensor_query`.
- **generate_sensor_sap_logchain_query**: This function creates a temporary table with the timestamps
from the SAP LOGCHAIN table, which is a process control table.
!!! note
This temporary table only lives during runtime, and it is related to the
SAP process control table but has no relationship or effect on the sensor control table.
- **chain_id**: SAP Chain ID process.
- **dbtable**: SAP LOGCHAIN db table name, default: `my_database.RSPCLOGCHAIN`.
- **status**: SAP Chain Status of your process, default: `G`.
- **engine_table_name**: Name of the temporary table created from the upstream data,
default: `sensor_new_data`.
This temporary table will be used as source in the `query` option.
- **generate_sensor_query**: Generates a Sensor query to consume data from the temporary table created in the `prepareQuery`.
- **sensor_id**: The unique identifier for the Sensor.
- **filter_exp**: Expression to filter incoming new data.
The placeholders `?upstream_key` and `?upstream_value` can be used (e.g., `?upstream_key > ?upstream_value`)
so that they are replaced by the respective values from the sensor `control_db_table_name`
for this specific sensor_id.
- **control_db_table_name**: Sensor control table name.
- **upstream_key**: The key of custom sensor information to control how to identify
new data from the upstream (e.g., a time column in the upstream).
- **upstream_value**: The **first** upstream value to identify new data from the
upstream (e.g., the value of a time present in the upstream).
!!! note
This parameter only has effect in the first run, to detect if the upstream has new data. If it's empty, the default value applied is `-2147483647`.
- **upstream_table_name**: Table name to consume the upstream value.
If it's empty, the default value applied is `sensor_new_data`.
!!! note
In case of using `generate_sensor_sap_logchain_query`, the default value for the temp table is `sensor_new_data`, so if passing a different value in the `engine_table_name`, this parameter should have the same value.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Data from SAP will be consumed in batch mode, so if there is any new data in the SAP process chain it will give the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor, generate_sensor_query, generate_sensor_sap_logchain_query
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "batch",
"data_format": "jdbc",
"options": {
"compress": True,
"driver": "JDBC_DRIVER",
"url": "JDBC_URL",
"user": "JDBC_USERNAME",
"password": "JDBC_PWD",
"prepareQuery": generate_sensor_sap_logchain_query(chain_id="CHAIN_ID", dbtable="JDBC_DB_TABLE"),
"query": generate_sensor_query(
sensor_id="MY_SENSOR_ID",
filter_exp="?upstream_key > '?upstream_value'",
control_db_table_name="my_database.lakehouse_engine_sensors",
upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",
),
},
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
When using `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensor/sensor.md
================================================
# Sensor
## What is it?
The lakehouse engine sensors are an abstraction over otherwise complex Spark code that can be executed in very small
single-node clusters to check if an upstream system or data product contains new data since the last execution of our
job. With this feature, we can trigger a job to run at more frequent intervals, and if the upstream does not contain new
data, then the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction,
Transformation, and Loading).
## How do Sensor-based jobs work?
With the sensors capability, data products in the lakehouse can sense if another data product or an upstream system (source
system) has new data since the last successful job. We accomplish this through the approach illustrated above, which
can be interpreted as follows:
1. A Data Product can check if Kafka, JDBC or any other source supported by the Lakehouse Engine Sensors contains new data, using the respective sensors;
2. The Sensor task may run in a very tiny single-node cluster to ensure cost
efficiency ([check sensor cost efficiency](#are-sensor-based-jobs-cost-efficient));
3. If the sensor has recognised that there is new data in the upstream, then you can start a different ETL Job Cluster
to process all the ETL tasks (data processing tasks).
4. In the same way, a different Data Product can sense if an upstream Data Product has new data by using 1 of 2 options:
1. **(Preferred)** Sense the upstream Data Product sensor control delta table;
2. Sense the upstream Data Product data files in s3 (files sensor) or any of their delta tables (delta table
sensor);
## The Structure and Relevance of the Data Product’s Sensors Control Table
The concept of a lakehouse engine sensor is based on a special delta table stored inside the data product that chooses
to opt in for a sensor-based job. That table is used to control the status of the various sensors implemented by that
data product. You can refer to the below table to understand the sensor delta table structure:
| Column Name | Type | Description |
|-----------------------------|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **sensor_id** | STRING | A unique identifier of the sensor in a specific job. This unique identifier is really important because it is used by the engine to identify if there is new data in the upstream. Each sensor in each job should have a different sensor_id. If you attempt to create 2 sensors with the same sensor_id, the engine will fail. |
| **assets** | ARRAY\<STRING\> | A list of assets (e.g., tables or dataset folder) that are considered as available to consume downstream after the sensor has status *PROCESSED_NEW_DATA*. |
| **status** | STRING | Status of the sensor. Can either be: *ACQUIRED_NEW_DATA* - when the sensor in a job has recognised that there is new data from the upstream, but the job where the sensor is was still not successfully executed; *PROCESSED_NEW_DATA* - when the job where the sensor is located has processed all the tasks in that job. |
| **status_change_timestamp** | STRING | Timestamp when the status has changed for the last time. |
| **checkpoint_location** | STRING | Base location of the Spark streaming checkpoint location, when applicable (i.e., when the type of sensor uses Spark streaming checkpoints to identify if the upstream has new data). E.g. Spark streaming checkpoints are used for Kafka, Delta and File sensors. |
| **upstream_key** | STRING | Upstream key (e.g., used to store an attribute name from the upstream so that new data can be detected automatically). This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database). |
| **upstream_value** | STRING | Upstream value (e.g., used to store the max attribute value from the upstream so that new data can be detected automatically). This is the value for upstream_key. This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the value of a field in the JDBC upstream that contains the maximum value that was processed by the sensor, and therefore useful for recognizing that there is new data in the upstream (e.g., the value of a timestamp attribute in the upstream that tells us when the record was loaded into the database). |
!!! note
To make use of the sensors you will need to add this table to your data product.
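If your data product does not have this table yet, a minimal creation sketch is shown below. It assumes a Databricks/Spark environment where `spark` is available, that `assets` is an array of strings, and it uses example database/table names; adapt everything to your data product.
```python
# Minimal sketch of creating the sensors control table described above (example names).
spark.sql(
    """
    CREATE TABLE IF NOT EXISTS my_database.lakehouse_engine_sensors (
        sensor_id STRING,
        assets ARRAY<STRING>,
        status STRING,
        status_change_timestamp STRING,
        checkpoint_location STRING,
        upstream_key STRING,
        upstream_value STRING
    ) USING DELTA
    """
)
```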
## How is it different from scheduled jobs?
Sensor-based jobs are still scheduled, but they can be scheduled with higher frequency, as they are more cost-efficient
than ramping up a multi-node cluster meant for heavy ETL, only to find out that the upstream does not have new
data.
## Are sensor-based jobs cost-efficient?
For the same schedule (e.g., 4 times a day), sensor-based jobs are more cost-efficient than scheduling a regular job, because with sensor-based jobs you can start a **very tiny single-node cluster**, and the bigger ETL cluster is only spun up if there is new data in the upstream.
Moreover, if you have very hard SLAs to comply with, you can also play with alternative architectures where you can have several sensors in a continuous (always running) cluster, which then keeps triggering the respective data processing jobs, whenever there is new data.
## Sensor Steps
1. Create your sensor task for the upstream source. Examples of available sources:
- [Delta Table](delta_table/delta_table.md)
- [Delta Upstream Sensor Table](delta_upstream_sensor_table/delta_upstream_sensor_table.md)
- [File](file/file.md)
- [JDBC](jdbc_table/jdbc_table.md)
- [Kafka](kafka/kafka.md)
- [SAP BW/B4](sap_bw_b4/sap_bw_b4.md)
2. Set up/execute your ETL task based on the Sensor condition
3. Update the Sensor Control table status with the [Update Sensor Status](update_sensor_status/update_sensor_status.md)
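A minimal sketch of how these three steps can be chained in a single flow is shown below. It assumes `fail_on_empty_result=False`; `my_sensor_acon` and `run_my_etl` are hypothetical placeholders for your sensor configuration and your processing logic.
```python
from lakehouse_engine.engine import execute_sensor, update_sensor_status

# 1. Sensor task: check the upstream for new data.
acquired_new_data = execute_sensor(acon={**my_sensor_acon, "fail_on_empty_result": False})

# 2. ETL task: only run the heavy processing when the sensor acquired new data.
if acquired_new_data:
    run_my_etl()  # hypothetical function representing your data processing tasks

    # 3. Update the Sensor Control table status so the next run looks for newer data.
    update_sensor_status(
        sensor_id="MY_SENSOR_ID",
        control_db_table_name="my_database.lakehouse_engine_sensors",
        status="PROCESSED_NEW_DATA",
        assets=["MY_SENSOR_ASSETS"],
    )
```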
================================================
FILE: lakehouse_engine_usage/sensor/update_sensor_status/__init__.py
================================================
"""
.. include::update_sensor_status.md
"""
================================================
FILE: lakehouse_engine_usage/sensor/update_sensor_status/update_sensor_status.md
================================================
# Update Sensor control delta table after processing the data
This shows how to **update the status of your Sensor after processing the new data**.
Here is an example on how to update the status of your sensor in the Sensors Control Table:
```python
from lakehouse_engine.engine import update_sensor_status
update_sensor_status(
sensor_id="MY_SENSOR_ID",
control_db_table_name="my_database.lakehouse_engine_sensors",
status="PROCESSED_NEW_DATA",
assets=["MY_SENSOR_ASSETS"]
)
```
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
================================================
FILE: lakehouse_engine_usage/sensors/__init__.py
================================================
"""
.. include::sensors.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/__init__.py
================================================
"""
.. include::heartbeat.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/delta_table/__init__.py
================================================
"""
.. include::delta_table.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/delta_table/delta_table.md
================================================
# Heartbeat Sensor for Delta Table
This shows how to create a Heartbeat Sensor Orchestrator to detect new data from a
Delta Table and trigger Databricks Workflows related to them.
## Configuration required to create an orchestration task for the delta table source
- **sensor_source**: Set to `delta_table` in the Heartbeat Control Table to identify this as a Delta table source.
- **data_format**: Set to `delta` to specify the data format for reading Delta tables.
- **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`).
- **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`).
- **options**: Configuration options for Delta table reading:
- `readChangeFeed`: Set to `"true"` to enable change data feed reading.
- **base_checkpoint_location**: `S3` path for storing checkpoint data (required if `sensor_read_type` is `streaming`).
- **domain**: Databricks workflows domain for job triggering.
- **token**: Databricks workflows token for authentication.
### Delta Table Data Feed CSV Configuration Entry
To check how the entry for a Delta table source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#control-table-reference-records).
## Code sample of listener and trigger
```python
from lakehouse_engine.engine import (
execute_sensor_heartbeat,
trigger_heartbeat_sensor_jobs,
)
# Create an ACON dictionary for all delta table source entries.
# This ACON dictionary is useful for passing parameters to heartbeat sensors.
heartbeat_sensor_config_acon = {
"sensor_source": "delta_table",
"data_format": "delta",
"heartbeat_sensor_db_table": "my_database.heartbeat_sensor",
"lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors",
"options": {
"readChangeFeed": "true",
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"domain": "DATABRICKS_WORKFLOWS_DOMAIN",
"token": "DATABRICKS_WORKFLOWS_TOKEN",
}
# Execute Heartbeat sensor and trigger jobs which have acquired new data.
execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon)
trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/heartbeat.md
================================================
# Heartbeat Sensor
## What is it?
The Heartbeat Sensor is a robust, configurable system designed to continuously monitor
upstream systems for new data. It enhances the existing sensor infrastructure by addressing
key limitations and providing significant improvements:
**Previous Sensor Architecture Limitations:**
- Required individual sensor configurations for each data source.
- Limited scalability when monitoring multiple upstream systems.
- Manual job triggering and dependency management.
- No centralized control or monitoring of sensor status.
- Difficult to manage complex multi-source dependencies.
**Heartbeat Sensor Enhancements:**
- **Centralized Management**: Single control table to manage all sensor sources and their dependencies.
- **Automated Job Orchestration**: Automatically triggers downstream Databricks jobs when new data is detected.
- **Multi-Source Support**: Handles diverse source types (SAP, Kafka, Delta Tables, Manual Uploads, Trigger Files) in one unified system.
- **Dependency Management**: Built-in hard/soft dependency validation before triggering jobs.
- **Scalable Architecture**: Efficiently processes multiple sensors in parallel.
- **Status Tracking**: Comprehensive lifecycle tracking from detection to job completion.
This provides a centralized, efficient, and automated mechanism to detect and trigger
downstream workflows with minimal user intervention.
## How Does the Heartbeat Sensor Work?
The Heartbeat Sensor operates on a pull-based approach using a single-node cluster that continuously monitors upstream systems. Here's how the system works:
### Core Architecture Components
**1. [Centralized Control Table](#control-table-schema)**
- Tracks and manages all data sources and their configurations.
- Dynamically populated by the [Heartbeat Data Feeder Job](heartbeat_sensor_data_feed/heartbeat_sensor_data_feed.md).
- Provides structured monitoring across various upstream systems.
**2. Persistent Heartbeat Job**
- Runs continuously or on a user-defined schedule.
- Supports both real-time and batch-style data monitoring.
- Efficiently processes multiple sensors in parallel.
**3. Sensor Integration Framework**
- Leverages existing sensor mechanisms for event detection.
- Creates appropriate Sensor ACONs based on source types.
- Returns `NEW_EVENT_AVAILABLE` status when new data is detected.
**4. Automated Job Orchestration**
- Triggers Databricks jobs via Job Run API when conditions are met.
- Validates dependencies before job execution.
- Maintains comprehensive audit trail of all operations.
### Operational Flow
1. **Continuous Monitoring**: The heartbeat cluster continuously polls configured sensor sources.
2. **Event Detection**: Checks each source for `NEW_EVENT_AVAILABLE` status.
3. **Dependency Validation**: Evaluates hard/soft dependencies before triggering jobs.
4. **Automatic Triggering**: Launches Databricks jobs when all conditions are satisfied.
5. **Status Management**: Updates control table throughout the entire lifecycle.
!!! warning "Pull-Based Architecture"
The system is designed for a "pull" approach, the same as the Sensor solution.
Downstream data product sensor clusters actively check for new events from the
upstream. Upstream systems do not require write permissions on the downstream
data product; the downstream only needs read access to the upstream.
### Control Table Schema
The Heartbeat Sensor Control Table is the central component that manages all sensor sources and their configurations. Below is the complete schema with detailed descriptions:
| Column name | Data Type | Description | Produced/Maintained by |
|------------------------------------|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|
| **sensor_source** | STRING | Upstream source system: `sap_b4` - SAP 4HANA; `sap_bw` - SAP BW; `delta_table`; `lmu_delta_table` - Lakehouse Manual Upload; `kafka`; `trigger_file`. | User/Developer |
| **sensor_id** | STRING | Unique upstream id or upstream reference. **sap_bw** or **sap_b4** source: SAP Chain Id, example: `SAP_CHAIN_ID_SAP_TABLE`. **delta_table** source: Delta table name along with database name, examples: `my_database_1.my_table`; `my_database_2.my_table_2`. **lmu_delta_table** source: Lakehouse Manual Upload Delta table name along with database name, example: `my_database.my_lmu_table`. **kafka** source: Kafka topic name starting with the data product prefix and then the topic name, example: `my_product: my.topic`. **trigger_file** source: Asset name/folder name under which the trigger file will be kept, example: `my_trigger`. | User/Developer |
| **sensor_read_type** | STRING | Sensor read type to fetch new event - can be batch or streaming. | User/Developer |
| **asset_description** | STRING | Description of Upstream source (It can be upstream name). | User/Developer |
| **upstream_key** | STRING | upstream key (an attribute name from the upstream so that new data can be detected automatically), example: `load_date`. This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database). **Note**: This attribute will be used in the `preprocess_query`, example: `SELECT * FROM sensor_new_data WHERE ?upstream_key >= current_date() - 7` will be rendered to `SELECT * FROM sensor_new_data WHERE load_date >= current_date() - 7` | User/Developer |
| **preprocess_query** | STRING | Query to filter data returned by the upstream. **Note**: This parameter is only needed when the upstream data have to be filtered, in this case a custom query should be created with the source table as `sensor_new_data`. Example: `SELECT * FROM sensor_new_data WHERE load_date >= current_date() - 7` | User/Developer |
| **latest_event_fetched_timestamp** | TIMESTAMP | Latest event fetched timestamp for upstream source. It will be updated each time as soon as NEW EVENT is available. | lakehouse-engine |
| **trigger_job_id** | STRING | Databricks Job Id of downstream application. Based on this, Job will get triggered by Heartbeat once new event is available. | User/Developer |
| **trigger_job_name** | STRING | Databricks Job Name. | User/Developer |
| **status** | STRING | Status of the orchestration: `NEW_EVENT_AVAILABLE` - once a new event is found; `IN_PROGRESS` - when the job gets triggered; `COMPLETED` - once the job completed successfully. | lakehouse-engine |
| **status_change_timestamp** | STRING | string containing the datetime when the status has changed. | lakehouse-engine |
| **job_start_timestamp** | TIMESTAMP | Start timestamp of downstream Job. It will get updated as soon as Job went into `IN_PROGRESS` job_status. | lakehouse-engine |
| **job_end_timestamp** | TIMESTAMP | End timestamp of downstream Job. It will get updated as soon as Job went into `COMPLETED` job_status. | lakehouse-engine |
| **job_state** | STRING | Current status of Job in Control table. `PAUSED` or `UNPAUSED`. If `PAUSED`, Sensor will **not look** for NEW EVENTS or Trigger the dependent job. | User/Developer |
| **dependency_flag** | STRING | `TRUE` - for HARD dependency; `FALSE` - for SOFT dependency. All dependent jobs need to complete successfully for a HARD dependency; records marked `FALSE` (SOFT) are ignored. Default - must be `TRUE` in case of no dependency. | User/Developer |
### Control Table Reference Records
The following table shows **example records** that demonstrate how different types of sensor sources are configured in the Heartbeat Sensor Control Table. These are **sample entries** that illustrate the structure and typical values for each column across various sensor source types (Kafka, Lakehouse Manual Upload Delta Table, SAP B4, Delta Table, and Trigger File).
**Purpose of these examples:**
- Show real-world configuration patterns for different sensor sources.
- Demonstrate how different statuses (`NEW_EVENT_AVAILABLE`, `IN_PROGRESS`, `null`) appear in the table.
- Illustrate the relationship between sensor sources and their corresponding Databricks jobs.
- Provide reference values for fields like `sensor_id`, `trigger_job_id`, and status timestamps.
!!! note
These are illustrative examples - your actual table will contain records specific to your data sources and job configurations.
| sensor_source | sensor_id | sensor_read_type | asset_description | upstream_key | preprocess_query | latest_event_fetched_timestamp | trigger_job_id | trigger_job_name | status | status_change_timestamp | job_start_timestamp | job_end_timestamp | job_state | dependency_flag |
|-----------------|------------------------------|------------------|----------------------------------------|--------------|------------------|--------------------------------|----------------|------------------------------------------|---------------------|--------------------------|--------------------------|-------------------|-----------|-----------------|
| kafka | my_product: my.topic | streaming | My product Kafka Topic | null | null | 2025-04-23T21:40:23.768Z | 111111111 | my-product-kafka_consumer_job | IN_PROGRESS | 2025-04-23T21:40:36.88Z | 2025-04-23T21:40:36.88Z | null | UNPAUSED | TRUE |
| lmu_delta_table | my_database.my_lmu_table | batch | My Lakehouse Manual Upload Delta Table | date | null | 2025-04-23T21:46:07.495Z | 222222222 | my-product-lmu_table_consumer_job | IN_PROGRESS | 2025-04-23T21:46:19.4Z | 2025-04-23T21:46:19.4Z | null | UNPAUSED | TRUE |
| sap_b4 | SAP_BW_CHAIN_ID_SAP_TABLE | batch | My SAP BW Chain Process | LOAD_DATE | null | 2025-04-23T21:35:10.643Z | 333333333 | my-product-sap_bw_consumer_job | IN_PROGRESS | 2025-04-23T21:35:29.248Z | 2025-04-23T21:35:29.248Z | null | UNPAUSED | TRUE |
| delta_table | my_database_1.my_table | streaming | My Delta Table from My Database 1 | null | null | 2025-04-23T22:11:56.384Z | 444444444 | my-product-delta_and_sap_b4_consumer_job | NEW_EVENT_AVAILABLE | 2025-04-23T22:11:56.384Z | null | null | UNPAUSED | TRUE |
| sap_b4 | SAP_4HANA_CHAIN_ID_SAP_TABLE | batch | My SAP 4HANA Chain Process | LOAD_DATE | null | null | 444444444 | my-product-delta_and_sap_b4_consumer_job | null | null | null | null | UNPAUSED | TRUE |
| trigger_file | my_trigger | streaming | My Trigger File | null | null | 2025-04-23T22:07:28.668Z | 555555555 | my-product-trigger_file_consumer_job | IN_PROGRESS | 2025-04-23T22:07:39.865Z | 2025-04-23T22:07:39.865Z | null | UNPAUSED | TRUE |
## How to Implement the Heartbeat Sensor
This step-by-step guide aims to help you through setting up, configuring, and operating the Heartbeat Sensor system from initial setup to ongoing monitoring and troubleshooting.
### Phase 1: Initial Setup and Configuration
#### Step 1: Define Your Data Source Configurations
Create a CSV file containing your data source configurations with the following required columns:
- `sensor_source`: Type of [sensor source](#control-table-schema).
- `sensor_id`: Unique upstream identifier or reference.
- `sensor_read_type`: How to read the sensor (batch or streaming).
- `asset_description`: Description of the upstream source.
- `upstream_key`: Attribute name for detecting new data automatically.
- `preprocess_query`: Optional query to filter upstream data.
- `trigger_job_id`: Databricks Job ID to trigger when new data is available.
- `trigger_job_name`: Databricks Job Name.
- `job_state`: Job control state (`UNPAUSED` or `PAUSED`).
- `dependency_flag`: Dependency type (`TRUE` for hard, `FALSE` for soft).
**Example CSV Configuration:**
```csv
sensor_source,sensor_id,sensor_read_type,asset_description,upstream_key,preprocess_query,trigger_job_id,trigger_job_name,job_state,dependency_flag
kafka,"my_product: my.topic",streaming,"My product Kafka Topic",,,"111111111","my-product-kafka_consumer_job",UNPAUSED,TRUE
delta_table,"my_database_1.my_table",streaming,"My Delta Table from My Database 1",,,"444444444","my-product-delta_and_sap_b4_consumer_job",UNPAUSED,TRUE
sap_b4,"SAP_4HANA_CHAIN_ID_SAP_TABLE",batch,"My SAP 4HANA Chain Process",LOAD_DATE,,"444444444","my-product-delta_and_sap_b4_consumer_job",UNPAUSED,TRUE
```
#### Step 2: Populate the Heartbeat Control Table
Use the [Heartbeat Sensor Control Table Data Feeder](heartbeat_sensor_data_feed/heartbeat_sensor_data_feed.md) to:
- Read your CSV configuration file.
- Validate the configuration entries.
- Ingest the data into the Heartbeat Control Table.
- Establish the foundation for monitoring and orchestration.
### Phase 2: Heartbeat Sensor Operation Workflow
#### Step 3: Continuous Monitoring and Event Detection
The Heartbeat sensor cluster (running on a single node) performs the following operations:
**3.1 Control Table Scanning**
- Scans the Heartbeat Control Table for eligible records.
- Filters records based on:
- Supported sensor sources: `Delta Table`, `Kafka`, `SAP BW/4HANA`, `Lakehouse Manual Upload`, `Trigger file`.
- Job state: `job_state = 'UNPAUSED'`.
- Status conditions: `status IS NULL` or `status = 'COMPLETED'`.
!!! important "Orchestration job recommendation"
We recommend running multiple tasks for each sensor source type in the same Heartbeat
Sensor Orchestrator, and only creating specific source-related jobs when really needed,
for example: real-time processing jobs or complex jobs that need to be triggered as
soon as the trigger condition is satisfied (all hard dependencies have `NEW_EVENT_AVAILABLE`).
!!! note "First-Time Execution"
For new sensor sources and IDs, the initial `status` will be `NULL`. This ensures that failed or paused jobs are not automatically triggered.
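Purely as an illustration of that eligibility filter (a hypothetical query, not the engine's internal code, assuming `spark` is available and using the `sensor_source` codes from the control table schema):
```python
# Hypothetical illustration of the eligibility scan described in step 3.1.
eligible_records = spark.sql(
    """
    SELECT *
    FROM my_database.heartbeat_sensor
    WHERE sensor_source IN ('delta_table', 'kafka', 'sap_bw', 'sap_b4', 'lmu_delta_table', 'trigger_file')
      AND job_state = 'UNPAUSED'
      AND (status IS NULL OR status = 'COMPLETED')
    """
)
```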
**3.2 Source-Specific Event Detection**
For each eligible record, the Heartbeat system:
- Creates the appropriate Sensor ACON (configuration) based on the `sensor_source` type.
- Passes the configuration to the respective Sensor Algorithm.
- The sensor algorithm checks for `NEW_EVENT_AVAILABLE` status for the specific `sensor_id`.
**Supported Source Types and Their Configuration:**
- **[Delta Table Sources](delta_table/delta_table.md)**: Monitor delta tables for new data.
- **[Kafka Sources](kafka/kafka.md)**: Monitor Kafka topics for new messages.
- **[Manual Table Sources](manual_table/manual_table.md)**: Monitor manually uploaded delta tables.
- **[SAP BW/B4 Sources](sap_bw_b4/sap_bw_b4.md)**: Monitor SAP systems for new process chains.
- **[Trigger File Sources](trigger_file/trigger_file.md)**: Monitor file systems for trigger files.
#### Step 4: Event Processing and Status Updates
**4.1 New Event Detection**
When a sensor detects new data:
- Updates the traditional sensor table (`lakehouse_engine_sensors`) with detection details.
- Returns `NEW_EVENT_AVAILABLE` status to the Heartbeat module.
**4.2 Heartbeat Control Table Updates**
The Heartbeat system updates the control table with:
- `status` → `NEW_EVENT_AVAILABLE`.
- `status_change_timestamp` → current timestamp.
- `latest_event_fetched_timestamp` → timestamp when event detection started.
#### Step 5: Dependency Validation and Job Triggering
**5.1 Dependency Evaluation Process**
Before triggering any jobs, the system evaluates dependencies:
1. **Filter Eligible Records**: Select records with `status = 'NEW_EVENT_AVAILABLE'`.
2. **Group by Job ID**: Group records by `trigger_job_id` to identify job dependencies.
3. **Evaluate Dependency Flags**:
- **TRUE (Hard Dependency)**: Job must have `NEW_EVENT_AVAILABLE` status.
- **FALSE (Soft Dependency)**: Job status is optional and doesn't block triggering.
4. **Aggregate and Validate**: Ensure all hard dependencies are satisfied before triggering.
**5.2 Triggering Logic Examples**
Consider Job 3 that depends on Job 1 and Job 2:
- **Scenario A**: Job 1 (HARD) + Job 2 (HARD) → Both must have `NEW_EVENT_AVAILABLE`.
- **Scenario B**: Job 1 (HARD) + Job 2 (SOFT) → Only Job 1 needs `NEW_EVENT_AVAILABLE`.
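To make these rules concrete, here is a simplified, hypothetical sketch of the evaluation (not the engine's internal code): group the control table records by `trigger_job_id` and only consider a job ready when every hard dependency has a new event.
```python
from collections import defaultdict


def jobs_ready_to_trigger(records):
    """Return the trigger_job_ids whose hard dependencies all have NEW_EVENT_AVAILABLE."""
    grouped = defaultdict(list)
    for record in records:
        grouped[record["trigger_job_id"]].append(record)

    ready = []
    for job_id, dependencies in grouped.items():
        hard = [d for d in dependencies if d["dependency_flag"] == "TRUE"]
        if hard and all(d["status"] == "NEW_EVENT_AVAILABLE" for d in hard):
            ready.append(job_id)
    return ready


# Scenario B: one hard and one soft dependency -> only the hard one must have a new event.
records = [
    {"trigger_job_id": "444444444", "dependency_flag": "TRUE", "status": "NEW_EVENT_AVAILABLE"},
    {"trigger_job_id": "444444444", "dependency_flag": "FALSE", "status": None},
]
print(jobs_ready_to_trigger(records))  # ['444444444']
```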
**5.3 Job Triggering via Databricks API**
For jobs that pass dependency validation:
- Trigger the corresponding `trigger_job_id` via Databricks Job Run API.
- Immediately update the control table:
- `status` → `IN_PROGRESS`.
- `job_start_timestamp` → current timestamp.
- `status_change_timestamp` → current timestamp.
### Phase 3: Job Execution and Completion
#### Step 6: Databricks Job Execution
Each triggered Databricks job must include:
- Your primary ETL/processing tasks.
- **Final Task**: [Update Heartbeat Sensor Status](update_heartbeat_sensor_status/update_heartbeat_sensor_status.md) task.
#### Step 7: Job Completion Handling
Upon successful job completion, the update status task:
- Sets `status` → `COMPLETED`.
- Updates `status_change_timestamp` → current timestamp.
- Sets `job_end_timestamp` → job completion timestamp.
### Phase 4: Error Handling and Recovery
#### Step 8: Job Failure Recovery Process
If a Databricks job fails, follow this recovery process:
1. **Identify the Issue**: Analyze job logs and error messages.
2. **Fix the Problem**: Address the underlying cause of the failure.
3. **Manual Recovery**: Execute at least one successful manual run of the job.
4. **Automatic Resumption**: Heartbeat will resume monitoring and triggering after successful completion.
!!! warning "Important Recovery Note"
The Heartbeat sensor will **not** resume checking failed jobs for new events until at least one successful completion occurs. This prevents repeated triggering of failing jobs.
#### Step 9: Monitoring and Maintenance
**9.1 Regular Monitoring Tasks**
- Monitor the Heartbeat Control Table for job statuses.
- Check for jobs stuck in `IN_PROGRESS` status.
- Verify dependency relationships are working correctly.
- Review `latest_event_fetched_timestamp` for regular updates.
**9.2 Control and Management**
- **Pause Jobs**: Set `job_state` to `PAUSED` to temporarily stop monitoring.
- **Resume Jobs**: Set `job_state` to `UNPAUSED` to resume monitoring.
- **Modify Dependencies**: Update `dependency_flag` to change dependency relationships.
### Phase 5: Advanced Configuration and Optimization
#### Step 10: Advanced Configuration Options
**10.1 Preprocess Queries**
Use `preprocess_query` to filter upstream data:
```sql
-- Example: Filter only recent records
SELECT * FROM sensor_new_data WHERE load_date >= current_date() - 7
```
**10.2 Parallel Processing**
The Heartbeat sensor automatically handles parallel processing of multiple sources, improving efficiency and scalability.
**10.3 Pull-Based Architecture Benefits**
- Upstream systems only need to grant read access to downstream data products.
- No write permissions are required from the upstream to the downstream.
- Improved security and access control.
### Troubleshooting Common Issues
| Issue | Symptoms | Solution |
|-----------------------------|-----------------------------------------------|--------------------------------------------------------------------|
| Jobs not triggering | Status remains `NEW_EVENT_AVAILABLE` | Check dependency flags and ensure all hard dependencies are met. |
| Jobs stuck in `IN_PROGRESS` | No completion status updates | Verify that jobs include the update status task as the final step. |
| Failed job recovery | Jobs not resuming after fixes | Manually run the job successfully at least once. |
| Missing events | `latest_event_fetched_timestamp` not updating | Check sensor source connectivity and configuration. |
This workflow ensures reliable, automated data pipeline orchestration with robust error handling and dependency management.
!!! note
Also have a look at the [Sensor documentation](../sensors.md) to have a better understanding of the underlying sensor mechanisms that power the Heartbeat Sensor system.
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/heartbeat_sensor_data_feed/__init__.py
================================================
"""
.. include::heartbeat_sensor_data_feed.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/heartbeat_sensor_data_feed/heartbeat_sensor_data_feed.md
================================================
# Heartbeat Sensor Control Table Data Feeder
## What is it?
It's a foundational component of the Heartbeat Sensor architecture. The primary purpose
is to populate and maintain the Control Table, which drives the entire heartbeat
monitoring process. The Data Feeder Job is responsible for creating and updating entries
in the Control Table. Each entry in the control table represents a sensor_source (e.g.,
SAP, Kafka, Delta) for a unique combination of `sensor_id` and `trigger_job_id`.
## Configuration required to execute heartbeat sensor data feed
- **heartbeat_sensor_data_feed_path**: S3 path to the CSV file containing the heartbeat sensor control table data (e.g., `"s3://my_data_product_bucket/local_data/heartbeat_sensor/heartbeat_sensor_control_table_data.csv"`).
- **heartbeat_sensor_control_table**: Database table name for the [Heartbeat sensor control table](../heartbeat.md#control-table-schema) (e.g., `"my_database.heartbeat_sensor"`).
## How it works
1. A Heartbeat Sensor data feed job in each data product needs to be created to facilitate any
addition, update and deletion of entries.
2. Entries need to be added in CSV file format, [as shown in the Heartbeat Sensor Control Table
schema section](../heartbeat.md#control-table-schema).
Other fields in the control table will be filled automatically at different stages of
the sensor process.
3. After adding/updating/deleting any entries in CSV, the Data feeder job needs to run again
to reflect the changes in the table.
## Code sample
```python
from lakehouse_engine.engine import execute_heartbeat_sensor_data_feed
execute_heartbeat_sensor_data_feed(
    heartbeat_sensor_data_feed_path="s3://my_data_product_bucket/local_data/heartbeat_sensor/heartbeat_sensor_control_table_data.csv",
heartbeat_sensor_control_table="my_database.heartbeat_sensor"
)
```
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/kafka/__init__.py
================================================
"""
.. include::kafka.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/kafka/kafka.md
================================================
# Heartbeat Sensor for Kafka
This shows how to create a Heartbeat Sensor Orchestrator to detect new data from
Kafka and trigger Databricks Workflows related to them.
## Configuration required to create an orchestration task for the kafka source
- **sensor_source**: Set to `kafka` in the Heartbeat Control Table to identify this as a Kafka source.
- **data_format**: Set to `kafka` to specify the data format for reading Kafka streams.
- **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`).
- **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`).
- **options**: Configuration options for Kafka reading:
- `readChangeFeed`: Set to `"true"` to enable change data feed reading.
- **kafka_configs**: Kafka connection and security configurations:
- `kafka_bootstrap_servers_list`: Kafka server endpoints.
- `kafka_ssl_truststore_location`: Path to SSL truststore.
- `truststore_pwd_secret_key`: Secret key for truststore password.
- `kafka_ssl_keystore_location`: Path to SSL keystore.
- `keystore_pwd_secret_key`: Secret key for keystore password.
- **kafka_secret_scope**: Databricks secret scope for Kafka credentials.
- **base_checkpoint_location**: S3 path for storing checkpoint data (required if `sensor_read_type` is `streaming`).
- **domain**: Databricks workflows domain for job triggering.
- **token**: Databricks workflows token for authentication.
### Kafka Data Feed CSV Configuration Entry
To check how the entry for a Kafka source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#control-table-reference-records).
**Additional Requirements for Kafka**:
The `sensor_id` follows a specific naming convention because you can have multiple data
products using the same configuration file with different Kafka configuration values:
- The value for the `sensor_id` will be the Kafka topic name prefixed with the product name
(or any other prefix), example: `my_product: my.topic`.
- How does it work? Heartbeat receives a dictionary containing all Kafka configurations by
product, which is passed as `kafka_configs` in the ACON.
Then it segregates the config based on the `sensor_id` value present in the heartbeat
control table.
Heartbeat splits the `sensor_id` on the colon (:): the first part is
considered the product name (in our case, `my_product`) and the second part
is the Kafka topic name (in our case, `my.topic`).
Finally, **it will make use of the product-related Kafka config from the `kafka_configs`**.
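As a small illustration of that lookup (a simplified, hypothetical sketch, not the engine's internal code):
```python
# Hypothetical illustration of how the sensor_id is split into product and topic.
kafka_configs = {"my_product": {"kafka_bootstrap_servers_list": "KAFKA_SERVER"}}

sensor_id = "my_product: my.topic"
product, topic = (part.strip() for part in sensor_id.split(":", 1))

# The product part selects the Kafka configuration to use for this sensor.
product_kafka_config = kafka_configs[product]
print(product, topic, product_kafka_config)
```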
## Code sample of listener and trigger
```python
from lakehouse_engine.engine import (
execute_sensor_heartbeat,
trigger_heartbeat_sensor_jobs,
)
# Kafka configurations for the product. We strongly recommend reading these values from an external configuration file.
kafka_configs = {
"my_product": {
"kafka_bootstrap_servers_list": "KAFKA_SERVER",
"kafka_ssl_truststore_location": "TRUSTSTORE_LOCATION",
"truststore_pwd_secret_key": "TRUSTSTORE_PWD",
"kafka_ssl_keystore_location": "KEYSTORE_LOCATION",
"keystore_pwd_secret_key": "KEYSTORE_PWD"
}
}
# Create an ACON dictionary for all kafka source entries.
# This ACON dictionary is useful for passing parameters to heartbeat sensors.
heartbeat_sensor_config_acon = {
"sensor_source": "kafka",
"data_format": "kafka",
"heartbeat_sensor_db_table": "my_database.heartbeat_sensor",
"lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors",
"options": {
"readChangeFeed": "true",
},
"kafka_configs": kafka_configs,
"kafka_secret_scope": "DB_SECRET_SCOPE",
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"domain": "DATABRICKS_WORKFLOWS_DOMAIN",
"token": "DATABRICKS_WORKFLOWS_TOKEN",
}
# Execute Heartbeat sensor and trigger jobs which have acquired new data.
execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon)
trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/manual_table/__init__.py
================================================
"""
.. include::manual_table.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/manual_table/manual_table.md
================================================
# Heartbeat Sensor for Manual Table
This shows how to create a Heartbeat Sensor Orchestrator to detect new data from a
Manual Table and trigger Databricks Workflows related to them.
**Manual Tables (Lakehouse Manual Upload)** are different from regular Delta tables because:
- **Data Upload Pattern**: Instead of continuous streaming or scheduled batch loads, data is manually uploaded by users at irregular intervals.
- **Detection Challenge**: Unlike regular Delta tables with change data feeds or append operations, manual tables are typically overwritten completely, making it harder to detect new data using standard mechanisms.
- **Custom Detection Logic**: Requires a special `upstream_key` (usually a timestamp column) to track when the table was last updated, since the table structure and most content may remain the same between uploads.
- **Sensor Source Type**: Uses `lmu_delta_table` instead of `delta_table` to indicate this special handling requirement.
## Configuration required to create an orchestration task for the manual table source
- **sensor_source**: Set to `lmu_delta_table` in the Heartbeat Control Table to identify this as a Lakehouse Manual Upload Delta table source.
- **data_format**: Set to `delta` to specify the data format for reading Delta tables.
- **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`).
- **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`).
- **domain**: Databricks workflows domain for job triggering.
- **token**: Databricks workflows token for authentication.
### Manual Tables Data Feed CSV Configuration Entry
To check how the entry for a manual table source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#control-table-reference-records).
**Additional Requirements for Manual Tables**:
- **sensor_id**: Needs to be filled with the Lakehouse Manual Upload Delta table name along with database, e.g., `my_database.my_manual_table`.
- **upstream_key**: Must specify the table date/timestamp column (typically named `date`) which indicates when the Lakehouse Manual Upload table was last overwritten. This is crucial for detecting new manual uploads.
**Setup Requirements**:
- A column named **`date`** must be added to your Lakehouse Manual Upload source Delta table.
- This column should contain a timestamp value in **YYYYMMDDHHMMSS** format.
- The value should be updated to `current_timestamp()` whenever new data is uploaded.
- This timestamp serves as the "fingerprint" that the sensor uses to detect new uploads.
!!! note
**`date` column requirement** (the column can have any other name, as long as it serves the
same purpose and is defined in the `upstream_key` CSV configuration entry): since manual tables
are typically overwritten entirely during each upload, standard Delta table change
detection mechanisms won't work. The Heartbeat sensor needs a reliable way to
determine if new data has been uploaded since the last check.
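For illustration, a minimal PySpark sketch of stamping such a column before overwriting the manual table (assuming `df` holds the manually prepared data and that a string representation of the timestamp is acceptable for your `upstream_key`):
```python
from pyspark.sql.functions import current_timestamp, date_format

(
    df.withColumn("date", date_format(current_timestamp(), "yyyyMMddHHmmss"))
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("my_database.my_lmu_table")
)
```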
## Code sample of listener and trigger
```python
from lakehouse_engine.engine import (
execute_sensor_heartbeat,
trigger_heartbeat_sensor_jobs,
)
# Create an ACON dictionary for all manual table source entries.
# This ACON dictionary is useful for passing parameters to heartbeat sensors.
heartbeat_sensor_config_acon = {
"sensor_source": "lmu_delta_table",
"data_format": "delta",
"heartbeat_sensor_db_table": "my_database.heartbeat_sensor",
"lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors",
"domain": "DATABRICKS_WORKFLOWS_DOMAIN",
"token": "DATABRICKS_WORKFLOWS_TOKEN",
}
# Execute Heartbeat sensor and trigger jobs which have acquired new data.
execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon)
trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/sap_bw_b4/__init__.py
================================================
"""
.. include::sap_bw_b4.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/sap_bw_b4/sap_bw_b4.md
================================================
# Heartbeat Sensor for SAP BW/B4
This shows how to create a Heartbeat Sensor Orchestrator to detect new data from
SAP BW/B4 and trigger Databricks Workflows related to them.
## Configuration required to create an orchestration task for the SAP BW/B4 source
- **sensor_source**: Set to `sap_b4` or `sap_bw` in the Heartbeat Control Table to identify this as a SAP source.
- **data_format**: Set to `jdbc` to specify the data format for reading from SAP via JDBC connection.
- **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`).
- **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`).
- **options**: JDBC connection configuration:
- `compress`: Set to `true` to enable compression.
- `driver`: JDBC driver class name.
- `url`: JDBC connection URL.
- `user`: JDBC username for authentication.
- `password`: JDBC password for authentication.
- **jdbc_db_table**: SAP logchain table name to query for process chain status.
- **domain**: Databricks workflows domain for job triggering.
- **token**: Databricks workflows token for authentication.
### SAP BW/B4 Data Feed CSV Configuration Entry
To check how the entry for a SAP BW/B4 source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#control-table-reference-records).
**Additional Requirements for SAP BW/4HANA**:
- The `sensor_id` needs to be filled with the Process Chain Name of the SAP object.
- `sensor_read_type` needs to be `batch` for SAP.
## Code sample of listener and trigger
```python
from lakehouse_engine.engine import (
execute_sensor_heartbeat,
trigger_heartbeat_sensor_jobs,
)
# Create an ACON dictionary for all SAP BW/B4 source entries.
# This ACON dictionary is useful for passing parameters to heartbeat sensors.
heartbeat_sensor_config_acon = {
"sensor_source": "sap_b4|sap_bw", # use sadp_b4 or sap_bw, depending on the source you are reading from
"data_format": "jdbc",
"heartbeat_sensor_db_table": "my_database.heartbeat_sensor",
"lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors",
"options": {
"compress": True,
"driver": "JDBC_DRIVER",
"url": "JDBC_URL",
"user": "JDBC_USERNAME",
"password": "JDBC_PSWD",
},
"jdbc_db_table": "SAP_LOGCHAIN_TABLE",
"domain": "DATABRICKS_WORKFLOWS_DOMAIN",
"token": "DATABRICKS_WORKFLOWS_TOKEN",
}
# Execute Heartbeat sensor and trigger jobs which have acquired new data.
execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon)
trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/trigger_file/__init__.py
================================================
"""
.. include::trigger_file.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/trigger_file/trigger_file.md
================================================
# Heartbeat Sensor for Trigger Files
This shows how to create a Heartbeat Sensor Orchestrator to detect new data from
Trigger Files and trigger Databricks Workflows related to them.
## Generating the trigger file
You need to create a task in the upstream pipeline to generate a trigger file,
indicating that the upstream source has completed and that the dependent job can be triggered.
The `sensor_id` used to generate the file must match the `sensor_id` specified in the
heartbeat control table. Check the [code example](#creation-of-the-trigger-file-following-the-sensorid-standard-code-example) below on how to generate the
trigger file.
#### Creation of the trigger file following the `sensor_id` standard code example:
```python
import datetime
sensor_id = "my_trigger"
file_root_path = "s3://my_data_product_bucket/triggers"
file_name = f"{sensor_id}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
file_path = "/".join([file_root_path, sensor_id, file_name])
# Write Trigger File to S3 location using dbutils
output = dbutils.fs.put(file_path, "Success")
```
## Configuration required to create an orchestration task for the trigger file source
- **sensor_source**: Set to `trigger_file` in the Heartbeat Control Table to identify this as a trigger file source.
- **data_format**: Set to `cloudfiles` to enable Spark Auto Loader functionality for monitoring trigger files. This format allows the system to automatically detect when new trigger files are available at the specified location and trigger the [corresponding `trigger_job_id`](../heartbeat.md#control-table-schema).
- **heartbeat_sensor_db_table**: Database table name for the Heartbeat sensor control table (e.g., `my_database.heartbeat_sensor`).
- **lakehouse_engine_sensor_db_table**: Database table name for the lakehouse engine sensors (e.g., `my_database.lakehouse_engine_sensors`).
- **options**: Cloud files configuration:
- `cloudFiles.format`: Set to `"csv"` to specify the file format.
- **schema_dict**: Schema definition for the trigger files:
- Defines the structure with fields like `file_name` (string) and `file_modification_time` (timestamp).
- **base_checkpoint_location**: S3 path for storing checkpoint data (required if `sensor_read_type` is `streaming`).
- **base_trigger_file_location**: S3 path where trigger files are located.
- **domain**: Databricks workflows domain for job triggering.
- **token**: Databricks workflows token for authentication.
### Trigger File Data Feed CSV Configuration Entry
To check how the entry for a trigger file source should look in the Heartbeat Control Table, [check it here](../heartbeat.md#control-table-reference-records).
**Additional Requirements for Trigger File**:
- The `sensor_id` will match the name used to create the trigger file. For example, if
the trigger file is named `my_trigger_YYYYMMDDHHMMSS.txt`, then the sensor_id will be
`my_trigger`.
## Code sample of listener and trigger
```python
from lakehouse_engine.engine import (
execute_sensor_heartbeat,
trigger_heartbeat_sensor_jobs,
)
# Create an ACON dictionary for all trigger file source entries.
# This ACON dictionary is useful for passing parameters to heartbeat sensors.
heartbeat_sensor_config_acon = {
"sensor_source": "trigger_file",
"data_format": "cloudfiles",
"heartbeat_sensor_db_table": "my_database.heartbeat_sensor",
"lakehouse_engine_sensor_db_table": "my_database.lakehouse_engine_sensors",
"options": {
"cloudFiles.format": "csv",
},
"schema_dict": {
"type": "struct",
"fields": [
{
"name": "file_name",
"type": "string",
},
{
"name": "file_modification_time",
"type": "timestamp",
},
],
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"base_trigger_file_location": "s3://my_data_product_bucket/triggers",
"domain": "DATABRICKS_WORKFLOWS_DOMAIN",
"token": "DATABRICKS_WORKFLOWS_TOKEN",
}
# Execute Heartbeat sensor and trigger jobs which have acquired new data.
execute_sensor_heartbeat(acon=heartbeat_sensor_config_acon)
trigger_heartbeat_sensor_jobs(heartbeat_sensor_config_acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/update_heartbeat_sensor_status/__init__.py
================================================
"""
.. include::update_heartbeat_sensor_status.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/heartbeat/update_heartbeat_sensor_status/update_heartbeat_sensor_status.md
================================================
# Update Heartbeat Sensor control delta table after processing the data
This shows how to update the status of your Heartbeat Sensor after executing the pipeline.
The `update_heartbeat_sensor_status` function is **critical for the Heartbeat Sensor lifecycle** because:
- **Completes the monitoring cycle**: When a Heartbeat sensor triggers a job, it sets the status to `IN_PROGRESS`. Without this update, the sensor would never know the job completed successfully.
- **Enables continuous monitoring**: Only after a job is marked as `COMPLETED` will the Heartbeat sensor resume monitoring that source for new events.
- **Prevents stuck jobs**: Without proper status updates, failed jobs remain in `IN_PROGRESS` status indefinitely, blocking future job triggers.
- **Supports recovery process**: This is essential for the [Job Failure Recovery Process](../heartbeat.md#step-8-job-failure-recovery-process) described in the main Heartbeat documentation, where at least one successful run must be completed before the sensor resumes monitoring.
!!! note
**When to use**: This function must be called as the **final task** in every Databricks job that is orchestrated by the Heartbeat Sensor to properly update the `status` to `COMPLETED` and record the job completion timestamp.
## Configuration required to update heartbeat sensor status
- **job_id**: The unique identifier of the Databricks job that was triggered by the Heartbeat sensor (e.g., `"MY_JOB_ID"`).
- **heartbeat_sensor_control_table**: Database table name for the Heartbeat sensor control table (e.g., `"my_database.heartbeat_sensor"`).
- **sensor_table**: Database table name for the lakehouse engine sensors table (e.g., `"my_database.lakehouse_engine_sensors"`).
## Code sample
Code sample on how to update the status of your sensor in the Heartbeat Sensors Control Table:
```python
from lakehouse_engine.engine import update_heartbeat_sensor_status
update_heartbeat_sensor_status(
job_id="MY_JOB_ID",
heartbeat_sensor_control_table="my_database.heartbeat_sensor",
sensor_table="my_database.lakehouse_engine_sensors",
)
```
If you want to know more please visit the definition of the class [here](../../../../reference/packages/core/definitions.md#packages.core.definitions.HeartbeatConfigSpec).
================================================
FILE: lakehouse_engine_usage/sensors/sensor/__init__.py
================================================
"""
.. include::sensor.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/sensor/delta_table/__init__.py
================================================
"""
.. include::delta_table.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/sensor/delta_table/delta_table.md
================================================
# Sensor from Delta Table
This shows how to create a **Sensor to detect new data from a Delta Table**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets for the sensor, which are considered available once the
sensor detects new data and the status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in that case, a custom query should be created with the source table as `sensor_new_data`.
If you want to view some examples of usage you can visit the [delta upstream sensor table](../delta_upstream_sensor_table/delta_upstream_sensor_table.md) or the [jdbc sensor](../jdbc_table/jdbc_table.md).
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag indicating whether a `NoNewDataException` should be raised when
no new data is detected from the upstream.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
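For illustration, a minimal sketch of such a `preprocess_query` (the `event_date` column is a hypothetical example, not something provided by the engine):
```python
# Hypothetical preprocess_query: the engine exposes the newly detected rows as the
# temporary view `sensor_new_data`, which the custom query can filter.
preprocess_query = """
    SELECT *
    FROM sensor_new_data
    WHERE event_date >= date_sub(current_date(), 1)
"""
```
This string would then be passed as the `preprocess_query` value of the ACON used in the scenarios below.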
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and **SUGGESTED** behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Data will be consumed from a delta table in streaming mode,
so if there is any new data it will trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "streaming",
"data_format": "delta",
"db_table": "upstream_database.source_delta_table",
"options": {
"readChangeFeed": "true", # to read changes in upstream table
},
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
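As an illustrative sketch (assuming a Databricks notebook context, where `dbutils` is available), the returned flag can then gate the downstream logic:
```python
if acquired_data:
    print("New data detected upstream, proceeding with the next tasks.")
else:
    # No new data: end the notebook early so the heavier ETL steps are skipped.
    # dbutils is only available inside Databricks notebooks.
    dbutils.notebook.exit("No new data detected upstream.")
```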
================================================
FILE: lakehouse_engine_usage/sensors/sensor/delta_upstream_sensor_table/__init__.py
================================================
"""
.. include::delta_upstream_sensor_table.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/sensor/delta_upstream_sensor_table/delta_upstream_sensor_table.md
================================================
# Sensor from other Sensor Delta Table
This shows how to create a **Sensor to detect new data from another Sensor Delta Table**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets covered by the sensor, which are treated as available once the
sensor detects new data and the status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table named `sensor_new_data`.
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag indicating whether a `NoNewDataException` should be raised when
no new data is detected from the upstream.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Unlike the [delta_table](../delta_table/delta_table.md) sensor, it makes use of `generate_sensor_query`
to generate the `preprocess_query`.
Data from the other sensor delta table will be consumed in streaming mode. If there is any new data, it will trigger
the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor, generate_sensor_query
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "streaming",
"data_format": "delta",
"db_table": "upstream_database.lakehouse_engine_sensors",
"options": {
"readChangeFeed": "true",
},
},
"preprocess_query": generate_sensor_query("UPSTREAM_SENSOR_ID"),
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/sensor/file/__init__.py
================================================
"""
.. include::file.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/sensor/file/file.md
================================================
# Sensor from Files
This shows how to create a **Sensor to detect new data from a File Location**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets covered by the sensor, which are treated as available once the sensor detects new data and the status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table named `sensor_new_data`.
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag indicating whether a `NoNewDataException` should be raised when
no new data is detected from the upstream.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Using these sensors and consuming the data in streaming mode, if any new file is added to the file location
it will automatically trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "streaming",
"data_format": "csv", # You can use any of the data formats supported by the lakehouse engine, e.g: "avro|json|parquet|csv|delta|cloudfiles"
"location": "s3://my_data_product_bucket/path",
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/sensor/jdbc_table/__init__.py
================================================
"""
.. include::jdbc_table.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/sensor/jdbc_table/jdbc_table.md
================================================
# Sensor from JDBC
This shows how to create a **Sensor to detect new data from a JDBC table**.
## Configuration required to have a Sensor
- **jdbc_args**: Arguments of the JDBC upstream.
- **generate_sensor_query**: Generates a Sensor query to consume data from the upstream; this function can be used in the `preprocess_query` ACON option.
- **sensor_id**: The unique identifier for the Sensor.
- **filter_exp**: Expression to filter incoming new data.
The placeholders `?upstream_key` and `?upstream_value` can be used (e.g., `?upstream_key > ?upstream_value`) so that they are replaced by the respective values from the sensor `control_db_table_name` for this specific sensor_id.
- **control_db_table_name**: Sensor control table name.
- **upstream_key**: The key of custom sensor information used to control how new data is identified from the upstream (e.g., a time column in the upstream).
- **upstream_value**: The **first** upstream value used to identify new data from the upstream (e.g., the value of a time present in the upstream). ***Note:*** This parameter only takes effect in the first run, to detect whether the upstream has new data. If it's empty, the default value applied is `-2147483647` (see the sketch after this list).
- **upstream_table_name**: Table name to consume the upstream value. If it's empty, the default value applied is `sensor_new_data`.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
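For illustration, a minimal sketch of generating such a `preprocess_query`, bootstrapping the first run with an explicit `upstream_value` (the column name and timestamp value are hypothetical placeholders):
```python
from lakehouse_engine.engine import generate_sensor_query

# RECORD_TIMESTAMP and the bootstrap value below are hypothetical placeholders.
preprocess_query = generate_sensor_query(
    sensor_id="MY_SENSOR_ID",
    filter_exp="?upstream_key > '?upstream_value'",
    control_db_table_name="my_database.lakehouse_engine_sensors",
    upstream_key="RECORD_TIMESTAMP",
    upstream_value="20240101000000",  # only used on the first run, before the control table has a value
)
```
The resulting query string is then passed as the `preprocess_query` value of the ACON, as in the scenario below.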
## Scenarios
This covers the following scenarios of using the Sensor:
1. [Generic JDBC template with `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [Generic JDBC template with `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Data from JDBC will be consumed in batch mode. If there is new data, based on the preprocess query applied to the source table, it will trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor, generate_sensor_query
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "batch",
"data_format": "jdbc",
"jdbc_args": {
"url": "JDBC_URL",
"table": "JDBC_DB_TABLE",
"properties": {
"user": "JDBC_USERNAME",
"password": "JDBC_PWD",
"driver": "JDBC_DRIVER",
},
},
"options": {
"compress": True,
},
},
"preprocess_query": generate_sensor_query(
sensor_id="MY_SENSOR_ID",
filter_exp="?upstream_key > '?upstream_value'",
control_db_table_name="my_database.lakehouse_engine_sensors",
upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",
),
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/sensor/kafka/__init__.py
================================================
"""
.. include::kafka.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/sensor/kafka/kafka.md
================================================
# Sensor from Kafka
This shows how to create a **Sensor to detect new data from Kafka**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets covered by the sensor, which are treated as available once the
sensor detects new data and the status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom query should be created with the source table named `sensor_new_data`.
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag indicating whether a `NoNewDataException` should be raised when
no new data is detected from the upstream.
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Data from Kafka will be consumed in streaming mode, so if there is any new data in the Kafka topic it will trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "streaming",
"data_format": "kafka",
"options": {
"kafka.bootstrap.servers": "KAFKA_SERVER",
"subscribe": "KAFKA_TOPIC",
"startingOffsets": "earliest",
"kafka.security.protocol": "SSL",
"kafka.ssl.truststore.location": "TRUSTSTORE_LOCATION",
"kafka.ssl.truststore.password": "TRUSTSTORE_PWD",
"kafka.ssl.keystore.location": "KEYSTORE_LOCATION",
"kafka.ssl.keystore.password": "KEYSTORE_PWD",
},
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/sensor/sap_bw_b4/__init__.py
================================================
"""
.. include::sap_bw_b4.md
"""
================================================
FILE: lakehouse_engine_usage/sensors/sensor/sap_bw_b4/sap_bw_b4.md
================================================
# Sensor from SAP
This shows how to create a **Sensor to detect new data from a SAP LOGCHAIN table**.
## Configuration required to have a Sensor
- **sensor_id**: A unique identifier of the sensor in a specific job.
- **assets**: List of assets considered for the sensor, which are considered as available once the
sensor detects new data and status is `ACQUIRED_NEW_DATA`.
- **control_db_table_name**: Name of the sensor control table.
- **input_spec**: Input spec with the upstream source.
- **preprocess_query**: Query to filter data returned by the upstream.
!!! note
This parameter is only needed when the upstream data has to be filtered; in this case, a custom
query should be created with the source table named `sensor_new_data`.
- **base_checkpoint_location**: Spark streaming checkpoints to identify if the upstream has new data.
- **fail_on_empty_result**: Flag indicating whether a `NoNewDataException` should be raised when
no new data is detected from the upstream.
The following configuration is specific to a Sensor consuming a SAP BW/B4 upstream.
The Lakehouse Engine provides two utility functions to make it easier to consume SAP as an upstream:
`generate_sensor_sap_logchain_query` and `generate_sensor_query`.
- **generate_sensor_sap_logchain_query**: This function creates a temporary table
with the timestamps from the SAP LOGCHAIN table, which is a process control table.
!!! note
This temporary table only lives during runtime, and it is related to the
SAP process control table, but it has no relationship with, or effect on, the sensor control table.
- **chain_id**: SAP Chain ID process.
- **dbtable**: SAP LOGCHAIN db table name, default: `my_database.RSPCLOGCHAIN`.
- **status**: SAP Chain Status of your process, default: `G`.
- **engine_table_name**: Name of the temporary table created from the upstream data,
default: `sensor_new_data`.
This temporary table will be used as source in the `query` option.
- **generate_sensor_query**: Generates a Sensor query to consume data from the temporary table created by the `prepareQuery` option.
- **sensor_id**: The unique identifier for the Sensor.
- **filter_exp**: Expression to filter incoming new data.
The placeholders `?upstream_key` and `?upstream_value` can be used (e.g., `?upstream_key > ?upstream_value`)
so that they are replaced by the respective values from the sensor `control_db_table_name`
for this specific sensor_id.
- **control_db_table_name**: Sensor control table name.
- **upstream_key**: The key of custom sensor information used to control how new data is identified
from the upstream (e.g., a time column in the upstream).
- **upstream_value**: The **first** upstream value used to identify new data from the
upstream (e.g., the value of a time present in the upstream).
.. note:: This parameter only takes effect in the first run, to detect whether the upstream has new data. If it's empty, the default value applied is `-2147483647`.
- **upstream_table_name**: Table name to consume the upstream value.
If it's empty, the default value applied is `sensor_new_data`.
.. note:: When using `generate_sensor_sap_logchain_query`, the default value for the temporary table is `sensor_new_data`, so if you pass a different value in `engine_table_name`, this parameter should have the same value (see the sketch after this list).
If you want to know more please visit the definition of the class [here](../../../reference/packages/core/definitions.md#packages.core.definitions.SensorSpec).
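For illustration, a minimal sketch (with hypothetical placeholder values) of keeping `engine_table_name` and `upstream_table_name` in sync when a custom temporary table name is used:
```python
from lakehouse_engine.engine import generate_sensor_query, generate_sensor_sap_logchain_query

# "my_sap_new_data" is a hypothetical custom temporary table name.
prepare_query = generate_sensor_sap_logchain_query(
    chain_id="CHAIN_ID",
    dbtable="my_database.RSPCLOGCHAIN",
    engine_table_name="my_sap_new_data",
)
sensor_query = generate_sensor_query(
    sensor_id="MY_SENSOR_ID",
    filter_exp="?upstream_key > '?upstream_value'",
    control_db_table_name="my_database.lakehouse_engine_sensors",
    upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",
    upstream_table_name="my_sap_new_data",  # must match engine_table_name above
)

# These strings would then be passed as the "prepareQuery" and "query" options of
# the JDBC input_spec, as in the scenario below.
```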
## Scenarios
This covers the following scenarios of using the Sensor:
1. [The `fail_on_empty_result=True` (the default and SUGGESTED behaviour).](#fail_on_empty_result-as-true-default-and-suggested)
2. [The `fail_on_empty_result=False`.](#fail_on_empty_result-as-false)
Data from SAP will be consumed in batch mode via JDBC, so if there is any new data in the SAP LOGCHAIN table it will trigger the condition to proceed to the next task.
### `fail_on_empty_result` as True (default and SUGGESTED)
```python
from lakehouse_engine.engine import execute_sensor, generate_sensor_query, generate_sensor_sap_logchain_query
acon = {
"sensor_id": "MY_SENSOR_ID",
"assets": ["MY_SENSOR_ASSETS"],
"control_db_table_name": "my_database.lakehouse_engine_sensors",
"input_spec": {
"spec_id": "sensor_upstream",
"read_type": "batch",
"data_format": "jdbc",
"options": {
"compress": True,
"driver": "JDBC_DRIVER",
"url": "JDBC_URL",
"user": "JDBC_USERNAME",
"password": "JDBC_PWD",
"prepareQuery": generate_sensor_sap_logchain_query(chain_id="CHAIN_ID", dbtable="JDBC_DB_TABLE"),
"query": generate_sensor_query(
sensor_id="MY_SENSOR_ID",
filter_exp="?upstream_key > '?upstream_value'",
control_db_table_name="my_database.lakehouse_engine_sensors",
upstream_key="UPSTREAM_COLUMN_TO_IDENTIFY_NEW_DATA",
),
},
},
"base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
"fail_on_empty_result": True,
}
execute_sensor(acon=acon)
```
### `fail_on_empty_result` as False
With `fail_on_empty_result=False`, the `execute_sensor` function returns a `boolean` indicating whether it
has acquired new data. This value can be used to decide whether to execute the next steps.
```python
from lakehouse_engine.engine import execute_sensor
acon = {
[...],
"fail_on_empty_result": False
}
acquired_data = execute_sensor(acon=acon)
```
================================================
FILE: lakehouse_engine_usage/sensors/sensor/sensor.md
================================================
# Sensor
## What is it?
The lakehouse engine sensors are an abstraction over otherwise complex Spark code that can be executed on very small
single-node clusters to check if an upstream system or data product contains new data since the last execution of our
job. With this feature, we can trigger a job to run at more frequent intervals and, if the upstream does not contain new
data, the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction,
Transformation, and Loading).
## How do Sensor-based jobs work?
With the sensors capability, data products in the lakehouse can sense if another data product or an upstream system (source
system) has new data since the last successful job. We accomplish this through the approach illustrated above, which
can be interpreted as follows:
1. A Data Product can check if Kafka, JDBC or any other source supported by the Lakehouse Engine Sensors contains new data, using the respective sensors;
2. The Sensor task may run in a very tiny single-node cluster to ensure cost
efficiency ([check sensor cost efficiency](#are-sensor-based-jobs-cost-efficient));
3. If the sensor has recognised that there is new data in the upstream, then you can start a different ETL Job Cluster
to process all the ETL tasks (data processing tasks), as sketched after this list.
4. In the same way, a different Data Product can sense if an upstream Data Product has new data by using one of two options:
1. **(Preferred)** Sense the upstream Data Product sensor control delta table;
2. Sense the upstream Data Product data files in s3 (files sensor) or any of their delta tables (delta table
sensor);
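As an illustrative sketch of this flow (using the delta table source covered in the per-source pages; `run_etl` is a hypothetical placeholder for the data processing tasks, not part of the engine):
```python
from lakehouse_engine.engine import execute_sensor


def run_etl() -> None:
    """Hypothetical placeholder for the heavier ETL tasks of the job."""
    ...


# Sensor task: a cheap check for new upstream data, suitable for a tiny single-node cluster.
acon = {
    "sensor_id": "MY_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "delta",
        "db_table": "upstream_database.source_delta_table",
        "options": {"readChangeFeed": "true"},
    },
    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
    "fail_on_empty_result": False,  # return a boolean instead of raising NoNewDataException
}

# ETL tasks: only worth spinning up a bigger cluster when the sensor acquired new data.
if execute_sensor(acon=acon):
    run_etl()
```
In a real job, the sensor and the ETL would typically be separate tasks (and clusters), with the ETL task depending on the sensor task succeeding (the default `fail_on_empty_result=True` behaviour).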
## The Structure and Relevance of the Sensors Control Table
The concept of the lakehouse-engine sensor is based on a special delta table stored inside the data product that chooses to opt in for a sensor-based job. That table is used to control the status of the various sensors implemented by that data product. You can refer to the below table to understand the sensor delta table structure:
| Column Name | Type | Description |
|-----------------------------|---------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **sensor_id** | STRING | A unique identifier of the sensor in a specific job. This unique identifier is really important because it is used by the engine to identify if there is new data in the upstream. Each sensor in each job should have a different sensor_id. If you attempt to create 2 sensors with the same sensor_id, the engine will fail. |
| **assets** | ARRAY | A list of assets (e.g., tables or dataset folder) that are considered as available to consume downstream after the sensor has status *PROCESSED_NEW_DATA*. |
| **status** | STRING | Status of the sensor. Can either be: <br> *ACQUIRED_NEW_DATA* – when the sensor in a job has recognised that there is new data from the upstream, but the job where the sensor is located was still not successfully executed. <br> *PROCESSED_NEW_DATA* – when the job where the sensor is located has processed all the tasks in that job. |