Repository: adidas/lakehouse-engine
Branch: master
Commit: 1487dfdcafbf
Files: 1183
Total size: 3.2 MB
Directory structure:
gitextract_pl4w_c1i/
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ └── feature_request.md
│ └── pull_request_template.md
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE.txt
├── Makefile
├── README.md
├── assets/
│ └── gab/
│ ├── metadata/
│ │ ├── gab/
│ │ │ └── f_agg_dummy_sales_kpi/
│ │ │ ├── 1_article_category.sql
│ │ │ └── 2_f_agg_dummy_sales_kpi.sql
│ │ └── tables/
│ │ ├── dim_calendar.sql
│ │ ├── dummy_sales_kpi.sql
│ │ ├── gab_log_events.sql
│ │ ├── gab_use_case_results.sql
│ │ └── lkp_query_builder.sql
│ ├── notebooks/
│ │ ├── gab.py
│ │ ├── gab_dim_calendar.py
│ │ ├── gab_job_manager.py
│ │ └── query_builder_helper.py
│ └── utils/
│ ├── databricks_job_utils.py
│ └── query_builder_utils.py
├── cicd/
│ ├── .bumpversion.cfg
│ ├── Dockerfile
│ ├── Jenkinsfile
│ ├── Jenkinsfile_deploy
│ ├── bandit.yaml
│ ├── code_doc/
│ │ ├── content.css
│ │ ├── custom_example_macros.py
│ │ ├── examples.json
│ │ ├── gen_ref_nav.py
│ │ ├── index.html.jinja2
│ │ ├── mkdocs.yml
│ │ ├── mkdocs_macros.py
│ │ ├── module.html.jinja2
│ │ ├── render_doc.py
│ │ └── render_docs.py
│ ├── flake8.conf
│ ├── meta.yaml
│ ├── requirements.txt
│ ├── requirements_azure.txt
│ ├── requirements_cicd.txt
│ ├── requirements_dq.txt
│ ├── requirements_os.txt
│ ├── requirements_sftp.txt
│ └── requirements_sharepoint.txt
├── lakehouse_engine/
│ ├── __init__.py
│ ├── algorithms/
│ │ ├── __init__.py
│ │ ├── algorithm.py
│ │ ├── data_loader.py
│ │ ├── dq_validator.py
│ │ ├── exceptions.py
│ │ ├── gab.py
│ │ ├── reconciliator.py
│ │ ├── sensor.py
│ │ └── sensors/
│ │ ├── __init__.py
│ │ ├── heartbeat.py
│ │ └── sensor.py
│ ├── configs/
│ │ ├── __init__.py
│ │ └── engine.yaml
│ ├── core/
│ │ ├── __init__.py
│ │ ├── dbfs_file_manager.py
│ │ ├── definitions.py
│ │ ├── exec_env.py
│ │ ├── executable.py
│ │ ├── file_manager.py
│ │ ├── gab_manager.py
│ │ ├── gab_sql_generator.py
│ │ ├── s3_file_manager.py
│ │ ├── sensor_manager.py
│ │ └── table_manager.py
│ ├── dq_processors/
│ │ ├── __init__.py
│ │ ├── custom_expectations/
│ │ │ ├── __init__.py
│ │ │ ├── expect_column_pair_a_to_be_not_equal_to_b.py
│ │ │ ├── expect_column_pair_a_to_be_smaller_or_equal_than_b.py
│ │ │ ├── expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b.py
│ │ │ ├── expect_column_values_to_be_date_not_older_than.py
│ │ │ ├── expect_column_values_to_not_be_null_or_empty_string.py
│ │ │ ├── expect_multicolumn_column_a_must_equal_b_or_c.py
│ │ │ └── expect_queried_column_agg_value_to_be.py
│ │ ├── dq_factory.py
│ │ ├── exceptions.py
│ │ └── validator.py
│ ├── engine.py
│ ├── io/
│ │ ├── __init__.py
│ │ ├── exceptions.py
│ │ ├── reader.py
│ │ ├── reader_factory.py
│ │ ├── readers/
│ │ │ ├── __init__.py
│ │ │ ├── dataframe_reader.py
│ │ │ ├── file_reader.py
│ │ │ ├── jdbc_reader.py
│ │ │ ├── kafka_reader.py
│ │ │ ├── query_reader.py
│ │ │ ├── sap_b4_reader.py
│ │ │ ├── sap_bw_reader.py
│ │ │ ├── sftp_reader.py
│ │ │ ├── sharepoint_reader.py
│ │ │ └── table_reader.py
│ │ ├── writer.py
│ │ ├── writer_factory.py
│ │ └── writers/
│ │ ├── __init__.py
│ │ ├── console_writer.py
│ │ ├── dataframe_writer.py
│ │ ├── delta_merge_writer.py
│ │ ├── file_writer.py
│ │ ├── jdbc_writer.py
│ │ ├── kafka_writer.py
│ │ ├── rest_api_writer.py
│ │ ├── sharepoint_writer.py
│ │ └── table_writer.py
│ ├── terminators/
│ │ ├── __init__.py
│ │ ├── cdf_processor.py
│ │ ├── dataset_optimizer.py
│ │ ├── notifier.py
│ │ ├── notifier_factory.py
│ │ ├── notifiers/
│ │ │ ├── __init__.py
│ │ │ ├── email_notifier.py
│ │ │ ├── exceptions.py
│ │ │ └── notification_templates.py
│ │ ├── sensor_terminator.py
│ │ ├── spark_terminator.py
│ │ └── terminator_factory.py
│ ├── transformers/
│ │ ├── __init__.py
│ │ ├── aggregators.py
│ │ ├── column_creators.py
│ │ ├── column_reshapers.py
│ │ ├── condensers.py
│ │ ├── custom_transformers.py
│ │ ├── data_maskers.py
│ │ ├── date_transformers.py
│ │ ├── exceptions.py
│ │ ├── filters.py
│ │ ├── joiners.py
│ │ ├── null_handlers.py
│ │ ├── optimizers.py
│ │ ├── regex_transformers.py
│ │ ├── repartitioners.py
│ │ ├── transformer_factory.py
│ │ ├── unions.py
│ │ └── watermarker.py
│ └── utils/
│ ├── __init__.py
│ ├── acon_utils.py
│ ├── configs/
│ │ ├── __init__.py
│ │ └── config_utils.py
│ ├── databricks_utils.py
│ ├── dq_utils.py
│ ├── engine_usage_stats.py
│ ├── expectations_utils.py
│ ├── extraction/
│ │ ├── __init__.py
│ │ ├── jdbc_extraction_utils.py
│ │ ├── sap_b4_extraction_utils.py
│ │ ├── sap_bw_extraction_utils.py
│ │ └── sftp_extraction_utils.py
│ ├── file_utils.py
│ ├── gab_utils.py
│ ├── logging_handler.py
│ ├── rest_api.py
│ ├── schema_utils.py
│ ├── sharepoint_utils.py
│ ├── spark_utils.py
│ ├── sql_parser_utils.py
│ └── storage/
│ ├── __init__.py
│ ├── dbfs_storage.py
│ ├── file_storage.py
│ ├── file_storage_functions.py
│ ├── local_fs_storage.py
│ └── s3_storage.py
├── lakehouse_engine_usage/
│ ├── __init__.py
│ ├── data_loader/
│ │ ├── __init__.py
│ │ ├── append_load_from_jdbc_with_permissive_mode/
│ │ │ ├── __init__.py
│ │ │ └── append_load_from_jdbc_with_permissive_mode.md
│ │ ├── append_load_with_failfast/
│ │ │ ├── __init__.py
│ │ │ └── append_load_with_failfast.md
│ │ ├── batch_delta_load_init_delta_backfill_with_merge/
│ │ │ ├── __init__.py
│ │ │ └── batch_delta_load_init_delta_backfill_with_merge.md
│ │ ├── custom_transformer/
│ │ │ ├── __init__.py
│ │ │ ├── custom_transformer.md
│ │ │ └── sql_custom_transformer.md
│ │ ├── custom_transformer_sql/
│ │ │ ├── __init__.py
│ │ │ └── custom_transformer_sql.md
│ │ ├── data_loader.md
│ │ ├── extract_from_sap_b4_adso/
│ │ │ ├── __init__.py
│ │ │ └── extract_from_sap_b4_adso.md
│ │ ├── extract_from_sap_bw_dso/
│ │ │ ├── __init__.py
│ │ │ └── extract_from_sap_bw_dso.md
│ │ ├── extract_from_sftp/
│ │ │ ├── __init__.py
│ │ │ └── extract_from_sftp.md
│ │ ├── extract_using_jdbc_connection/
│ │ │ ├── __init__.py
│ │ │ └── extract_using_jdbc_connection.md
│ │ ├── filtered_full_load/
│ │ │ ├── __init__.py
│ │ │ └── filtered_full_load.md
│ │ ├── filtered_full_load_with_selective_replace/
│ │ │ ├── __init__.py
│ │ │ └── filtered_full_load_with_selective_replace.md
│ │ ├── flatten_schema_and_explode_columns/
│ │ │ ├── __init__.py
│ │ │ └── flatten_schema_and_explode_columns.md
│ │ ├── full_load/
│ │ │ ├── __init__.py
│ │ │ └── full_load.md
│ │ ├── read_from_dataframe/
│ │ │ ├── __init__.py
│ │ │ └── read_from_dataframe.md
│ │ ├── read_from_sharepoint/
│ │ │ ├── __init__.py
│ │ │ └── read_from_sharepoint.md
│ │ ├── streaming_append_load_with_malformed/
│ │ │ ├── __init__.py
│ │ │ └── streaming_append_load_with_malformed.md
│ │ ├── streaming_append_load_with_terminator/
│ │ │ ├── __init__.py
│ │ │ └── streaming_append_load_with_terminator.md
│ │ ├── streaming_delta_load_with_group_and_rank_condensation/
│ │ │ ├── __init__.py
│ │ │ └── streaming_delta_load_with_group_and_rank_condensation.md
│ │ ├── streaming_delta_with_late_arriving_and_out_of_order_events/
│ │ │ ├── __init__.py
│ │ │ └── streaming_delta_with_late_arriving_and_out_of_order_events.md
│ │ ├── write_and_read_dataframe/
│ │ │ ├── __init__.py
│ │ │ └── write_and_read_dataframe.md
│ │ ├── write_to_console/
│ │ │ ├── __init__.py
│ │ │ └── write_to_console.md
│ │ ├── write_to_rest_api/
│ │ │ ├── __init__.py
│ │ │ └── write_to_rest_api.md
│ │ └── write_to_sharepoint/
│ │ ├── __init__.py
│ │ └── write_to_sharepoint.md
│ ├── data_quality/
│ │ ├── __init__.py
│ │ ├── custom_expectations/
│ │ │ ├── __init__.py
│ │ │ └── custom_expectations.md
│ │ ├── data_quality.md
│ │ ├── data_quality_validator/
│ │ │ ├── __init__.py
│ │ │ └── data_quality_validator.md
│ │ ├── minimal_example/
│ │ │ ├── __init__.py
│ │ │ └── minimal_example.md
│ │ ├── prisma/
│ │ │ ├── __init__.py
│ │ │ └── prisma.md
│ │ ├── result_sink/
│ │ │ ├── __init__.py
│ │ │ └── result_sink.md
│ │ ├── row_tagging/
│ │ │ ├── __init__.py
│ │ │ └── row_tagging.md
│ │ └── validations_failing/
│ │ ├── __init__.py
│ │ └── validations_failing.md
│ ├── gab/
│ │ ├── __init__.py
│ │ ├── gab.md
│ │ └── step_by_step/
│ │ ├── __init__.py
│ │ └── step_by_step.md
│ ├── lakehouse_engine_usage.md
│ ├── managerhelper/
│ │ ├── managerhelper.md
│ │ ├── operations-script.js
│ │ ├── operations-styles-mkdocs.css
│ │ └── styles-mkdocs.css
│ ├── reconciliator/
│ │ ├── __init__.py
│ │ └── reconciliator.md
│ ├── sensor/
│ │ ├── __init__.py
│ │ ├── delta_table/
│ │ │ ├── __init__.py
│ │ │ └── delta_table.md
│ │ ├── delta_upstream_sensor_table/
│ │ │ ├── __init__.py
│ │ │ └── delta_upstream_sensor_table.md
│ │ ├── file/
│ │ │ ├── __init__.py
│ │ │ └── file.md
│ │ ├── jdbc_table/
│ │ │ ├── __init__.py
│ │ │ └── jdbc_table.md
│ │ ├── kafka/
│ │ │ ├── __init__.py
│ │ │ └── kafka.md
│ │ ├── sap_bw_b4/
│ │ │ ├── __init__.py
│ │ │ └── sap_bw_b4.md
│ │ ├── sensor.md
│ │ └── update_sensor_status/
│ │ ├── __init__.py
│ │ └── update_sensor_status.md
│ └── sensors/
│ ├── __init__.py
│ ├── heartbeat/
│ │ ├── __init__.py
│ │ ├── delta_table/
│ │ │ ├── __init__.py
│ │ │ └── delta_table.md
│ │ ├── heartbeat.md
│ │ ├── heartbeat_sensor_data_feed/
│ │ │ ├── __init__.py
│ │ │ └── heartbeat_sensor_data_feed.md
│ │ ├── kafka/
│ │ │ ├── __init__.py
│ │ │ └── kafka.md
│ │ ├── manual_table/
│ │ │ ├── __init__.py
│ │ │ └── manual_table.md
│ │ ├── sap_bw_b4/
│ │ │ ├── __init__.py
│ │ │ └── sap_bw_b4.md
│ │ ├── trigger_file/
│ │ │ ├── __init__.py
│ │ │ └── trigger_file.md
│ │ └── update_heartbeat_sensor_status/
│ │ ├── __init__.py
│ │ └── update_heartbeat_sensor_status.md
│ ├── sensor/
│ │ ├── __init__.py
│ │ ├── delta_table/
│ │ │ ├── __init__.py
│ │ │ └── delta_table.md
│ │ ├── delta_upstream_sensor_table/
│ │ │ ├── __init__.py
│ │ │ └── delta_upstream_sensor_table.md
│ │ ├── file/
│ │ │ ├── __init__.py
│ │ │ └── file.md
│ │ ├── jdbc_table/
│ │ │ ├── __init__.py
│ │ │ └── jdbc_table.md
│ │ ├── kafka/
│ │ │ ├── __init__.py
│ │ │ └── kafka.md
│ │ ├── sap_bw_b4/
│ │ │ ├── __init__.py
│ │ │ └── sap_bw_b4.md
│ │ ├── sensor.md
│ │ └── update_sensor_status/
│ │ ├── __init__.py
│ │ └── update_sensor_status.md
│ └── sensors.md
├── pyproject.toml
├── samples/
│ ├── cricket_dq_tutorial.py
│ └── tpch_load_and_analysis_tutorial.py
└── tests/
├── __init__.py
├── configs/
│ ├── __init__.py
│ └── engine.yaml
├── conftest.py
├── feature/
│ ├── __init__.py
│ ├── custom_expectations/
│ │ ├── __init__.py
│ │ ├── test_custom_expectations.py
│ │ └── test_expectation_validity.py
│ ├── data_loader_custom_transformer/
│ │ ├── __init__.py
│ │ ├── test_data_loader_custom_transformer_calculate_kpi.py
│ │ ├── test_data_loader_custom_transformer_delta_load.py
│ │ └── test_data_loader_custom_transformer_sql_transformation.py
│ ├── delta_load/
│ │ ├── __init__.py
│ │ ├── test_delta_load_group_and_rank.py
│ │ ├── test_delta_load_merge_options.py
│ │ └── test_delta_load_record_mode_cdc.py
│ ├── test_append_load.py
│ ├── test_data_quality.py
│ ├── test_dq_validator.py
│ ├── test_engine_usage_stats.py
│ ├── test_extract_from_sap_b4.py
│ ├── test_extract_from_sap_bw.py
│ ├── test_file_manager.py
│ ├── test_file_manager_dbfs.py
│ ├── test_file_manager_s3.py
│ ├── test_full_load.py
│ ├── test_gab.py
│ ├── test_heartbeat.py
│ ├── test_jdbc_reader.py
│ ├── test_materialize_cdf.py
│ ├── test_notification.py
│ ├── test_reconciliation.py
│ ├── test_schema_evolution.py
│ ├── test_sensors.py
│ ├── test_sftp_reader.py
│ ├── test_sharepoint_reader.py
│ ├── test_sharepoint_writer.py
│ ├── test_table_manager.py
│ ├── test_writers.py
│ └── transformations/
│ ├── __init__.py
│ ├── test_chain_transformations.py
│ ├── test_column_creators.py
│ ├── test_column_reshapers.py
│ ├── test_data_maskers.py
│ ├── test_date_transformers.py
│ ├── test_drop_duplicate_rows.py
│ ├── test_joiners.py
│ ├── test_multiple_transformations.py
│ ├── test_null_handlers.py
│ ├── test_optimizers.py
│ ├── test_regex_transformers.py
│ ├── test_unions.py
│ └── test_watermarker.py
├── resources/
│ ├── feature/
│ │ ├── append_load/
│ │ │ ├── failfast/
│ │ │ │ ├── batch.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ ├── part-02.csv
│ │ │ │ └── part-03.csv
│ │ │ ├── jdbc_permissive/
│ │ │ │ ├── batch.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ ├── part-02.csv
│ │ │ │ └── part-03.csv
│ │ │ ├── streaming_dropmalformed/
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ ├── part-02.csv
│ │ │ │ │ └── part-03.csv
│ │ │ │ └── streaming.json
│ │ │ └── streaming_with_terminators/
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ └── part-01.csv
│ │ │ └── streaming.json
│ │ ├── custom_expectations/
│ │ │ ├── expect_column_pair_a_to_be_not_equal_to_b/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── dq_control_success.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── dq_sales_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── expect_column_pair_a_to_be_smaller_or_equal_than_b/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── dq_control_success.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── dq_sales_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── dq_control_success.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── dq_sales_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── expect_column_values_to_be_date_not_older_than/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── dq_control_success.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── dq_sales_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── expect_column_values_to_not_be_null_or_empty_string/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── dq_control_success.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── dq_sales_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── expect_multicolumn_column_a_must_equal_b_or_c/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── dq_control_success.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── dq_sales_schema.json
│ │ │ │ └── streaming.json
│ │ │ └── expect_queried_column_agg_value_to_be/
│ │ │ ├── batch.json
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── dq_control_success.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ └── part-02.csv
│ │ │ ├── dq_sales_schema.json
│ │ │ └── streaming.json
│ │ ├── data_loader_custom_transformer/
│ │ │ ├── calculate_kpi/
│ │ │ │ ├── control_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ └── source/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source_schema.json
│ │ │ ├── delta_load/
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ ├── part-02.csv
│ │ │ │ ├── part-03.csv
│ │ │ │ └── part-04.csv
│ │ │ └── sql_transformation/
│ │ │ ├── control_schema.json
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ └── part-01.csv
│ │ │ └── source_schema.json
│ │ ├── data_quality/
│ │ │ ├── build_data_docs/
│ │ │ │ ├── with_data_docs_local_fs/
│ │ │ │ │ └── 20240410-080323-dq_success-sales_orders-checkpoint/
│ │ │ │ │ └── 20240410T080323.289170Z/
│ │ │ │ │ └── 7ba399ea28cc40bf8c79213a440aeb91.json
│ │ │ │ └── without_data_docs_local_fs/
│ │ │ │ └── 20240409-143548-dq_validator-sales_source-checkpoint/
│ │ │ │ └── 20240409T143548.454043Z/
│ │ │ │ └── f0d7bd293d22bcfd3c1fec5a7d566638.json
│ │ │ ├── load_with_dq_table/
│ │ │ │ ├── delta_with_dupl_tag_gen_fail/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── control/
│ │ │ │ │ │ │ ├── data_validator.json
│ │ │ │ │ │ │ ├── data_validator_schema.json
│ │ │ │ │ │ │ ├── sales.json
│ │ │ │ │ │ │ └── sales_schema.json
│ │ │ │ │ │ ├── dq_functions/
│ │ │ │ │ │ │ ├── test_db.dq_functions_source_load_with_dq_table_delta_with_dupl_tag_gen_fail_init.csv
│ │ │ │ │ │ │ └── test_db.dq_functions_source_load_with_dq_table_delta_with_dupl_tag_gen_fail_new.csv
│ │ │ │ │ │ └── source/
│ │ │ │ │ │ ├── part-01.csv
│ │ │ │ │ │ ├── part-02.csv
│ │ │ │ │ │ ├── part-03.csv
│ │ │ │ │ │ └── part-04.csv
│ │ │ │ │ ├── streaming_init.json
│ │ │ │ │ └── streaming_new.json
│ │ │ │ ├── delta_with_duplicates_tag/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── control/
│ │ │ │ │ │ │ ├── data_validator.json
│ │ │ │ │ │ │ ├── data_validator_schema.json
│ │ │ │ │ │ │ ├── sales.json
│ │ │ │ │ │ │ └── sales_schema.json
│ │ │ │ │ │ ├── dq_functions/
│ │ │ │ │ │ │ ├── test_db.dq_functions_source_load_with_dq_table_delta_with_duplicates_tag_init.csv
│ │ │ │ │ │ │ └── test_db.dq_functions_source_load_with_dq_table_delta_with_duplicates_tag_new.csv
│ │ │ │ │ │ └── source/
│ │ │ │ │ │ ├── part-01.csv
│ │ │ │ │ │ ├── part-02.csv
│ │ │ │ │ │ ├── part-03.csv
│ │ │ │ │ │ └── part-04.csv
│ │ │ │ │ ├── streaming_init.json
│ │ │ │ │ └── streaming_new.json
│ │ │ │ └── full_overwrite_tag/
│ │ │ │ ├── batch_init.json
│ │ │ │ ├── batch_new.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ ├── data_validator.json
│ │ │ │ │ ├── data_validator_schema.json
│ │ │ │ │ ├── sales.json
│ │ │ │ │ └── sales_schema.json
│ │ │ │ ├── dq_functions/
│ │ │ │ │ ├── test_db.dq_functions_source_load_with_dq_table_full_overwrite_tag_init.csv
│ │ │ │ │ └── test_db.dq_functions_source_load_with_dq_table_full_overwrite_tag_new.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ └── part-02.csv
│ │ │ ├── load_with_dq_validator/
│ │ │ │ ├── delta_with_dupl_tag_gen_fail/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── control/
│ │ │ │ │ │ │ ├── data_validator.json
│ │ │ │ │ │ │ ├── data_validator_schema.json
│ │ │ │ │ │ │ ├── sales.json
│ │ │ │ │ │ │ └── sales_schema.json
│ │ │ │ │ │ └── source/
│ │ │ │ │ │ ├── part-01.csv
│ │ │ │ │ │ ├── part-02.csv
│ │ │ │ │ │ ├── part-03.csv
│ │ │ │ │ │ └── part-04.csv
│ │ │ │ │ ├── streaming_init.json
│ │ │ │ │ └── streaming_new.json
│ │ │ │ ├── delta_with_duplicates/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── control/
│ │ │ │ │ │ │ ├── data_validator.json
│ │ │ │ │ │ │ └── data_validator_schema.json
│ │ │ │ │ │ └── source/
│ │ │ │ │ │ ├── part-01.csv
│ │ │ │ │ │ ├── part-02.csv
│ │ │ │ │ │ ├── part-03.csv
│ │ │ │ │ │ └── part-04.csv
│ │ │ │ │ ├── streaming_init.json
│ │ │ │ │ └── streaming_new.json
│ │ │ │ ├── delta_with_duplicates_tag/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── control/
│ │ │ │ │ │ │ ├── data_validator.json
│ │ │ │ │ │ │ ├── data_validator_schema.json
│ │ │ │ │ │ │ ├── sales.json
│ │ │ │ │ │ │ └── sales_schema.json
│ │ │ │ │ │ └── source/
│ │ │ │ │ │ ├── part-01.csv
│ │ │ │ │ │ ├── part-02.csv
│ │ │ │ │ │ ├── part-03.csv
│ │ │ │ │ │ └── part-04.csv
│ │ │ │ │ ├── streaming_init.json
│ │ │ │ │ └── streaming_new.json
│ │ │ │ ├── full_overwrite/
│ │ │ │ │ ├── batch_init.json
│ │ │ │ │ ├── batch_new.json
│ │ │ │ │ └── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── data_validator.json
│ │ │ │ │ │ └── data_validator_schema.json
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── full_overwrite_tag/
│ │ │ │ │ ├── batch_init.json
│ │ │ │ │ ├── batch_new.json
│ │ │ │ │ └── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── data_validator.json
│ │ │ │ │ │ ├── data_validator_schema.json
│ │ │ │ │ │ ├── sales.json
│ │ │ │ │ │ └── sales_schema.json
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ └── no_transformers/
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── data_validator.json
│ │ │ │ │ │ └── data_validator_schema.json
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ ├── part-02.csv
│ │ │ │ │ ├── part-03.csv
│ │ │ │ │ └── part-04.csv
│ │ │ │ ├── streaming_init.json
│ │ │ │ └── streaming_new.json
│ │ │ └── validator/
│ │ │ └── data/
│ │ │ ├── control/
│ │ │ │ └── data_validator.csv
│ │ │ ├── dq_functions/
│ │ │ │ ├── test_db.dq_functions_source_dq_failure.csv
│ │ │ │ ├── test_db.dq_functions_source_dq_failure_error_disabled.csv
│ │ │ │ ├── test_db.dq_functions_source_dq_failure_max_percentage.csv
│ │ │ │ └── test_db.dq_functions_source_dq_success.csv
│ │ │ └── source/
│ │ │ └── part-01.csv
│ │ ├── delta_load/
│ │ │ ├── group_and_rank/
│ │ │ │ ├── fail_with_duplicates_in_same_file/
│ │ │ │ │ ├── batch_delta.json
│ │ │ │ │ ├── batch_init.json
│ │ │ │ │ ├── control_batch_schema.json
│ │ │ │ │ ├── control_streaming_schema.json
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── control/
│ │ │ │ │ │ │ ├── batch.csv
│ │ │ │ │ │ │ └── streaming.csv
│ │ │ │ │ │ └── source/
│ │ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv
│ │ │ │ │ │ ├── WE_SO_SCL_202108111500000000.csv
│ │ │ │ │ │ └── WE_SO_SCL_202108111600000000.csv
│ │ │ │ │ ├── source_schema.json
│ │ │ │ │ └── streaming_delta.json
│ │ │ │ └── with_duplicates_in_same_file/
│ │ │ │ ├── batch_delta.json
│ │ │ │ ├── batch_init.json
│ │ │ │ ├── control_batch_schema.json
│ │ │ │ ├── control_streaming_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── batch.csv
│ │ │ │ │ │ └── streaming.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv
│ │ │ │ │ ├── WE_SO_SCL_202108111500000000.csv
│ │ │ │ │ └── WE_SO_SCL_202108111600000000.csv
│ │ │ │ ├── source_schema.json
│ │ │ │ └── streaming_delta.json
│ │ │ ├── merge_options/
│ │ │ │ ├── control_batch_schema.json
│ │ │ │ ├── insert_column_set/
│ │ │ │ │ ├── batch_delta.json
│ │ │ │ │ ├── batch_init.json
│ │ │ │ │ └── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── batch.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv
│ │ │ │ │ └── WE_SO_SCL_202108111500000000.csv
│ │ │ │ ├── source_schema.json
│ │ │ │ ├── update_all/
│ │ │ │ │ ├── batch_delta.json
│ │ │ │ │ ├── batch_init.json
│ │ │ │ │ └── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── batch.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── WE_SO_SCL_202108111400000000.csv
│ │ │ │ │ └── WE_SO_SCL_202108111500000000.csv
│ │ │ │ └── update_column_set/
│ │ │ │ ├── batch_delta.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── batch.csv
│ │ │ │ └── source/
│ │ │ │ ├── WE_SO_SCL_202108111400000000.csv
│ │ │ │ └── WE_SO_SCL_202108111500000000.csv
│ │ │ └── record_mode_cdc/
│ │ │ ├── backfill/
│ │ │ │ ├── batch_backfill.json
│ │ │ │ ├── batch_delta.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ ├── part-02.csv
│ │ │ │ ├── part-03.csv
│ │ │ │ ├── part-04.csv
│ │ │ │ └── part-05.csv
│ │ │ ├── direct_silver_load/
│ │ │ │ ├── batch_delta.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ ├── part-02.csv
│ │ │ │ ├── part-03.csv
│ │ │ │ └── part-04.csv
│ │ │ ├── late_arriving_changes/
│ │ │ │ ├── batch_delta.json
│ │ │ │ ├── batch_init.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ ├── part-02.csv
│ │ │ │ │ ├── part-03.csv
│ │ │ │ │ └── part-04.csv
│ │ │ │ └── streaming_delta.json
│ │ │ ├── out_of_order_changes/
│ │ │ │ ├── batch_delta.json
│ │ │ │ ├── batch_init.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ ├── part-02.csv
│ │ │ │ │ ├── part-03.csv
│ │ │ │ │ └── part-04.csv
│ │ │ │ └── streaming_delta.json
│ │ │ ├── with_deletes_additional_columns/
│ │ │ │ ├── batch_delta.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ ├── part-02.csv
│ │ │ │ ├── part-03.csv
│ │ │ │ └── part-04.csv
│ │ │ ├── with_duplicates/
│ │ │ │ ├── batch_delta.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ ├── part-02.csv
│ │ │ │ ├── part-03.csv
│ │ │ │ └── part-04.csv
│ │ │ └── with_upserts_only_removed_columns/
│ │ │ ├── batch_delta.json
│ │ │ ├── batch_init.json
│ │ │ └── data/
│ │ │ ├── control/
│ │ │ │ └── part-01.csv
│ │ │ └── source/
│ │ │ ├── part-01.json
│ │ │ ├── part-02.json
│ │ │ ├── part-03.json
│ │ │ └── part-04.json
│ │ ├── dq_validator/
│ │ │ ├── batch.json
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ ├── data_restore_control.csv
│ │ │ │ │ ├── dq_control_failure.csv
│ │ │ │ │ ├── dq_control_failure_disabled.csv
│ │ │ │ │ ├── dq_control_success.csv
│ │ │ │ │ ├── dq_control_success_explode.csv
│ │ │ │ │ └── dq_control_success_explode_disabled.csv
│ │ │ │ ├── dq_functions/
│ │ │ │ │ ├── test_db.dq_functions_source_table_failure.csv
│ │ │ │ │ └── test_db.dq_functions_source_table_success.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ └── part-02.csv
│ │ │ ├── dq_sales_schema.json
│ │ │ ├── streaming.json
│ │ │ ├── streaming_dataframe_two_runs/
│ │ │ │ └── data/
│ │ │ │ └── dq_functions/
│ │ │ │ ├── test_db.dq_functions_streaming_dataframe_two_runs_first_run.csv
│ │ │ │ └── test_db.dq_functions_streaming_dataframe_two_runs_second_run.csv
│ │ │ ├── table_batch_dataframe_failure_disabled/
│ │ │ │ └── data/
│ │ │ │ └── dq_functions/
│ │ │ │ ├── test_db.dq_functions_source_table_failure.csv
│ │ │ │ └── test_db.dq_functions_source_table_success.csv
│ │ │ ├── table_batch_dataframe_success/
│ │ │ │ └── data/
│ │ │ │ └── dq_functions/
│ │ │ │ ├── test_db.dq_functions_source_table_failure.csv
│ │ │ │ └── test_db.dq_functions_source_table_success.csv
│ │ │ ├── table_batch_dq_rule/
│ │ │ │ └── data/
│ │ │ │ └── dq_functions/
│ │ │ │ ├── test_db.dq_table_rule_id_failure.csv
│ │ │ │ └── test_db.dq_table_rule_id_success.csv
│ │ │ ├── table_batch_failure_disabled/
│ │ │ │ └── data/
│ │ │ │ └── dq_functions/
│ │ │ │ ├── test_db.dq_functions_source_table_failure.csv
│ │ │ │ └── test_db.dq_functions_source_table_success.csv
│ │ │ ├── table_batch_success/
│ │ │ │ └── data/
│ │ │ │ └── dq_functions/
│ │ │ │ ├── test_db.dq_functions_source_table_failure.csv
│ │ │ │ └── test_db.dq_functions_source_table_success.csv
│ │ │ ├── table_streaming_dq_rule/
│ │ │ │ └── data/
│ │ │ │ └── dq_functions/
│ │ │ │ ├── test_db.dq_table_rule_id_failure.csv
│ │ │ │ └── test_db.dq_table_rule_id_success.csv
│ │ │ ├── table_streaming_failure_disabled/
│ │ │ │ └── data/
│ │ │ │ └── dq_functions/
│ │ │ │ ├── test_db.dq_functions_source_table_failure.csv
│ │ │ │ └── test_db.dq_functions_source_table_success.csv
│ │ │ └── table_streaming_success/
│ │ │ └── data/
│ │ │ └── dq_functions/
│ │ │ ├── test_db.dq_functions_source_table_failure.csv
│ │ │ └── test_db.dq_functions_source_table_success.csv
│ │ ├── engine_usage_stats/
│ │ │ ├── dq_validator/
│ │ │ │ └── data/
│ │ │ │ ├── control.json
│ │ │ │ └── source.csv
│ │ │ ├── load_custom_transf_and_df/
│ │ │ │ └── data/
│ │ │ │ ├── control.json
│ │ │ │ └── source.csv
│ │ │ ├── load_simple_acon/
│ │ │ │ └── data/
│ │ │ │ ├── control.json
│ │ │ │ └── source.csv
│ │ │ └── table_manager/
│ │ │ └── data/
│ │ │ └── control.json
│ │ ├── extract_from_sap_b4/
│ │ │ ├── extract_aq_dso/
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── dummy_table.csv
│ │ │ │ │ │ ├── dummy_table_join_condition.csv
│ │ │ │ │ │ └── dummy_table_schema.json
│ │ │ │ │ └── source/
│ │ │ │ │ ├── dummy_table.csv
│ │ │ │ │ ├── dummy_table_1.csv
│ │ │ │ │ ├── dummy_table_2.csv
│ │ │ │ │ └── rspmrequest.csv
│ │ │ │ ├── dummy_table_schema.json
│ │ │ │ └── rspmrequest_schema.json
│ │ │ └── extract_cl_dso/
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ ├── dummy_table.csv
│ │ │ │ │ ├── dummy_table_join_condition.csv
│ │ │ │ │ └── dummy_table_schema.json
│ │ │ │ └── source/
│ │ │ │ ├── dummy_table.csv
│ │ │ │ ├── dummy_table_cl_1.csv
│ │ │ │ ├── dummy_table_cl_2.csv
│ │ │ │ └── rspmrequest.csv
│ │ │ ├── dummy_table_cl_schema.json
│ │ │ ├── dummy_table_schema.json
│ │ │ └── rspmrequest_schema.json
│ │ ├── extract_from_sap_bw/
│ │ │ ├── derive_changelog_table_name/
│ │ │ │ ├── RSBASIDOC_schema.json
│ │ │ │ ├── RSTSODS_schema.json
│ │ │ │ └── data/
│ │ │ │ └── source/
│ │ │ │ ├── RSBASIDOC.csv
│ │ │ │ └── RSTSODS.csv
│ │ │ ├── extract_dso/
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── dummy_table.csv
│ │ │ │ │ │ ├── dummy_table_join_condition.csv
│ │ │ │ │ │ └── dummy_table_schema.json
│ │ │ │ │ └── source/
│ │ │ │ │ ├── dummy_table.csv
│ │ │ │ │ ├── dummy_table_cl_1.csv
│ │ │ │ │ ├── dummy_table_cl_2.csv
│ │ │ │ │ └── rsodsactreq.csv
│ │ │ │ ├── dummy_table_cl_schema.json
│ │ │ │ ├── dummy_table_schema.json
│ │ │ │ └── rsodsactreq_schema.json
│ │ │ └── extract_write_optimised_dso/
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ ├── dummy_table.csv
│ │ │ │ │ ├── dummy_table_actreq_timestamp.csv
│ │ │ │ │ ├── dummy_table_join_condition.csv
│ │ │ │ │ └── dummy_table_schema.json
│ │ │ │ └── source/
│ │ │ │ ├── dummy_table.csv
│ │ │ │ ├── dummy_table_1.csv
│ │ │ │ ├── dummy_table_2.csv
│ │ │ │ └── rsodsactreq.csv
│ │ │ ├── dummy_table_schema.json
│ │ │ └── rsodsactreq_schema.json
│ │ ├── file_manager/
│ │ │ ├── check_restore_status/
│ │ │ │ ├── acon_check_restore_status_directory.json
│ │ │ │ └── acon_check_restore_status_single_object.json
│ │ │ ├── copy_object/
│ │ │ │ ├── acon_copy_directory.json
│ │ │ │ ├── acon_copy_directory_dry_run.json
│ │ │ │ ├── acon_copy_single_object.json
│ │ │ │ └── acon_copy_single_object_dry_run.json
│ │ │ ├── delete_objects/
│ │ │ │ ├── acon_delete_objects.json
│ │ │ │ └── acon_delete_objects_dry_run.json
│ │ │ ├── request_restore/
│ │ │ │ ├── acon_request_restore_directory.json
│ │ │ │ └── acon_request_restore_single_object.json
│ │ │ └── request_restore_to_destination_and_wait/
│ │ │ ├── acon_request_restore_to_destination_and_wait_directory.json
│ │ │ ├── acon_request_restore_to_destination_and_wait_single_object.json
│ │ │ └── acon_request_restore_to_destination_and_wait_single_object_raise_error.json
│ │ ├── file_manager_dbfs/
│ │ │ ├── copy_objects/
│ │ │ │ ├── acon_copy_directory.json
│ │ │ │ ├── acon_copy_directory_dry_run.json
│ │ │ │ └── acon_copy_single_object.json
│ │ │ ├── delete_objects/
│ │ │ │ ├── acon_delete_objects.json
│ │ │ │ └── acon_delete_objects_dry_run.json
│ │ │ └── move_objects/
│ │ │ ├── acon_move_objects.json
│ │ │ └── acon_move_objects_dry_run.json
│ │ ├── file_manager_s3/
│ │ │ ├── check_restore_status/
│ │ │ │ ├── acon_check_restore_status_directory.json
│ │ │ │ └── acon_check_restore_status_single_object.json
│ │ │ ├── copy_objects/
│ │ │ │ ├── acon_copy_directory.json
│ │ │ │ ├── acon_copy_directory_dry_run.json
│ │ │ │ ├── acon_copy_single_object.json
│ │ │ │ └── acon_copy_single_object_dry_run.json
│ │ │ ├── delete_objects/
│ │ │ │ ├── acon_delete_objects.json
│ │ │ │ └── acon_delete_objects_dry_run.json
│ │ │ ├── request_restore/
│ │ │ │ ├── acon_request_restore_directory.json
│ │ │ │ └── acon_request_restore_single_object.json
│ │ │ └── request_restore_to_destination_and_wait/
│ │ │ ├── acon_request_restore_to_destination_and_wait_directory.json
│ │ │ ├── acon_request_restore_to_destination_and_wait_single_object.json
│ │ │ └── acon_request_restore_to_destination_and_wait_single_object_raise_error.json
│ │ ├── full_load/
│ │ │ ├── full_overwrite/
│ │ │ │ ├── batch.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ └── part-02.csv
│ │ │ ├── with_filter/
│ │ │ │ ├── batch.json
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ └── part-02.csv
│ │ │ └── with_filter_partition_overwrite/
│ │ │ ├── batch.json
│ │ │ ├── batch_init.json
│ │ │ └── data/
│ │ │ ├── control/
│ │ │ │ └── part-01.csv
│ │ │ └── source/
│ │ │ ├── part-01.csv
│ │ │ └── part-02.csv
│ │ ├── gab/
│ │ │ ├── control/
│ │ │ │ ├── data/
│ │ │ │ │ ├── vw_dummy_sales_kpi.csv
│ │ │ │ │ ├── vw_nam_orders_all_snapshot.csv
│ │ │ │ │ ├── vw_nam_orders_filtered_snapshot.csv
│ │ │ │ │ ├── vw_negative_offset_orders_all.csv
│ │ │ │ │ ├── vw_negative_offset_orders_filtered.csv
│ │ │ │ │ ├── vw_orders_all.csv
│ │ │ │ │ ├── vw_orders_all_snapshot.csv
│ │ │ │ │ ├── vw_orders_filtered.csv
│ │ │ │ │ └── vw_orders_filtered_snapshot.csv
│ │ │ │ └── schema/
│ │ │ │ ├── vw_dummy_sales_kpi.json
│ │ │ │ └── vw_orders.json
│ │ │ ├── setup/
│ │ │ │ ├── column_list/
│ │ │ │ │ ├── calendar.json
│ │ │ │ │ ├── dummy_sales_kpi.json
│ │ │ │ │ ├── gab_log_events.json
│ │ │ │ │ ├── gab_use_case_results.json
│ │ │ │ │ ├── lkp_query_builder.json
│ │ │ │ │ └── order_events.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── dummy_sales_kpi.csv
│ │ │ │ │ ├── lkp_query_builder.csv
│ │ │ │ │ └── order_events.csv
│ │ │ │ └── schema/
│ │ │ │ ├── dummy_sales_kpi.json
│ │ │ │ ├── lkp_query_builder.json
│ │ │ │ └── order_events.json
│ │ │ └── usecases/
│ │ │ ├── dummy_sales_kpi/
│ │ │ │ ├── 1_article_category.sql
│ │ │ │ ├── 2_dummy_sales_kpi.sql
│ │ │ │ └── scenario/
│ │ │ │ └── dummy_sales_kpi.json
│ │ │ └── order_events/
│ │ │ ├── 1_order_events.sql
│ │ │ └── scenario/
│ │ │ ├── order_events.json
│ │ │ ├── order_events_nam.json
│ │ │ ├── order_events_negative_timezone_offset.json
│ │ │ ├── order_events_snapshot.json
│ │ │ ├── skip_use_case_by_empty_reconciliation.json
│ │ │ ├── skip_use_case_by_empty_requested_cadence.json
│ │ │ ├── skip_use_case_by_not_configured_cadence.json
│ │ │ └── skip_use_case_by_unexisting_cadence.json
│ │ ├── heartbeat/
│ │ │ ├── control/
│ │ │ │ ├── default/
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── ctr_heart_tbl_heartb_feed.csv
│ │ │ │ │ │ ├── ctrl_heart_tbl_exec_sensor.csv
│ │ │ │ │ │ ├── ctrl_heart_tbl_trigger_job.csv
│ │ │ │ │ │ ├── ctrl_heart_tbl_updated.csv
│ │ │ │ │ │ └── ctrl_sensor_tbl_upd_status.json
│ │ │ │ │ └── schema/
│ │ │ │ │ ├── ctrl_heart_tbl_schema.json
│ │ │ │ │ └── ctrl_heart_tbl_trig_schema.json
│ │ │ │ └── heartbeat_paused_sensor_new_record/
│ │ │ │ ├── data/
│ │ │ │ │ ├── ctr_heart_tbl_heartb_feed.csv
│ │ │ │ │ ├── ctrl_heart_tbl_exec_sensor.csv
│ │ │ │ │ ├── ctrl_heart_tbl_trigger_job.csv
│ │ │ │ │ ├── ctrl_heart_tbl_updated.csv
│ │ │ │ │ └── ctrl_sensor_tbl_upd_status.json
│ │ │ │ └── schema/
│ │ │ │ └── ctrl_heart_tbl_schema.json
│ │ │ └── setup/
│ │ │ ├── default/
│ │ │ │ ├── column_list/
│ │ │ │ │ ├── heartbeat_sensor_control_table.json
│ │ │ │ │ └── sensor_table.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── setup_heartbeat_data.csv
│ │ │ │ │ └── setup_sensor_data.json
│ │ │ │ └── schema/
│ │ │ │ └── schema_sensor_df.json
│ │ │ └── heartbeat_paused_sensor_new_record/
│ │ │ ├── column_list/
│ │ │ │ ├── heartbeat_sensor_control_table.json
│ │ │ │ └── sensor_table.json
│ │ │ ├── data/
│ │ │ │ ├── setup_heartbeat_data.csv
│ │ │ │ └── setup_sensor_data.json
│ │ │ └── schema/
│ │ │ └── schema_sensor_df.json
│ │ ├── jdbc_reader/
│ │ │ ├── jdbc_format/
│ │ │ │ ├── correct_arguments/
│ │ │ │ │ ├── batch_init.json
│ │ │ │ │ └── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ └── source/
│ │ │ │ │ └── part-01.csv
│ │ │ │ ├── predicates/
│ │ │ │ │ └── batch_init.json
│ │ │ │ └── wrong_arguments/
│ │ │ │ └── batch_init.json
│ │ │ └── jdbc_function/
│ │ │ ├── correct_arguments/
│ │ │ │ ├── batch_init.json
│ │ │ │ └── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source/
│ │ │ │ └── part-01.csv
│ │ │ └── wrong_arguments/
│ │ │ └── batch_init.json
│ │ ├── materialize_cdf/
│ │ │ ├── acon_create_table.json
│ │ │ ├── control_schema.json
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-01_cdf.csv
│ │ │ │ ├── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ └── table/
│ │ │ │ └── streaming_with_cdf.sql
│ │ │ ├── streaming_with_clean_and_vacuum.json
│ │ │ └── streaming_without_clean_cdf.json
│ │ ├── notification/
│ │ │ └── test_attachement.txt
│ │ ├── reconciliation/
│ │ │ └── data/
│ │ │ ├── current.json
│ │ │ ├── current_different_rows.json
│ │ │ ├── current_fail.json
│ │ │ ├── current_nulls_and_zeros.json
│ │ │ ├── current_nulls_and_zeros_fail.json
│ │ │ ├── truth.json
│ │ │ ├── truth_different_rows.json
│ │ │ ├── truth_empty.json
│ │ │ ├── truth_nulls_and_zeros.json
│ │ │ └── truth_nulls_and_zeros_fail.json
│ │ ├── schema_evolution/
│ │ │ ├── append_load/
│ │ │ │ ├── batch_append_disabled.json
│ │ │ │ ├── batch_append_disabled_cast.json
│ │ │ │ ├── batch_append_enabled.json
│ │ │ │ ├── batch_append_enabled_cast.json
│ │ │ │ ├── batch_init_disabled.json
│ │ │ │ ├── batch_init_enabled.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── part-02.csv
│ │ │ │ │ │ ├── part-03.csv
│ │ │ │ │ │ ├── part-05.csv
│ │ │ │ │ │ └── part-06.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ ├── part-02.csv
│ │ │ │ │ ├── part-03.csv
│ │ │ │ │ ├── part-04.csv
│ │ │ │ │ ├── part-05.csv
│ │ │ │ │ └── part-06.csv
│ │ │ │ └── schema/
│ │ │ │ ├── control/
│ │ │ │ │ ├── control_schema.json
│ │ │ │ │ ├── control_schema_add_column.json
│ │ │ │ │ └── control_schema_rename.json
│ │ │ │ └── source/
│ │ │ │ ├── source_part-01_schema.json
│ │ │ │ ├── source_part-02_schema.json
│ │ │ │ ├── source_part-03_schema.json
│ │ │ │ ├── source_part-04_schema.json
│ │ │ │ ├── source_part-05_schema.json
│ │ │ │ └── source_part-06_schema.json
│ │ │ ├── delta_load/
│ │ │ │ ├── batch_delta_disabled.json
│ │ │ │ ├── batch_delta_disabled_rename.json
│ │ │ │ ├── batch_delta_enabled.json
│ │ │ │ ├── batch_init_disabled.json
│ │ │ │ ├── batch_init_enabled.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── part-02.csv
│ │ │ │ │ │ ├── part-03.csv
│ │ │ │ │ │ ├── part-04.csv
│ │ │ │ │ │ ├── part-05.csv
│ │ │ │ │ │ └── part-06.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ ├── part-02.csv
│ │ │ │ │ ├── part-03.csv
│ │ │ │ │ ├── part-04.csv
│ │ │ │ │ ├── part-05.csv
│ │ │ │ │ └── part-06.csv
│ │ │ │ └── schema/
│ │ │ │ ├── control/
│ │ │ │ │ ├── control_schema.json
│ │ │ │ │ ├── control_schema_add_column.json
│ │ │ │ │ └── control_schema_rename.json
│ │ │ │ └── source/
│ │ │ │ ├── source_part-01_schema.json
│ │ │ │ ├── source_part-02_schema.json
│ │ │ │ ├── source_part-03_schema.json
│ │ │ │ ├── source_part-04_schema.json
│ │ │ │ ├── source_part-05_schema.json
│ │ │ │ └── source_part-06_schema.json
│ │ │ └── full_load/
│ │ │ ├── batch_init.json
│ │ │ ├── batch_merge_disabled.json
│ │ │ ├── batch_merge_enabled.json
│ │ │ ├── batch_overwrite.json
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── part-02.csv
│ │ │ │ └── source/
│ │ │ │ ├── part-01.csv
│ │ │ │ └── part-02.csv
│ │ │ └── schema/
│ │ │ ├── control/
│ │ │ │ ├── control_schema_merge_enabled.json
│ │ │ │ └── control_schema_overwrite.json
│ │ │ └── source/
│ │ │ ├── source_part-01_schema.json
│ │ │ └── source_part-02_schema.json
│ │ ├── sftp_reader/
│ │ │ └── data/
│ │ │ ├── file.csv
│ │ │ ├── file1.csv
│ │ │ ├── file2.csv
│ │ │ ├── file3.json
│ │ │ ├── file4.xml
│ │ │ └── file5.txt
│ │ ├── sharepoint/
│ │ │ ├── exceptions/
│ │ │ │ ├── acons/
│ │ │ │ │ ├── drive_exception.json
│ │ │ │ │ ├── endpoint_exception.json
│ │ │ │ │ ├── local_path_exception.json
│ │ │ │ │ ├── site_exception.json
│ │ │ │ │ └── streaming_exception.json
│ │ │ │ └── schemas/
│ │ │ │ └── schema.json
│ │ │ ├── reader/
│ │ │ │ ├── acons/
│ │ │ │ │ ├── read_file_name_and_file_pattern_conflict_should_fail.json
│ │ │ │ │ ├── read_file_name_unsupported_extension_should_fail.json
│ │ │ │ │ ├── read_folder_csv_archive_enabled_success.json
│ │ │ │ │ ├── read_folder_csv_archive_success_subfolder_override_success.json
│ │ │ │ │ ├── read_folder_csv_no_csv_files_should_fail.json
│ │ │ │ │ ├── read_folder_csv_one_file_schema_mismatch_custom_error_subfolder_should_archive_error.json
│ │ │ │ │ ├── read_folder_csv_one_file_schema_mismatch_should_archive_error.json
│ │ │ │ │ ├── read_folder_csv_pattern_matches_no_files_should_fail.json
│ │ │ │ │ ├── read_folder_csv_pattern_success.json
│ │ │ │ │ ├── read_folder_csv_success.json
│ │ │ │ │ ├── read_folder_path_does_not_exist_should_fail.json
│ │ │ │ │ ├── read_folder_relative_path_looks_like_file_unsupported_extension_should_fail.json
│ │ │ │ │ ├── read_single_csv_archive_default_enabled_success.json
│ │ │ │ │ ├── read_single_csv_archive_enabled_success.json
│ │ │ │ │ ├── read_single_csv_archive_success_subfolder_override_success.json
│ │ │ │ │ ├── read_single_csv_download_error_should_archive_error.json
│ │ │ │ │ ├── read_single_csv_empty_file_should_archive_error.json
│ │ │ │ │ ├── read_single_csv_full_path_success.json
│ │ │ │ │ ├── read_single_csv_full_path_with_file_name_should_fail.json
│ │ │ │ │ ├── read_single_csv_full_path_with_file_pattern_should_fail.json
│ │ │ │ │ ├── read_single_csv_full_path_with_file_type_should_fail.json
│ │ │ │ │ ├── read_single_csv_spark_load_fails_should_archive_error.json
│ │ │ │ │ ├── read_single_csv_success.json
│ │ │ │ │ └── read_unsupported_file_type_should_fail.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── bad_schema.csv
│ │ │ │ │ ├── other.csv
│ │ │ │ │ ├── sample_1.csv
│ │ │ │ │ └── sample_2.csv
│ │ │ │ └── mocks/
│ │ │ │ ├── get_drive_id.json
│ │ │ │ ├── get_file_metadata.json
│ │ │ │ ├── get_site_id.json
│ │ │ │ └── rename_file.json
│ │ │ └── writer/
│ │ │ ├── acons/
│ │ │ │ └── write_to_local_success.json
│ │ │ ├── data/
│ │ │ │ ├── file_control.csv
│ │ │ │ └── file_source.csv
│ │ │ ├── mocks/
│ │ │ │ ├── create_upload_session.json
│ │ │ │ ├── get_drive_id.json
│ │ │ │ └── get_site_id.json
│ │ │ └── schemas/
│ │ │ └── schema.json
│ │ ├── table_manager/
│ │ │ ├── compute_table_statistics/
│ │ │ │ ├── table_stats_complex_default_scenario1.json
│ │ │ │ ├── table_stats_complex_default_scenario2.json
│ │ │ │ ├── table_stats_complex_different_delimiter_scenario1.json
│ │ │ │ ├── table_stats_complex_different_delimiter_scenario2.json
│ │ │ │ └── table_stats_simple_split_scenario.json
│ │ │ ├── create/
│ │ │ │ ├── acon_create_table.json
│ │ │ │ ├── acon_create_table_complex_default_scenario.json
│ │ │ │ ├── acon_create_table_complex_different_delimiter_scenario.json
│ │ │ │ ├── acon_create_table_simple_split_scenario.json
│ │ │ │ ├── acon_create_view.json
│ │ │ │ ├── acon_create_view_complex_default_scenario.json
│ │ │ │ ├── acon_create_view_complex_different_delimiter_scenario.json
│ │ │ │ ├── acon_create_view_simple_split_scenario.json
│ │ │ │ ├── table/
│ │ │ │ │ ├── test_table_complex_default_scenario.sql
│ │ │ │ │ ├── test_table_complex_different_delimiter_scenario.sql
│ │ │ │ │ └── test_table_simple_split_scenario.sql
│ │ │ │ └── view/
│ │ │ │ ├── test_view_complex_default_scenario.sql
│ │ │ │ ├── test_view_complex_different_delimiter_scenario.sql
│ │ │ │ └── test_view_simple_split_scenario.sql
│ │ │ ├── delete/
│ │ │ │ └── acon_delete_where_table_simple_split_scenario.json
│ │ │ ├── describe/
│ │ │ │ └── acon_describe_simple_split_scenario.json
│ │ │ ├── drop/
│ │ │ │ ├── acon_drop_table_simple_split_scenario.json
│ │ │ │ └── acon_drop_view_simple_split_scenario.json
│ │ │ ├── execute_sql/
│ │ │ │ ├── acon_execute_sql_complex_default_scenario.json
│ │ │ │ ├── acon_execute_sql_complex_different_delimiter_scenario.json
│ │ │ │ └── acon_execute_sql_simple_split_scenario.json
│ │ │ ├── get_tbl_pk/
│ │ │ │ └── get_tbl_pk_simple_split_scenario.json
│ │ │ ├── optimize/
│ │ │ │ ├── optimize_location.json
│ │ │ │ ├── optimize_location_simple_split_scenario.json
│ │ │ │ ├── optimize_table.json
│ │ │ │ └── optimize_table_simple_split_scenario.json
│ │ │ ├── show_tbl_properties/
│ │ │ │ └── show_tbl_properties_simple_split_scenario.json
│ │ │ └── vacuum/
│ │ │ ├── acon_vacuum_location.json
│ │ │ ├── acon_vacuum_location_simple_split_scenario.json
│ │ │ └── acon_vacuum_table_simple_split_scenario.json
│ │ ├── transformations/
│ │ │ ├── chain_transformations/
│ │ │ │ ├── acons/
│ │ │ │ │ ├── batch.json
│ │ │ │ │ ├── streaming.json
│ │ │ │ │ ├── streaming_batch.json
│ │ │ │ │ ├── write_streaming_struct_data.json
│ │ │ │ │ └── write_streaming_struct_data_fail.json
│ │ │ │ ├── control/
│ │ │ │ │ ├── chain_control.csv
│ │ │ │ │ └── struct_data.json
│ │ │ │ ├── schema/
│ │ │ │ │ ├── customer_schema.json
│ │ │ │ │ ├── sales_schema.json
│ │ │ │ │ └── struct_data_schema.json
│ │ │ │ └── source/
│ │ │ │ ├── customers.csv
│ │ │ │ ├── sales_historical.csv
│ │ │ │ ├── sales_new.csv
│ │ │ │ └── struct_data.csv
│ │ │ ├── column_creators/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.json
│ │ │ │ │ └── source/
│ │ │ │ │ └── part-01.csv
│ │ │ │ ├── source_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── column_reshapers/
│ │ │ │ ├── explode_arrays/
│ │ │ │ │ ├── batch.json
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── control/
│ │ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ │ └── source/
│ │ │ │ │ │ └── part-01.json
│ │ │ │ │ ├── source_schema.json
│ │ │ │ │ └── streaming.json
│ │ │ │ ├── flatten_and_explode_arrays_and_maps/
│ │ │ │ │ ├── batch.json
│ │ │ │ │ ├── data/
│ │ │ │ │ │ ├── control/
│ │ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ │ └── source/
│ │ │ │ │ │ └── part-01.json
│ │ │ │ │ ├── source_schema.json
│ │ │ │ │ └── streaming.json
│ │ │ │ └── flatten_schema/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ └── source/
│ │ │ │ │ └── part-01.json
│ │ │ │ ├── source_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── data_maskers/
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── drop_columns.csv
│ │ │ │ │ │ └── hash_masking.csv
│ │ │ │ │ └── source/
│ │ │ │ │ └── part-01.csv
│ │ │ │ ├── drop_columns.json
│ │ │ │ ├── drop_columns_control_schema.json
│ │ │ │ ├── hash_masking.json
│ │ │ │ ├── hash_masking_control_schema.json
│ │ │ │ └── source_schema.json
│ │ │ ├── date_transformers/
│ │ │ │ ├── control_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ └── source/
│ │ │ │ │ └── part-01.csv
│ │ │ │ ├── source_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── drop_duplicate_rows/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── batch_distinct.json
│ │ │ │ │ │ ├── batch_drop_duplicates.json
│ │ │ │ │ │ ├── streaming_distinct.json
│ │ │ │ │ │ └── streaming_drop_duplicates.json
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── source_schema.json
│ │ │ │ └── streaming.json
│ │ │ ├── joiners/
│ │ │ │ ├── batch.json
│ │ │ │ ├── control_scenario_1_and_2_schema.json
│ │ │ │ ├── control_scenario_3_schema.json
│ │ │ │ ├── customer_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── control_scenario_1_and_2.csv
│ │ │ │ │ │ └── control_scenario_3.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── customer-part-01.csv
│ │ │ │ │ ├── sales-part-01.csv
│ │ │ │ │ └── sales-part-02.csv
│ │ │ │ ├── sales_schema.json
│ │ │ │ ├── streaming.json
│ │ │ │ ├── streaming_foreachBatch.json
│ │ │ │ ├── streaming_without_broadcast.json
│ │ │ │ └── streaming_without_column_rename.json
│ │ │ ├── multiple_transform/
│ │ │ │ ├── batch.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.json
│ │ │ │ │ └── source/
│ │ │ │ │ └── part-01.csv
│ │ │ │ └── source_schema.json
│ │ │ ├── null_handlers/
│ │ │ │ ├── control_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── replace_nulls.csv
│ │ │ │ │ │ └── replace_nulls_col_subset.csv
│ │ │ │ │ └── source/
│ │ │ │ │ └── part-01.csv
│ │ │ │ ├── replace_nulls.json
│ │ │ │ ├── replace_nulls_col_subset.json
│ │ │ │ └── source_schema.json
│ │ │ ├── optimizers/
│ │ │ │ └── data/
│ │ │ │ └── source/
│ │ │ │ └── part-01.csv
│ │ │ ├── regex_transformers/
│ │ │ │ └── with_regex_value/
│ │ │ │ ├── batch.json
│ │ │ │ ├── control_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── part-01.csv
│ │ │ │ │ └── source/
│ │ │ │ │ └── WE_SO_SCL_202108111400000029.csv
│ │ │ │ └── source_schema.json
│ │ │ ├── unions/
│ │ │ │ ├── batch_union.json
│ │ │ │ ├── batch_unionByName.json
│ │ │ │ ├── batch_unionByName_diff_schema.json
│ │ │ │ ├── batch_unionByName_diff_schema_error.json
│ │ │ │ ├── batch_union_diff_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ ├── control_sales.csv
│ │ │ │ │ │ ├── control_sales_shipment.csv
│ │ │ │ │ │ ├── control_sales_shipment_streaming.csv
│ │ │ │ │ │ ├── control_sales_shipment_streaming_foreachBatch.csv
│ │ │ │ │ │ ├── control_sales_streaming.csv
│ │ │ │ │ │ └── control_sales_streaming_foreachBatch.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── sales-historical-part-01.csv
│ │ │ │ │ ├── sales-historical-part-02.csv
│ │ │ │ │ ├── sales-new-part-01.csv
│ │ │ │ │ ├── sales-new-part-02.csv
│ │ │ │ │ ├── sales-shipment-part-01.csv
│ │ │ │ │ └── sales-shipment-part-02.csv
│ │ │ │ ├── sales_schema.json
│ │ │ │ ├── sales_shipment_schema.json
│ │ │ │ ├── streaming_union.json
│ │ │ │ ├── streaming_unionByName_diff_schema.json
│ │ │ │ ├── streaming_unionByName_diff_schema_foreachBatch.json
│ │ │ │ └── streaming_union_foreachBatch.json
│ │ │ └── watermarker/
│ │ │ ├── streaming_drop_duplicates/
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── streaming_drop_duplicates.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── source_schema.json
│ │ │ │ └── streaming_drop_duplicates.json
│ │ │ ├── streaming_drop_duplicates_overall_watermark/
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── streaming_drop_duplicates_overall_watermark.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── part-01.csv
│ │ │ │ │ └── part-02.csv
│ │ │ │ ├── source_schema.json
│ │ │ │ └── streaming_drop_duplicates_overall_watermark.json
│ │ │ ├── streaming_inner_join/
│ │ │ │ ├── customer_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── streaming_inner_join.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── customer-part-01.csv
│ │ │ │ │ ├── sales-part-01.csv
│ │ │ │ │ └── sales-part-02.csv
│ │ │ │ ├── sales_schema.json
│ │ │ │ ├── streaming_inner_join.json
│ │ │ │ └── streaming_inner_join_control_schema.json
│ │ │ ├── streaming_left_outer_join/
│ │ │ │ ├── customer_schema.json
│ │ │ │ ├── data/
│ │ │ │ │ ├── control/
│ │ │ │ │ │ └── streaming_left_outer_join.csv
│ │ │ │ │ └── source/
│ │ │ │ │ ├── customer-part-01.csv
│ │ │ │ │ ├── customer-part-02.csv
│ │ │ │ │ ├── customer-part-03.csv
│ │ │ │ │ ├── customer-part-04.csv
│ │ │ │ │ ├── customer-part-05.csv
│ │ │ │ │ ├── sales-part-01.csv
│ │ │ │ │ ├── sales-part-02.csv
│ │ │ │ │ ├── sales-part-03.csv
│ │ │ │ │ ├── sales-part-04.csv
│ │ │ │ │ └── sales-part-05.csv
│ │ │ │ ├── sales_schema.json
│ │ │ │ ├── streaming_left_outer_join.json
│ │ │ │ └── streaming_left_outer_join_control_schema.json
│ │ │ └── streaming_right_outer_join/
│ │ │ ├── customer_schema.json
│ │ │ ├── data/
│ │ │ │ ├── control/
│ │ │ │ │ └── streaming_right_outer_join.csv
│ │ │ │ └── source/
│ │ │ │ ├── customer-part-01.csv
│ │ │ │ ├── sales-part-01.csv
│ │ │ │ └── sales-part-02.csv
│ │ │ ├── sales_schema.json
│ │ │ ├── streaming_right_outer_join.json
│ │ │ └── streaming_right_outer_join_control_schema.json
│ │ └── writers/
│ │ ├── acons/
│ │ │ ├── write_batch_console.json
│ │ │ ├── write_batch_dataframe.json
│ │ │ ├── write_batch_files.json
│ │ │ ├── write_batch_jdbc.json
│ │ │ ├── write_batch_rest_api.json
│ │ │ ├── write_batch_table.json
│ │ │ ├── write_streaming_console.json
│ │ │ ├── write_streaming_dataframe.json
│ │ │ ├── write_streaming_df_with_checkpoint.json
│ │ │ ├── write_streaming_files.json
│ │ │ ├── write_streaming_foreachBatch_console.json
│ │ │ ├── write_streaming_foreachBatch_dataframe.json
│ │ │ ├── write_streaming_foreachBatch_df_with_checkpoint.json
│ │ │ ├── write_streaming_foreachBatch_files.json
│ │ │ ├── write_streaming_foreachBatch_jdbc.json
│ │ │ ├── write_streaming_foreachBatch_table.json
│ │ │ ├── write_streaming_multiple_dfs.json
│ │ │ ├── write_streaming_rest_api.json
│ │ │ └── write_streaming_table.json
│ │ ├── control/
│ │ │ ├── writers_control.csv
│ │ │ ├── writers_control_streaming_dataframe_1.csv
│ │ │ ├── writers_control_streaming_dataframe_2.csv
│ │ │ ├── writers_control_streaming_dataframe_foreachBatch_1.csv
│ │ │ └── writers_control_streaming_dataframe_foreachBatch_2.csv
│ │ ├── schema/
│ │ │ └── sales_schema.json
│ │ └── source/
│ │ ├── sales_historical_1.csv
│ │ ├── sales_historical_2.csv
│ │ ├── sales_new_1.csv
│ │ └── sales_new_2.csv
│ └── unit/
│ ├── custom_configs/
│ │ └── custom_engine_config.yaml
│ ├── heartbeat/
│ │ ├── heartbeat_acon_creation/
│ │ │ └── setup/
│ │ │ └── column_list/
│ │ │ ├── heartbeat_sensor_control_table.json
│ │ │ └── sensor_table.json
│ │ └── heartbeat_anchor_job/
│ │ └── setup/
│ │ └── column_list/
│ │ ├── heartbeat_sensor_control_table.json
│ │ └── sensor_table.json
│ └── sharepoint_reader/
│ └── data/
│ ├── sample_ok.csv
│ └── sample_other_delim.csv
├── unit/
│ ├── __init__.py
│ ├── test_acon_validation.py
│ ├── test_custom_configs.py
│ ├── test_databricks_utils.py
│ ├── test_failure_notification_creation.py
│ ├── test_heartbeat_acon_creation.py
│ ├── test_heartbeat_anchor_job.py
│ ├── test_log_filter_sensitive_data.py
│ ├── test_notification_creation.py
│ ├── test_notification_factory.py
│ ├── test_prisma_dq_rule_id.py
│ ├── test_prisma_function_definition.py
│ ├── test_rest_api_functions.py
│ ├── test_sensor.py
│ ├── test_sensor_manager.py
│ ├── test_sharepoint_csv_reader.py
│ ├── test_spark_session.py
│ └── test_version.py
└── utils/
├── __init__.py
├── dataframe_helpers.py
├── dq_rules_table_utils.py
├── exec_env_helpers.py
├── local_storage.py
├── mocks.py
└── smtp_server.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a Bug report to help us improve
title: "[BUG] Function X is raising error Y"
labels: bug
assignees: jmcorreia
---
**Describe the bug**
A clear and concise description of what the bug is.
**Environment Details**
- Lakehouse Engine Version
- Environment where you are using the Lakehouse Engine (Ex. Databricks 13.3LTS)
**To Reproduce**
Please include all the necessary details to reproduce the problem, including the full ACON or functions that are being used and at what point the problem is occurring.
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**Additional context**
Add any other context about the problem here.
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: "[FEATURE] I would like to have the capability to do X"
labels: enhancement
assignees: jmcorreia
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context, useful links or screenshots about the feature request here.
================================================
FILE: .github/pull_request_template.md
================================================
- [ ] Description of PR changes above includes a link to [an existing GitHub issue](https://github.com/adidas/lakehouse-engine/issues)
- [ ] PR title is prefixed with one of: [BUGFIX], [FEATURE]
- [ ] Appropriate tests and docs have been updated
- [ ] Code is linted and tested:
```
make style
make lint
make test
make test-security
```
For more information about contributing, see [Contribute](https://github.com/adidas/lakehouse-engine/blob/master/CONTRIBUTING.md).
After you submit your PR, keep **monitoring its status and discussing/applying fixes for any issues or suggestions coming from the PR reviews**.
Thanks for contributing!
================================================
FILE: .gitignore
================================================
# mac os hidden files
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# intellij and vscode
.idea/
**.iml
.vscode/
# credentials
**credential**
# lakehouse and spark
/tests/lakehouse/**
*derby.log*
**/metastore_db/
/metastore_db/
**/spark-warehouse/
/spark-warehouse/
/artefacts/
tmp_os/
================================================
FILE: CONTRIBUTING.md
================================================
# How to Contribute
📖 Search algorithms, transformations and check implementation details & examples in our [documentation](https://adidas.github.io/lakehouse-engine-docs/lakehouse_engine.html).
💭 In case you have doubts, ideas, want to ask for help or want to discuss different approaches and usages, feel free to create a [discussion](https://github.com/adidas/lakehouse-engine/discussions).
⚠️ Are you facing any issues? Open an issue on [GitHub](https://github.com/adidas/lakehouse-engine/issues).
💡 Do you have ideas for new features? Open a feature request on [GitHub](https://github.com/adidas/lakehouse-engine/issues).
🚀 Want to find the available releases? Check our release notes on [GitHub](https://github.com/adidas/lakehouse-engine/releases) and [PyPi](https://pypi.org/project/lakehouse-engine/).
## Prerequisites
1. Git.
2. Your IDE of choice with a Python 3 environment (e.g., virtualenv created from the requirements_cicd.txt file).
3. Docker. **Warning:** The default spark driver memory limit for the tests is set at 2g. This limit is configurable but your
testing docker setup **MUST** always have **at least** 2 * spark driver memory limit + 1 gb configured.
4. GNU make.
## General steps for contributing
1. Fork the project.
2. Clone the forked project into your working environment.
3. Create your feature branch following the convention [feature|bugfix]/ISSUE_ID_short_name.
4. Apply your changes in the recently created branch. It is **mandatory** to add tests covering the feature or fix contributed.
5. Style, lint, test and test security:
```
make style
make lint
make test
make test-security
```
---
> ***Note:*** To use the make targets with a docker-compatible CLI other than docker, you can pass the parameter "container_cli".
Example: `make test container_cli=nerdctl`
---
---
> ***Note:*** Most make targets run on docker. If you face any problem, you can also check the code of the respective make targets and directly execute the code in your python virtual environment.
---
6. (optional) You can build the wheel locally with `make build`.
7. (optional) Install the wheel you have just generated and test it (see the example after this list).
8. If you have changed or added new requirements, you should run `make build-lock-files`, to rebuild the lock files.
9. If the transitive dependencies have not been updated for a while, and you want to upgrade them, you can use `make upgrade-lock-files` to update them. This will update the transitive dependencies even if you have not changed the requirements.
10. When you're ready with your changes, open a Pull Request (PR) to develop.
11. Ping the team through the preferred communication channel.
12. The team will come together to review it and approve it (2 approvals required).
13. Your changes will be tested internally, promoted to master and included in the next release.
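As an illustration of the optional steps 6 and 7 above, building the wheel and installing it in a scratch virtual environment could look like this (the wheel filename depends on the current project version, so treat it as a placeholder):
```
make build
pip install dist/lakehouse_engine-<version>-py3-none-any.whl
```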
> 🚀🚀🚀
>
> **Pull Requests are welcome from anyone**. However, before opening one, please make sure to open an issue on [GitHub](https://github.com/adidas/lakehouse-engine/issues)
> and link it.
> Moreover, if the Pull Request intends to cover big changes or features, it is recommended to first discuss it on a [GitHub issue](https://github.com/adidas/lakehouse-engine/issues) or [Discussion](https://github.com/adidas/lakehouse-engine/discussions).
>
> 🚀🚀🚀
================================================
FILE: LICENSE.txt
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2023 adidas AG
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: Makefile
================================================
SHELL := /bin/bash -euxo pipefail
container_cli := docker
image_name := lakehouse-engine
deploy_env := dev
project_version := $(shell cat cicd/.bumpversion.cfg | grep "current_version =" | cut -f 3 -d " ")
version := $(project_version)
# Gets system information in upper case
system_information := $(shell uname -mvp | tr a-z A-Z)
meta_conf_file := cicd/meta.yaml
meta_os_conf_file := cicd/meta_os.yaml
group_id := $(shell id -g ${USER})
engine_conf_file := lakehouse_engine/configs/engine.yaml
engine_os_conf_file := lakehouse_engine/configs/engine_os.yaml
remove_files_from_os := $(engine_conf_file) $(meta_conf_file) CODEOWNERS sonar-project.properties CONTRIBUTING.md CHANGELOG.md assets/img/os_strategy.png
last_commit_msg := "$(shell git log -1 --pretty=%B)"
git_tag := $(shell git describe --tags --abbrev=0)
commits_url := $(shell cat $(meta_conf_file) | grep commits_url | cut -f 2 -d " ")
ifneq ($(project_version), $(version))
wheel_version := $(project_version)+$(subst _,.,$(subst -,.,$(version)))
project_name := lakehouse-engine-experimental
else
wheel_version := $(version)
project_name := lakehouse-engine
endif
# Add \ to make reg safe comparisons (e.g. in the perl commands)
wheel_version_reg_safe := $(subst +,\+,$(subst .,\.,$(wheel_version)))
project_version_reg_safe := $(subst .,\.,$(project_version))
# Condition to define the Python image to be built based on the machine CPU architecture.
# The base Python image only changes if the identified CPU architecture is ARM.
ifneq (,$(findstring ARM,$(system_information)))
python_image := $(shell cat $(meta_conf_file) | grep arm_python_image | cut -f 2 -d " ")
cpu_architecture := arm64
else
python_image := $(shell cat $(meta_conf_file) | grep amd_python_image | cut -f 2 -d " ")
cpu_architecture := amd64
endif
# Condition to define the spark driver memory limit to be used in the tests
# In order to change this limit you can use the spark_driver_memory parameter
# Example: make test spark_driver_memory=3g
#
# WARNING: When the tests are being run 2 spark nodes are created, so despite
# the default value being 2g, your configured docker environment should have
# extra memory for communication and overhead.
ifndef $(spark_driver_memory)
spark_driver_memory := "2g"
endif
# A requirements_full.lock file is created based on all the requirements of the project (core, dq, os, azure, sftp, cicd and sharepoint).
# The requirements_full.lock file is then used as a constraints file to build the other lock files so that we ensure dependencies are consistent and compatible
# with each other, otherwise, the installations would likely fail.
# Moreover, the requirements_full.lock file is also used in the dockerfile to install all project dependencies.
full_requirements := -o requirements_full.lock requirements.txt requirements_os.txt requirements_dq.txt requirements_azure.txt requirements_sftp.txt requirements_cicd.txt requirements_sharepoint.txt
requirements := -o requirements.lock requirements.txt -c requirements_full.lock
os_requirements := -o requirements_os.lock requirements_os.txt -c requirements_full.lock
dq_requirements = -o requirements_dq.lock requirements_dq.txt -c requirements_full.lock
azure_requirements = -o requirements_azure.lock requirements_azure.txt -c requirements_full.lock
sftp_requirements = -o requirements_sftp.lock requirements_sftp.txt -c requirements_full.lock
sharepoint_requirements = -o requirements_sharepoint.lock requirements_sharepoint.txt -c requirements_full.lock
os_deployment := False
container_user_dir := /home/appuser
trust_git_host := ssh -oStrictHostKeyChecking=no -i $(container_user_dir)/.ssh/id_rsa git@github.com
ifeq ($(os_deployment), True)
build_src_dir := tmp_os/lakehouse-engine
else
build_src_dir := .
endif
build-image:
$(container_cli) build \
--build-arg USER_ID=$(shell id -u ${USER}) \
--build-arg GROUP_ID=$(group_id) \
--build-arg PYTHON_IMAGE=$(python_image) \
--build-arg CPU_ARCHITECTURE=$(cpu_architecture) \
-t $(image_name):$(version) . -f cicd/Dockerfile
build-image-windows:
$(container_cli) build \
--build-arg PYTHON_IMAGE=$(python_image) \
--build-arg CPU_ARCHITECTURE=$(cpu_architecture) \
-t $(image_name):$(version) . -f cicd/Dockerfile
# The build target is used to build the wheel package.
# It makes usage of some `perl` commands to change the project wheel version in the pyproject.toml file,
# whenever the goal is to release a package for testing, instead of an official release.
# Ex: if you run 'make build version=feature-x-1276', and the current project version is 1.20.0, the generated wheel will be: lakehouse_engine_experimental-1.20.0+feature.x.1276-py3-none-any.whl,
# while for the official 1.20.0 release, the wheel will be: lakehouse_engine-1.20.0-py3-none-any.whl.
build:
perl -pi -e 's/version = "$(project_version_reg_safe)"/version = "$(wheel_version)"/g' pyproject.toml && \
perl -pi -e 's/name = "lakehouse-engine"/name = "$(project_name)"/g' pyproject.toml && \
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'python -m build --wheel $(build_src_dir)' && \
perl -pi -e 's/version = "$(wheel_version_reg_safe)"/version = "$(project_version)"/g' pyproject.toml && \
perl -pi -e 's/name = "$(project_name)"/name = "lakehouse-engine"/g' pyproject.toml
deploy: build
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
-v $(artifactory_credentials_file):$(container_user_dir)/.pypirc \
$(image_name):$(version) \
/bin/bash -c 'twine upload -r artifactory dist/$(subst -,_,$(project_name))-$(wheel_version)-py3-none-any.whl --skip-existing'
docs:
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'cd $(build_src_dir) && pip install . && python ./cicd/code_doc/render_docs.py'
# mypy incremental mode is used by default, so in case there is any cache related issue,
# you can modify the command to include --no-incremental flag or you can delete mypy_cache folder.
lint:
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'flake8 --docstring-convention google --config=cicd/flake8.conf lakehouse_engine tests cicd/code_doc/render_docs.py \
&& mypy --no-incremental lakehouse_engine tests'
# useful to print and use make variables. Usage: make print-variable var=variable_to_print.
print-variable:
@echo $($(var))
style:
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c '''isort lakehouse_engine tests cicd/code_doc/render_docs.py && \
black lakehouse_engine tests cicd/code_doc/render_docs.py'''
terminal:
$(container_cli) run \
-it \
--rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash
# Can use test only: ```make test test_only="tests/feature/test_delta_load_record_mode_cdc.py"```.
# You can also hack it by doing ```make test test_only="-rx tests/feature/test_delta_load_record_mode_cdc.py"```
# to show complete output even of passed tests.
# We also fix the coverage filepaths, using perl, so that the report has the correct paths
test:
$(container_cli) run \
--rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c "pytest \
--junitxml=artefacts/tests.xml \
--cov-report xml --cov-report xml:artefacts/coverage.xml \
--cov-report term-missing --cov=lakehouse_engine \
--log-cli-level=INFO --color=yes -x -vv \
--spark_driver_memory=$(spark_driver_memory) $(test_only)" && \
perl -pi -e 's/filename=\"/filename=\"lakehouse_engine\//g' artefacts/coverage.xml
test-security:
$(container_cli) run \
--rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'bandit -c cicd/bandit.yaml -r lakehouse_engine tests'
#####################################
##### Dependency Management Targets #####
#####################################
audit-dep-safety:
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'pip-audit -r cicd/requirements_full.lock --desc on -f json --fix --dry-run -o artefacts/safety_analysis.json'
# This target will build the lock files to be used for building the wheel and delivering it.
build-lock-files:
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'cd cicd && pip-compile --resolver=backtracking $(full_requirements) && \
pip-compile --resolver=backtracking $(requirements) && \
pip-compile --resolver=backtracking $(os_requirements) && \
pip-compile --resolver=backtracking $(dq_requirements) && \
pip-compile --resolver=backtracking $(azure_requirements) && \
pip-compile --resolver=backtracking $(sftp_requirements) && \
pip-compile --resolver=backtracking $(sharepoint_requirements)'
# We test the dependencies to check if they need to be updated because requirements.txt files have changed.
# On top of that, we also test if we will be able to install the base and the extra packages together,
# as their lock files are built separately and therefore dependency constraints might be too restrictive.
# If that happens, pip install will fail because it cannot solve the dependency resolution process, and therefore
# we need to pin those conflicting dependencies in the requirements.txt files to a version that fits both the base and
# extra packages.
test-deps:
@GIT_STATUS="$$(git status --porcelain --ignore-submodules cicd/)"; \
if [ ! "x$$GIT_STATUS" = "x" ]; then \
echo "!!! Requirements lists has been updated but lock file was not rebuilt !!!"; \
echo "!!! Run make build-lock-files !!!"; \
echo -e "$${GIT_STATUS}"; \
git diff cicd/; \
exit 1; \
fi
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'pip install -e .[azure,dq,sftp,os] --dry-run --ignore-installed'
# This will update the transitive dependencies even if there were no changes in the requirements files.
# This should be a recurrent activity to make sure transitive dependencies are kept up to date.
upgrade-lock-files:
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'cd cicd && pip-compile --resolver=backtracking --upgrade $(full_requirements) && \
pip-compile --resolver=backtracking --upgrade $(requirements) && \
pip-compile --resolver=backtracking --upgrade $(os_requirements) && \
pip-compile --resolver=backtracking --upgrade $(dq_requirements) && \
pip-compile --resolver=backtracking --upgrade $(azure_requirements) && \
pip-compile --resolver=backtracking --upgrade $(sftp_requirements) && \
pip-compile --resolver=backtracking --upgrade $(sharepoint_requirements)'
#####################################
##### GitHub Deployment Targets #####
#####################################
prepare-github-repo:
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
-v $(git_credentials_file):$(container_user_dir)/.ssh/id_rsa \
$(image_name):$(version) \
/bin/bash -c """mkdir -p tmp_os/$(repository); \
cd tmp_os/$(repository); \
git init -b master; \
git config pull.rebase false; \
git config user.email 'lak-engine@adidas.com'; \
git config user.name 'Lakehouse Engine'; \
$(trust_git_host); \
git remote add origin git@github.com:adidas/$(repository).git; \
git pull origin master --tags"""
sync-to-github: prepare-github-repo
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
-v $(git_credentials_file):$(container_user_dir)/.ssh/id_rsa \
$(image_name):$(version) \
/bin/bash -c """cd tmp_os/lakehouse-engine; \
rsync -r --exclude=.git --exclude=.*cache* --exclude=venv --exclude=dist --exclude=tmp_os /app/ . ; \
rm $(remove_files_from_os); \
mv $(engine_os_conf_file) $(engine_conf_file); \
mv $(meta_os_conf_file) $(meta_conf_file); \
mv CONTRIBUTING_OS.md CONTRIBUTING.md; \
$(trust_git_host); \
git add . ; \
git commit -m "'${last_commit_msg}'"; \
git tag -a $(git_tag) -m 'Release $(git_tag)' ; \
git push origin master --follow-tags;"""
deploy-docs-to-github: docs prepare-github-repo
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
-v $(git_credentials_file):$(container_user_dir)/.ssh/id_rsa \
$(image_name):$(version) \
/bin/bash -c """cp -r tmp_os/lakehouse-engine/artefacts/docs/site/* tmp_os/lakehouse-engine-docs/ ; \
cd tmp_os/lakehouse-engine-docs; \
$(trust_git_host); \
git add . ; \
git commit -m 'Lakehouse Engine $(version) documentation'; \
git push origin master ; \
cd .. && rm -rf tmp_os/lakehouse-engine-docs"""
deploy-to-pypi: build
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
-v $(pypi_credentials_file):$(container_user_dir)/.pypirc \
$(image_name):$(version) \
/bin/bash -c 'twine upload tmp_os/lakehouse-engine/dist/lakehouse_engine-$(project_version)-py3-none-any.whl --skip-existing'
deploy-to-pypi-and-clean: deploy-to-pypi
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'rm -rf tmp_os/lakehouse-engine'
###########################
##### Release Targets #####
###########################
create-changelog:
echo "# Changelog - $(shell date +"%Y-%m-%d") v$(shell cat cicd/.bumpversion.cfg | grep "current_version =" | cut -f 3 -d " ")" > CHANGELOG.md && \
echo "All notable changes to this project will be documented in this file automatically. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)." >> CHANGELOG.md && \
echo "" >> CHANGELOG.md && \
git log --no-decorate --pretty=format:"#### [%cs] [%(describe)]%n [%h]($(commits_url)%H) %s" -n 1000 >> CHANGELOG.md
bump-up-version:
$(container_cli) run --rm \
-w /app \
-v "$$PWD":/app \
$(image_name):$(version) \
/bin/bash -c 'bump2version --config-file cicd/.bumpversion.cfg $(increment)'
prepare-release: bump-up-version create-changelog
echo "Prepared version and changelog to release!"
commit-release:
git commit -a -m 'Create release $(version)' && \
git tag -a 'v$(version)' -m 'Release $(version)'
push-release:
git push --follow-tags
delete-tag:
git push --delete origin $(tag)
.PHONY: $(MAKECMDGOALS)
================================================
FILE: README.md
================================================
<img align="right" src="assets/img/lakehouse_engine_logo_symbol_small.png" alt="Lakehouse Engine Logo">
# Lakehouse Engine
A configuration driven Spark framework, written in Python, serving as a scalable and distributed engine for several lakehouse algorithms, data flows and utilities for Data Products.
---
> ***Note:*** whenever you read Data Product or Data Product team, we refer to teams and use cases whose main focus is on
leveraging the power of data, on a particular topic, end-to-end (ingestion, consumption...) to achieve insights, supporting faster and better decisions,
which generate value for their businesses. These teams should not focus on building reusable frameworks, but on re-using existing frameworks to achieve their goals.
---
## Main Goals
The goal of the Lakehouse Engine is to bring some advantages, such as:
- offer cutting-edge, standard, governed and battle-tested foundations that several Data Product teams can benefit from;
- avoid Data Product teams developing siloed solutions, reducing technical debt and high operating costs (redundant developments across teams);
- allow Data Product teams to focus mostly on data-related tasks, avoiding wasting time & resources on developing the same code for different use cases;
- benefit from the fact that many teams are reusing the same code, which increases the likelihood that common issues are surfaced and solved faster;
- decrease the dependency and learning curve to Spark and other technologies that the Lakehouse Engine abstracts;
- speed up repetitive tasks;
- reduce vendor lock-in.
---
> ***Note:*** even though you will see a focus on AWS and Databricks, this is just due to the lack of use cases for other technologies like GCP and Azure, but we are open to contributions.
---
## Key Features
⭐ **Data Loads:** perform data loads from diverse source types and apply transformations and data quality validations,
ensuring trustworthy data, before integrating it into distinct target types. Additionally, people can also define termination
actions like optimisations or notifications. [On the usage section](#load-data-usage-example) you will find an example using all the supported keywords for data loads.
---
> ***Note:*** The Lakehouse
Engine supports different types of sources and targets, such as kafka, jdbc, dataframes, files (csv, parquet, json, delta...), sftp, sap bw, sap b4...
---
⭐ **Transformations:** configuration driven transformations without the need to write any spark code. Transformations can be applied by using the `transform_specs` in the Data Loads.
---
> ***Note:*** you can search all the available transformations, as well as checking implementation details and examples [here](reference/packages/transformers/index.md).
---
⭐ **Data Quality Validations:** the Lakehouse Engine uses Great Expectations as a backend and abstracts any implementation
details by offering people the capability to specify what validations to apply on the data, solely using dict/json based configurations.
The Data Quality validations can be applied on:
- post-mortem (static) data, using the DQ Validator algorithm (`execute_dq_validation`)
- data in-motion, using the `dq_specs` keyword in the Data Loads, to add it as one more step while loading data.
[On the usage section](#load-data-usage-example) you will find an example using this type of Data Quality validations.
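For the post-mortem (static) flavour, a minimal sketch of calling `execute_dq_validation` could look like the block below. The ACON keys used here (`input_spec`, `dq_spec`) are assumptions for illustration only; check the DQ Validator documentation for the authoritative structure.
```python
from lakehouse_engine.engine import execute_dq_validation

# Hypothetical ACON: the top-level keys below are assumptions for illustration.
acon = {
    "input_spec": {
        "spec_id": "orders_silver",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "your_database.order_events_with_dq",
    },
    "dq_spec": {
        "spec_id": "check_orders_silver",
        "input_id": "orders_silver",
        "dq_type": "validator",
        "dq_functions": [
            {
                "dq_function": "expect_column_values_to_not_be_null",
                "args": {"column": "omnihub_locale_code"},
            }
        ],
    },
}

execute_dq_validation(acon)
```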
⭐ **Reconciliation:** useful algorithm to compare two sources of data, by defining one version of the `truth` to compare
against the `current` version of the data. It can be particularly useful during migration phases, to compare a few KPIs
and ensure the new version of a table (`current`), for example, delivers the same vision of the data as the old one (`truth`).
Find usage examples [here](lakehouse_engine_usage/reconciliator/reconciliator.md).
⭐ **Sensors:** an abstraction over otherwise complex Spark code that can be executed in very small single-node clusters
to check if an upstream system or Data Product contains new data since the last execution. With this feature, people can
trigger jobs to run at more frequent intervals and, if the upstream does not contain new data, the rest of the job
exits without creating bigger clusters to execute more intensive data ETL (Extraction, Transformation, and Loading).
Find usage examples [here](lakehouse_engine_usage/sensors/sensors.md).
⭐ **Terminators:** this feature allows people to specify what to do as a last action, before finishing a Data Load.
Some examples of actions are: optimising the target table, vacuuming, computing stats, exposing the change data feed to an external location
or even sending e-mail notifications. Thus, it is specifically used in Data Loads, using the `terminate_specs` keyword.
[On the usage section](#load-data-usage-example) you will find an example using terminators.
⭐ **Table Manager:** the function `manage_table` offers a set of actions to manipulate tables/views in several ways (see the sketch after this list), such as:
- compute table statistics;
- create/drop tables and views;
- delete/truncate/repair tables;
- vacuum delta tables or locations;
- optimize table;
- describe table;
- show table properties;
- execute sql.
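A minimal sketch of invoking the Table Manager is shown below; the ACON keys (e.g. `function`, `table_or_view_name`) are assumptions for illustration, so please confirm them in the Table Manager documentation.
```python
from lakehouse_engine.engine import manage_table

# Hypothetical ACON: key names are assumptions for illustration only.
acon = {
    "function": "describe",
    "table_or_view_name": "your_database.order_events_with_dq",
}

manage_table(acon)
```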
⭐ **File Manager:** the function `manage_files` offers a set of actions to manipulate files in several ways (see the sketch after this list), such as:
- delete Objects in S3;
- copy Objects in S3;
- restore Objects from S3 Glacier;
- check the status of a restore from S3 Glacier;
- request a restore of objects from S3 Glacier and wait for them to be copied to a destination.
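Similarly, a minimal sketch of the File Manager is shown below; the ACON keys (`function`, `bucket`, `object_paths`, `dry_run`) are assumptions for illustration, so please confirm them in the File Manager documentation.
```python
from lakehouse_engine.engine import manage_files

# Hypothetical ACON: key names are assumptions for illustration only.
acon = {
    "function": "delete_objects",
    "bucket": "my-data-product-bucket",
    "object_paths": ["checkpoints/template_order_events_with_dq/"],
    "dry_run": True,  # preview what would be affected before acting
}

manage_files(acon)
```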
⭐ **Notifications:** you can configure and send email notifications.
---
> ***Note:*** it can be used as an independent function (`send_notification`) or as a `terminator_spec`, using the function `notify`.
---
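A minimal sketch of sending a notification as an independent function could look like the block below; the argument structure (server, recipients, etc.) is an assumption for illustration, so please confirm it in the notifications documentation.
```python
from lakehouse_engine.engine import send_notification

# Hypothetical notification arguments: key names are assumptions for illustration only.
args = {
    "type": "email",
    "args": {
        "server": "your.smtp.server",
        "port": "25",
        "from": "lakehouse-engine@example.com",
        "to": ["data-product-team@example.com"],
        "subject": "Data load finished",
        "message": "The order events data load has finished successfully.",
    },
}

send_notification(args)
```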
📖 In case you want to check further details you can check the documentation of the [Lakehouse Engine facade](reference/packages/engine.md).
## Installation
As the Lakehouse Engine is built as a wheel (look into our **build** and **deploy** make targets), you can install it like any other Python package using **pip**.
```
pip install lakehouse-engine
```
Alternatively, you can also upload the wheel to any target of your liking (e.g. S3) and perform a pip installation pointing to that target location.
---
> ***Note:*** The Lakehouse Engine is packaged with plugins or optional dependencies, which are not installed by default. The goal is
> to make its installation lighter and to avoid unnecessary dependencies. You can check all the optional dependencies in
> the [tool.setuptools.dynamic] section of the [pyproject.toml](pyproject.toml) file. They are currently: os, dq, azure, sharepoint and sftp. So,
> in case you want to make usage of the Data Quality features offered in the Lakehouse Engine, instead of running the previous command, you should run
> the command below, which will bring the core functionalities, plus DQ.
> ```
> pip install lakehouse-engine[dq]
> ```
> In case you are in an environment without pre-installed Spark and Delta, you will also want to install the `os` optional dependencies, like so:
> ```
> pip install lakehouse-engine[os]
> ```
> And in case you want to install several optional dependencies, you can run a command like:
> ```
> pip install lakehouse-engine[dq,sftp]
> ```
> It is advisable for a Data Product to pin a specific version of the Lakehouse Engine (and have recurring upgrading activities)
> to avoid breaking changes in a new release.
> In case you don't want to be so conservative, you can pin to a major version, which usually shouldn't include changes that break backwards compatibility.
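> For example, pinning the DQ flavour to a specific release could look like this (placeholder version shown):
> ```
> pip install "lakehouse-engine[dq]==<your_pinned_version>"
> ```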
---
## How do Data Products use the Lakehouse Engine Framework?
<img src="assets/img/lakehouse_dp_usage.drawio.png?raw=true" style="max-width: 800px; height: auto; "/>
The Lakehouse Engine is a configuration-first Data Engineering framework, using the concept of ACONs to configure algorithms.
An ACON stands for Algorithm Configuration and is a JSON representation, as the [Load Data Usage Example](#load-data-usage-example) demonstrates.
Below you find the main keywords you can use to configure an ACON for a Data Load.
---
> ***Note:*** the usage logic for the other [algorithms/features presented](#key-features) will always be similar, but using different keywords,
which you can search for in the examples and documentation provided in the [Key Features](#key-features) and [Community Support and Contributing](#community-support-and-contributing) sections.
---
- **Input specifications (input_specs):** specify how to read data. This is a **mandatory** keyword.
- **Transform specifications (transform_specs):** specify how to transform data.
- **Data quality specifications (dq_specs):** specify how to execute the data quality process.
- **Output specifications (output_specs):** specify how to write data to the target. This is a **mandatory** keyword.
- **Terminate specifications (terminate_specs):** specify what to do after writing into the target (e.g., optimising target table, vacuum, compute stats, expose change data feed to external location, etc).
- **Execution environment (exec_env):** custom Spark session configurations to be provided for your algorithm (configurations can also be provided from your job/cluster configuration, which we highly advise you to do instead of passing performance related configs here for example).
## Load Data Usage Example
You can use the Lakehouse Engine in a **pyspark script** or **notebook**.
Below you can find an example of how to execute a Data Load using the Lakehouse Engine, which does the following:
1. Read CSV files, from a specified location, in a streaming fashion, providing a specific schema and some additional
options to properly read the files (e.g. header, delimiter...);
2. Apply two transformations on the input data:
1. Add a new column having the Row ID;
2. Add a new column `extraction_date`, which extracts the date from the `lhe_extraction_filepath`, based on a regex.
3. Apply Data Quality validations and store the result of their execution in the table `your_database.order_events_dq_checks`:
1. Check that the column `omnihub_locale_code` does not have null values;
2. Check if the distinct value count for the column `product_division` is between 10 and 100;
3. Check if the max of the column `so_net_value` is between 10 and 1000;
4. Check if the length of the values in the column `omnihub_locale_code` is between 1 and 10;
5. Check if the mean of the values for the column `coupon_code` is between 15 and 20.
4. Write the output into the table `your_database.order_events_with_dq` in a delta format, partitioned by `order_date_header`
and applying a merge predicate condition, ensuring the data is only inserted into the table if it does not match the predicate
(meaning the data is not yet available in the table). Moreover, the `insert_only` flag is used to specify that there should not
be any updates or deletes in the target table, only inserts;
5. Optimize the Delta Table that we just wrote in (e.g. z-ordering);
6. Specify 3 custom Spark Session configurations.
---
> ⚠️ ***Note:*** `spec_id` is one of the main concepts to ensure you can chain the steps of the algorithm,
so, for example, you can specify the transformations (in `transform_specs`) of a DataFrame that was read in the `input_specs`.
---
```python
from lakehouse_engine.engine import load_data
acon = {
"input_specs": [
{
"spec_id": "orders_bronze",
"read_type": "streaming",
"data_format": "csv",
"schema_path": "s3://my-data-product-bucket/artefacts/metadata/bronze/schemas/orders.json",
"with_filepath": True,
"options": {
"badRecordsPath": "s3://my-data-product-bucket/badrecords/order_events_with_dq/",
"header": False,
"delimiter": "\u005E",
"dateFormat": "yyyyMMdd",
},
"location": "s3://my-data-product-bucket/bronze/orders/",
}
],
"transform_specs": [
{
"spec_id": "orders_bronze_with_extraction_date",
"input_id": "orders_bronze",
"transformers": [
{"function": "with_row_id"},
{
"function": "with_regex_value",
"args": {
"input_col": "lhe_extraction_filepath",
"output_col": "extraction_date",
"drop_input_col": True,
"regex": ".*WE_SO_SCL_(\\d+).csv",
},
},
],
}
],
"dq_specs": [
{
"spec_id": "check_orders_bronze_with_extraction_date",
"input_id": "orders_bronze_with_extraction_date",
"dq_type": "validator",
"result_sink_db_table": "your_database.order_events_dq_checks",
"fail_on_error": False,
"dq_functions": [
{
"dq_function": "expect_column_values_to_not_be_null",
"args": {
"column": "omnihub_locale_code"
}
},
{
"dq_function": "expect_column_unique_value_count_to_be_between",
"args": {
"column": "product_division",
"min_value": 10,
"max_value": 100
},
},
{
"dq_function": "expect_column_max_to_be_between",
"args": {
"column": "so_net_value",
"min_value": 10,
"max_value": 1000
}
},
{
"dq_function": "expect_column_value_lengths_to_be_between",
"args": {
"column": "omnihub_locale_code",
"min_value": 1,
"max_value": 10
},
},
{
"dq_function": "expect_column_mean_to_be_between",
"args": {
"column": "coupon_code",
"min_value": 15,
"max_value": 20
}
},
],
},
],
"output_specs": [
{
"spec_id": "orders_silver",
"input_id": "check_orders_bronze_with_extraction_date",
"data_format": "delta",
"write_type": "merge",
"partitions": ["order_date_header"],
"merge_opts": {
"merge_predicate": """
new.sales_order_header = current.sales_order_header
AND new.sales_order_schedule = current.sales_order_schedule
AND new.sales_order_item=current.sales_order_item
AND new.epoch_status=current.epoch_status
AND new.changed_on=current.changed_on
AND new.extraction_date=current.extraction_date
AND new.lhe_batch_id=current.lhe_batch_id
AND new.lhe_row_id=current.lhe_row_id
""",
"insert_only": True,
},
"db_table": "your_database.order_events_with_dq",
"options": {
"checkpointLocation": "s3://my-data-product-bucket/checkpoints/template_order_events_with_dq/"
},
}
],
"terminate_specs": [
{
"function": "optimize_dataset",
"args": {
"db_table": "your_database.order_events_with_dq"
}
}
],
"exec_env": {
"spark.databricks.delta.schema.autoMerge.enabled": True,
"spark.databricks.delta.optimizeWrite.enabled": True,
"spark.databricks.delta.autoCompact.enabled": True,
},
}
load_data(acon=acon)
```
---
> ***Note:*** Although it is possible to interact with the Lakehouse Engine functions directly from your Python code,
instead of creating an ACON dict and using the engine API, we do not ensure stability across new
Lakehouse Engine releases when calling internal functions (not exposed in the facade) directly.
---
---
> ***Note:*** ACON structure might change across releases, please test your Data Product first before updating to a
new version of the Lakehouse Engine in your Production environment.
---
## Overwriting default configurations
We use a YAML file to specify various configurations needed for different functionalities. You can overwrite these
configurations using a dictionary with new settings or by providing a path to a YAML file.
This functionality can be particularly useful for the open-source community as it unlocks
the usage of several functionalities like Prisma and engine usage logs.
Check the default configurations:
```
from lakehouse_engine.core import exec_env
print(exec_env.ExecEnv.ENGINE_CONFIG.dq_dev_bucket)
> default-bucket
```
Change the `dq_dev_bucket` configuration:
```
exec_env.ExecEnv.set_default_engine_config(custom_configs_dict={"dq_dev_bucket": "your-dq-bucket"})
print(exec_env.ExecEnv.ENGINE_CONFIG.dq_dev_bucket)
> your-dq-bucket
```
Reset to the default configurations:
```
exec_env.ExecEnv.set_default_engine_config()
print(exec_env.ExecEnv.ENGINE_CONFIG.dq_dev_bucket)
> default-bucket
```
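If you prefer to keep the overrides in a file, the same mechanism can point to a YAML file instead of a dictionary. The parameter name used below (`custom_configs_file_path`) is an assumption for illustration; check the `ExecEnv.set_default_engine_config` documentation for the exact signature.
```
# Hypothetical parameter name, shown for illustration only.
exec_env.ExecEnv.set_default_engine_config(custom_configs_file_path="s3://my-data-product-bucket/configs/engine_overrides.yaml")
```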
---
## Who maintains the Lakehouse Engine?
The Lakehouse Engine is under active development and production usage by the Adidas Lakehouse Foundations Engineering team.
## Community Support and Contributing
🤝 Do you want to contribute or need any support? Check out all the details in [CONTRIBUTING.md](https://github.com/adidas/lakehouse-engine/blob/master/CONTRIBUTING.md).
## License and Software Information
© adidas AG
adidas AG publishes this software and accompanied documentation (if any) subject to the terms of the [license](https://github.com/adidas/lakehouse-engine/blob/master/LICENSE.txt)
with the aim of helping the community with our tools and libraries which we think can be also useful for other people.
You will find a copy of the [license](https://github.com/adidas/lakehouse-engine/blob/master/LICENSE.txt) in the root folder of this package. All rights not explicitly granted
to you under the [license](https://github.com/adidas/lakehouse-engine/blob/master/LICENSE.txt) remain the sole and exclusive property of adidas AG.
---
> ***NOTICE:*** The software has been designed solely for the purposes described in this ReadMe file. The software is NOT designed,
tested or verified for productive use whatsoever, nor or for any use related to high risk environments, such as health care,
highly or fully autonomous driving, power plants, or other critical infrastructures or services.
---
If you want to contact adidas regarding the software, you can mail us at software.engineering@adidas.com.
For further information open the [adidas terms and conditions](https://github.com/adidas/adidas-contribution-guidelines/wiki/Terms-and-conditions) page.
================================================
FILE: assets/gab/metadata/gab/f_agg_dummy_sales_kpi/1_article_category.sql
================================================
SELECT
"category_a" AS category_name
,"article1" AS article_id
UNION
SELECT
"category_a" AS category_name
,"article2" AS article_id
UNION
SELECT
"category_a" AS category_name
,"article3" AS article_id
UNION
SELECT
"category_a" AS category_name
,"article4" AS article_id
UNION
SELECT
"category_b" AS category_name
,"article5" AS article_id
UNION
SELECT
"category_b" AS category_name
,"article6" AS article_id
UNION
SELECT
"category_b" AS category_name
,"article7" AS article_id
================================================
FILE: assets/gab/metadata/gab/f_agg_dummy_sales_kpi/2_f_agg_dummy_sales_kpi.sql
================================================
SELECT
{% if replace_offset_value == 0 %} {{ project_date_column }}
{% else %} ({{ project_date_column }} + interval '{{offset_value}}' hour)
{% endif %} AS order_date,
{{ to_date }} AS to_date,
b.category_name,
COUNT(a.article_id) qty_articles,
SUM(amount) total_amount
FROM
`{{ database }}`.`dummy_sales_kpi` a {{ joins }}
LEFT JOIN article_categories b
ON a.article_id = b.article_id
WHERE
TO_DATE({{ filter_date_column }}, 'yyyyMMdd') >= (
'{{start_date}}' + interval '{{offset_value}}' hour
)
AND TO_DATE({{ filter_date_column }}, 'yyyyMMdd') < (
'{{ end_date}}' + interval '{{offset_value}}' hour
)
GROUP BY
1,2,3
================================================
FILE: assets/gab/metadata/tables/dim_calendar.sql
================================================
DROP TABLE IF EXISTS `database`.dim_calendar;
CREATE EXTERNAL TABLE `database`.dim_calendar (
calendar_date DATE COMMENT 'Full calendar date in the format yyyyMMdd.',
day_en STRING COMMENT 'Name of the day of the week.',
weeknum_mon INT COMMENT 'Week number where the week starts on Monday.',
weekstart_mon DATE COMMENT 'First day of the week where the week starts on Monday.',
weekend_mon DATE COMMENT 'Last day of the week where the week starts on Monday.',
weekstart_sun DATE COMMENT 'First day of the week where the week starts on Sunday.',
weekend_sun DATE COMMENT 'Last day of the week where the week starts on Sunday.',
month_start DATE COMMENT 'First day of the Month.',
month_end DATE COMMENT 'Last day of the Month.',
quarter_start DATE COMMENT 'First day of the Quarter.',
quarter_end DATE COMMENT 'Last day of the Quarter.',
year_start DATE COMMENT 'First day of the Year.',
year_end DATE COMMENT 'Last day of the Year.'
)
USING DELTA
LOCATION 's3://my-data-product-bucket/dim_calendar'
COMMENT 'This table stores the calendar information.'
TBLPROPERTIES(
'lakehouse.primary_key'='calendar_date',
'delta.enableChangeDataFeed'='false'
)
================================================
FILE: assets/gab/metadata/tables/dummy_sales_kpi.sql
================================================
DROP TABLE IF EXISTS `database`.`dummy_sales_kpi`;
CREATE EXTERNAL TABLE `database`.`dummy_sales_kpi` (
`order_date` DATE COMMENT 'date of the orders',
`article_id` STRING COMMENT 'article id',
`amount` INT COMMENT 'quantity/amount sold on this date'
)
USING DELTA
PARTITIONED BY (order_date)
LOCATION 's3://my-data-product-bucket/dummy_sales_kpi'
COMMENT 'Dummy sales KPI (articles sold per date).'
TBLPROPERTIES(
'lakehouse.primary_key'='article_id, order_date',
'delta.enableChangeDataFeed'='true'
)
================================================
FILE: assets/gab/metadata/tables/gab_log_events.sql
================================================
DROP TABLE IF EXISTS `database`.`gab_log_events`;
CREATE EXTERNAL TABLE `database`.`gab_log_events`
(
`run_start_time` TIMESTAMP COMMENT 'Run start time for the use case',
`run_end_time` TIMESTAMP COMMENT 'Run end time for the use case',
`input_start_date` TIMESTAMP COMMENT 'The start time set for the use case process',
`input_end_date` TIMESTAMP COMMENT 'The end time set for the use case process',
`query_id` STRING COMMENT 'Query ID for the use case',
`query_label` STRING COMMENT 'Query label for the use case',
`cadence` STRING COMMENT 'This field stores the cadence of data granularity (Day/Week/Month/Quarter/Year)',
`stage_name` STRING COMMENT 'Intermediate stage',
`stage_query` STRING COMMENT 'Query run as part of stage',
`status` STRING COMMENT 'Status of the stage',
`error_code` STRING COMMENT 'Error code'
)
USING DELTA
PARTITIONED BY (query_id)
LOCATION 's3://my-data-product-bucket/gab_log_events'
COMMENT 'This table stores the log for all use cases in gab'
TBLPROPERTIES(
'lakehouse.primary_key'='run_start_time,query_id,stage_name',
'delta.enableChangeDataFeed'='false'
)
================================================
FILE: assets/gab/metadata/tables/gab_use_case_results.sql
================================================
DROP TABLE IF EXISTS `database`.`gab_use_case_results`;
CREATE EXTERNAL TABLE `database`.`gab_use_case_results`
(
`query_id` STRING COMMENT 'Query ID for the use case',
`cadence` STRING COMMENT 'Cadence of data granularity (Day/Week/Month/Quarter/Year)',
`from_date` DATE COMMENT 'Aggregate based on the date column',
`to_date` DATE COMMENT 'Snapshot end date',
`d1` STRING COMMENT 'Dimension 1',
`d2` STRING COMMENT 'Dimension 2',
`d3` STRING COMMENT 'Dimension 3',
`d4` STRING COMMENT 'Dimension 4',
`d5` STRING COMMENT 'Dimension 5',
`d6` STRING COMMENT 'Dimension 6',
`d7` STRING COMMENT 'Dimension 7',
`d8` STRING COMMENT 'Dimension 8',
`d9` STRING COMMENT 'Dimension 9',
`d10` STRING COMMENT 'Dimension 10',
`d11` STRING COMMENT 'Dimension 11',
`d12` STRING COMMENT 'Dimension 12',
`d13` STRING COMMENT 'Dimension 13',
`d14` STRING COMMENT 'Dimension 14',
`d15` STRING COMMENT 'Dimension 15',
`d16` STRING COMMENT 'Dimension 16',
`d17` STRING COMMENT 'Dimension 17',
`d18` STRING COMMENT 'Dimension 18',
`d19` STRING COMMENT 'Dimension 19',
`d20` STRING COMMENT 'Dimension 20',
`d21` STRING COMMENT 'Dimension 21',
`d22` STRING COMMENT 'Dimension 22',
`d23` STRING COMMENT 'Dimension 23',
`d24` STRING COMMENT 'Dimension 24',
`d25` STRING COMMENT 'Dimension 25',
`d26` STRING COMMENT 'Dimension 26',
`d27` STRING COMMENT 'Dimension 27',
`d28` STRING COMMENT 'Dimension 28',
`d29` STRING COMMENT 'Dimension 29',
`d30` STRING COMMENT 'Dimension 30',
`d31` STRING COMMENT 'Dimension 31',
`d32` STRING COMMENT 'Dimension 32',
`d33` STRING COMMENT 'Dimension 33',
`d34` STRING COMMENT 'Dimension 34',
`d35` STRING COMMENT 'Dimension 35',
`d36` STRING COMMENT 'Dimension 36',
`d37` STRING COMMENT 'Dimension 37',
`d38` STRING COMMENT 'Dimension 38',
`d39` STRING COMMENT 'Dimension 39',
`d40` STRING COMMENT 'Dimension 40',
`m1` DOUBLE COMMENT 'Metric 1',
`m2` DOUBLE COMMENT 'Metric 2',
`m3` DOUBLE COMMENT 'Metric 3',
`m4` DOUBLE COMMENT 'Metric 4',
`m5` DOUBLE COMMENT 'Metric 5',
`m6` DOUBLE COMMENT 'Metric 6',
`m7` DOUBLE COMMENT 'Metric 7',
`m8` DOUBLE COMMENT 'Metric 8',
`m9` DOUBLE COMMENT 'Metric 9',
`m10` DOUBLE COMMENT 'Metric 10',
`m11` DOUBLE COMMENT 'Metric 11',
`m12` DOUBLE COMMENT 'Metric 12',
`m13` DOUBLE COMMENT 'Metric 13',
`m14` DOUBLE COMMENT 'Metric 14',
`m15` DOUBLE COMMENT 'Metric 15',
`m16` DOUBLE COMMENT 'Metric 16',
`m17` DOUBLE COMMENT 'Metric 17',
`m18` DOUBLE COMMENT 'Metric 18',
`m19` DOUBLE COMMENT 'Metric 19',
`m20` DOUBLE COMMENT 'Metric 20',
`m21` DOUBLE COMMENT 'Metric 21',
`m22` DOUBLE COMMENT 'Metric 22',
`m23` DOUBLE COMMENT 'Metric 23',
`m24` DOUBLE COMMENT 'Metric 24',
`m25` DOUBLE COMMENT 'Metric 25',
`m26` DOUBLE COMMENT 'Metric 26',
`m27` DOUBLE COMMENT 'Metric 27',
`m28` DOUBLE COMMENT 'Metric 28',
`m29` DOUBLE COMMENT 'Metric 29',
`m30` DOUBLE COMMENT 'Metric 30',
`m31` DOUBLE COMMENT 'Metric 31',
`m32` DOUBLE COMMENT 'Metric 32',
`m33` DOUBLE COMMENT 'Metric 33',
`m34` DOUBLE COMMENT 'Metric 34',
`m35` DOUBLE COMMENT 'Metric 35',
`m36` DOUBLE COMMENT 'Metric 36',
`m37` DOUBLE COMMENT 'Metric 37',
`m38` DOUBLE COMMENT 'Metric 38',
`m39` DOUBLE COMMENT 'Metric 39',
`m40` DOUBLE COMMENT 'Metric 40',
`lh_created_on` TIMESTAMP COMMENT 'This field stores the created_on in lakehouse'
)
USING DELTA
PARTITIONED BY (query_id)
LOCATION 's3://my-data-product-bucket/gab_use_case_results'
COMMENT 'This table is the common table for all use cases and stores all the dimensions and metrics'
TBLPROPERTIES(
'lakehouse.primary_key'='query_id,cadence,to_date,from_date',
'delta.enableChangeDataFeed'='false'
)
================================================
FILE: assets/gab/metadata/tables/lkp_query_builder.sql
================================================
DROP TABLE IF EXISTS `database`.`lkp_query_builder`;
CREATE EXTERNAL TABLE `database`.`lkp_query_builder`
(
`query_id` INT COMMENT 'Query ID for the use case which is a sequence of numbers',
`query_label` STRING COMMENT 'Summarized description of the use case',
`query_type` STRING COMMENT 'Type of use case based on region',
`mappings` STRING COMMENT 'Dictionary of mappings for dimensions and metrics',
`intermediate_stages` STRING COMMENT 'All the stages and their configs such as storageLevel repartitioning date columns',
`recon_window` STRING COMMENT 'Configurations for Cadence and Reconciliation Windows',
`timezone_offset` INT COMMENT 'Timezone offsets can be configured by a positive or negative integer',
`start_of_the_week` STRING COMMENT 'Sunday or Monday can be configured as the start of the week',
`is_active` STRING COMMENT 'Active Flag - Can be set to Y or N',
`queue` STRING COMMENT 'Can be set to High/Medium/Low based on the cluster computation requirement',
`lh_created_on` TIMESTAMP COMMENT 'This field stores the created_on in lakehouse'
)
USING DELTA
LOCATION 's3://my-data-product-bucket/lkp_query_builder'
COMMENT 'This table stores the configuration for the gab framework'
TBLPROPERTIES(
'lakehouse.primary_key'='query_id',
'delta.enableChangeDataFeed'='false'
)
================================================
FILE: assets/gab/notebooks/gab.py
================================================
# Databricks notebook source
from datetime import datetime, timedelta
from lakehouse_engine.engine import execute_gab
from pyspark.sql.functions import collect_list, collect_set, lit
# COMMAND ----------
dbutils.widgets.text("lookup_table", "lkp_query_builder")
lookup_table = dbutils.widgets.get("lookup_table")
dbutils.widgets.text("source_database", "source_database")
source_database = dbutils.widgets.get("source_database")
dbutils.widgets.text("target_database", "target_database")
target_database = dbutils.widgets.get("target_database")
# COMMAND ----------
def flatten_extend(list_to_flatten: list) -> list:
"""Flatten python list.
Args:
list_to_flatten: list to be flattened.
Returns:
        A list containing the flattened values.
"""
flat_list = []
for row in list_to_flatten:
flat_list.extend(row)
return flat_list
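# Example (illustrative): flatten_extend([[1, 2], [3]]) returns [1, 2, 3]. Here it flattens the single
# collected row into [group_key, <list of query labels>, <list of distinct queues>].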
lkp_query_builder_df = spark.read.table(
"{}.{}".format(target_database, lookup_table)
)
query_label_and_queue = (
lkp_query_builder_df.groupBy(lit(1)).agg(collect_list("query_label"), collect_set("queue")).collect()
)
query_list = flatten_extend(query_label_and_queue)[1]
queue_list = flatten_extend(query_label_and_queue)[2]
# COMMAND ----------
dbutils.widgets.text("start_date", "", label="Start Date")
dbutils.widgets.text("end_date", "", label="End Date")
dbutils.widgets.text("rerun_flag", "N", label="Re-Run Flag")
dbutils.widgets.text("look_back", "1", label="Look Back Window")
dbutils.widgets.multiselect(
"cadence_filter",
"All",
["All", "DAY", "WEEK", "MONTH", "QUARTER", "YEAR"],
label="Cadence",
)
dbutils.widgets.multiselect("query_label_filter", "All", query_list + ["All"], label="Use Case")
dbutils.widgets.multiselect("queue_filter", "All", queue_list + ["All"], label="Query Categorization")
dbutils.widgets.text("gab_base_path", "", label="Base Path Use Cases")
dbutils.widgets.text("target_table", "", label="Target Table")
# Input Parameters
lookback_days = "1" if dbutils.widgets.get("look_back") == "" else dbutils.widgets.get("look_back")
# COMMAND ----------
end_date_str = (
datetime.today().strftime("%Y-%m-%d") if dbutils.widgets.get("end_date") == "" else dbutils.widgets.get("end_date")
)
end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
# As part of the daily run, when no custom end_date is given, the program always runs
# for yesterday's date.
if dbutils.widgets.get("end_date") == "":
end_date = end_date - timedelta(days=1)
start_date_str = (
datetime.date(end_date - timedelta(days=int(lookback_days))).strftime("%Y-%m-%d")
if dbutils.widgets.get("start_date") == ""
else dbutils.widgets.get("start_date")
)
start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
rerun_flag = dbutils.widgets.get("rerun_flag")
query_label_filter = dbutils.widgets.get("query_label_filter")
recon_filter = dbutils.widgets.get("cadence_filter")
queue_filter = dbutils.widgets.get("queue_filter")
gab_base_path = dbutils.widgets.get("gab_base_path")
# COMMAND ----------
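# Normalize the widget selections: deduplicate the comma-separated values and expand "All"
# into the full list of use case labels / queues read from the lookup table.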
query_label_filter = [x.strip() for x in list(set(query_label_filter.split(",")))]
queue_filter = list(set(queue_filter.split(",")))
recon_filter = list(set(recon_filter.split(",")))
if "All" in query_label_filter:
query_label_filter = query_list
if "All" in queue_filter:
queue_filter = queue_list
# COMMAND ----------
target_table = (
"gab_use_case_results" if dbutils.widgets.get("target_table") == "" else dbutils.widgets.get("target_table")
)
# COMMAND ----------
print(f"Query Label: {query_label_filter}")
print(f"Queue Filter: {queue_filter}")
print(f"Cadence Filter: {recon_filter}")
print(f"Target Database: {target_database}")
print(f"Start Date: {start_date}")
print(f"End Date: {end_date}")
print(f"Look Back Days: {lookback_days}")
print(f"Re-run Flag: {rerun_flag}")
print(f"Target Table: {target_table}")
print(f"Source Database: {source_database}")
print(f"Path Use Cases: {gab_base_path}")
# COMMAND ----------
gab_acon = {
"query_label_filter": query_label_filter,
"queue_filter": queue_filter,
"cadence_filter": recon_filter,
"target_database": target_database,
"start_date": start_date,
"end_date": end_date,
"rerun_flag": rerun_flag,
"target_table": target_table,
"source_database": source_database,
"gab_base_path": gab_base_path,
"lookup_table": lookup_table,
"calendar_table": "dim_calendar",
}
# COMMAND ----------
execute_gab(acon=gab_acon)
================================================
FILE: assets/gab/notebooks/gab_dim_calendar.py
================================================
# Databricks notebook source
# MAGIC %md
# MAGIC # This notebook holds the calendar used as part of the GAB framework.
# COMMAND ----------
# Import the required libraries
from datetime import datetime, timedelta
from pyspark.sql.functions import to_date
from pyspark.sql.types import StringType
# COMMAND ----------
DIM_CALENDAR_LOCATION = "s3://my-data-product-bucket/dim_calendar"
# COMMAND ----------
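# Build a list of 200,000 consecutive dates starting at 1990-01-01 (roughly 547 years of calendar dates).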
initial_date = datetime.strptime("1990-01-01", "%Y-%m-%d")
dates_list = [datetime.strftime(initial_date, "%Y-%m-%d")]
for _ in range(1, 200000):
initial_date = initial_date + timedelta(days=1)
next_date = datetime.strftime(initial_date, "%Y-%m-%d")
dates_list.append(next_date)
# COMMAND ----------
df_date_completed = spark.createDataFrame(dates_list, StringType())
df_date_completed = df_date_completed.withColumn("calendar_date", to_date(df_date_completed.value, "yyyy-MM-dd")).drop(
df_date_completed.value
)
df_date_completed.createOrReplaceTempView("dates_completed")
# COMMAND ----------
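# For each calendar date, derive the Monday-based and Sunday-based week boundaries plus the
# month, quarter and year start/end dates used by the GAB cadences.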
df_cal = spark.sql(
"""
WITH monday_calendar AS (
SELECT
calendar_date,
WEEKOFYEAR(calendar_date) AS weeknum_mon,
DATE_FORMAT(calendar_date, 'E') AS day_en,
MIN(calendar_date) OVER (PARTITION BY CONCAT(DATE_PART('YEAROFWEEK', calendar_date),
WEEKOFYEAR(calendar_date)) ORDER BY calendar_date) AS weekstart_mon
FROM dates_completed
ORDER BY
calendar_date
),
monday_calendar_plus_week_num_sunday AS (
SELECT
monday_calendar.*,
LEAD(weeknum_mon) OVER(ORDER BY calendar_date) AS weeknum_sun
FROM monday_calendar
),
calendar_complementary_values AS (
SELECT
calendar_date,
weeknum_mon,
day_en,
weekstart_mon,
weekstart_mon+6 AS weekend_mon,
LEAD(weekstart_mon-1) OVER(ORDER BY calendar_date) AS weekstart_sun,
DATE(DATE_TRUNC('MONTH', calendar_date)) AS month_start,
DATE(DATE_TRUNC('QUARTER', calendar_date)) AS quarter_start,
DATE(DATE_TRUNC('YEAR', calendar_date)) AS year_start
FROM monday_calendar_plus_week_num_sunday
)
SELECT
calendar_date,
day_en,
weeknum_mon,
weekstart_mon,
weekend_mon,
weekstart_sun,
weekstart_sun+6 AS weekend_sun,
month_start,
add_months(month_start, 1)-1 AS month_end,
quarter_start,
ADD_MONTHS(quarter_start, 3)-1 AS quarter_end,
year_start,
ADD_MONTHS(year_start, 12)-1 AS year_end
FROM calendar_complementary_values
"""
)
df_cal.createOrReplaceTempView("df_cal")
# COMMAND ----------
df_cal.write.format("delta").mode("overwrite").save(DIM_CALENDAR_LOCATION)
================================================
FILE: assets/gab/notebooks/gab_job_manager.py
================================================
# Databricks notebook source
import os
NOTEBOOK_CONTEXT = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
# Import the required libraries
import datetime
import json
import time
import uuid
import ast
from pyspark.sql.functions import col, lit, upper
# COMMAND ----------
# MAGIC %run ../utils/databricks_job_utils
# COMMAND ----------
AUTH_TOKEN = NOTEBOOK_CONTEXT.apiToken().getOrElse(None)
HOST_NAME = spark.conf.get("spark.databricks.workspaceUrl")
DATABRICKS_JOB_UTILS = DatabricksJobs(databricks_instance=HOST_NAME, auth=AUTH_TOKEN)
# COMMAND ----------
dbutils.widgets.text("gab_job_schedule", "{'hour': {07: 'GLOBAL'}}")
gab_job_schedule = ast.literal_eval(dbutils.widgets.get("gab_job_schedule"))
dbutils.widgets.text("source_database", "")
source_database = dbutils.widgets.get("source_database")
dbutils.widgets.text("target_database", "")
target_database = dbutils.widgets.get("target_database")
dbutils.widgets.text("gab_base_path", "")
gab_base_path = dbutils.widgets.get("gab_base_path")
dbutils.widgets.text("gab_max_jobs_limit_high_job", "")
gab_max_jobs_limit_high_job = dbutils.widgets.get("gab_max_jobs_limit_high_job")
dbutils.widgets.text("gab_max_jobs_limit_medium_job", "")
gab_max_jobs_limit_medium_job = dbutils.widgets.get("gab_max_jobs_limit_medium_job")
dbutils.widgets.text("gab_max_jobs_limit_low_job", "")
gab_max_jobs_limit_low_job = dbutils.widgets.get("gab_max_jobs_limit_low_job")
# COMMAND ----------
# functions
def divide_chunks(input_list: list, max_number_of_jobs: int) -> list:
"""Split list into predefined chunks, accordingly to the number of jobs.
This function reads the maximum job limit defined by the parameter for each queue type in order to determine
the number of parallel runs for each queue and divides the use cases into chunks for each run.
For example, if the maximum job limit is set to 30 for the high queue and there are 60 use cases for the
high queue, then each run will handle 2 use cases.
Args:
input_list: Input list to be split.
max_number_of_jobs: Max job number.
Returns:
Split chunk list.
"""
avg_chunk_size = len(input_list) // max_number_of_jobs
remainder = len(input_list) % max_number_of_jobs
chunks = [
input_list[i * avg_chunk_size + min(i, remainder) : (i + 1) * avg_chunk_size + min(i + 1, remainder)]
for i in range(max_number_of_jobs)
]
chunks = list(filter(None, chunks))
return chunks
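# Example (illustrative): divide_chunks(["uc1", "uc2", "uc3", "uc4", "uc5"], 2) returns
# [["uc1", "uc2", "uc3"], ["uc4", "uc5"]]: chunk sizes differ by at most one because the
# remainder is spread over the first chunks.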
def get_run_regions(job_schedule: dict, job_info: dict) -> list:
"""Get run regions accordingly to job_manager trigger time.
Args:
job_schedule: Markets schedule list from the parameter `gab_job_schedule`.
job_info: Job manager info to match.
Returns:
Markets run list.
"""
q_type_match = ""
for keys in job_schedule["hour"].keys():
if keys == int(datetime.datetime.fromtimestamp(job_info["start_time"] / 1000).strftime("%H")):
q_type_match = job_schedule["hour"][keys]
    if not q_type_match:
        raise Exception("None of the query types are configured to be run at this time")
    print("Matched regions are: ", q_type_match)
    return list(q_type_match.split(","))
# COMMAND ----------
context_json = json.loads(NOTEBOOK_CONTEXT.safeToJson())
run_id = ""
if context_json.get("attributes") and context_json["attributes"].get("rootRunId"):
run_id = context_json["attributes"]["rootRunId"]
print(f"Job Run Id: {run_id}")
job_status = DATABRICKS_JOB_UTILS.get_job(run_id)
print("Job Status: ", job_status)
# COMMAND ----------
list_q_type_match = get_run_regions(gab_job_schedule, job_status)
job_queues = {
"High": {"queue": "gab_high_queue", "max_jobs": gab_max_jobs_limit_high_job},
"Medium": {
"queue": "gab_medium_queue",
"max_jobs": gab_max_jobs_limit_medium_job,
},
"Low": {"queue": "gab_low_queue", "max_jobs": gab_max_jobs_limit_low_job},
}
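# For each queue (High/Medium/Low), collect the use cases of the matched regions, split them into
# chunks according to the queue's max job limit and trigger one run of the corresponding gab job per chunk.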
df = spark.read.table(f"{target_database}.lkp_query_builder")
for queue_type, queue_config in job_queues.items():
lst = (
df.filter(upper(col("queue")) == lit(queue_type.upper()))
.filter(col("query_type").isin(list_q_type_match))
.select(col("query_label"))
.collect()
)
    query_list = [row[0] for row in lst]
chunk = divide_chunks(query_list, int(queue_config["max_jobs"]))
chunk = [i for i in chunk if i]
if chunk:
for i in range(0, len(chunk)):
chunk_split = ",".join(chunk[i])
print(chunk_split)
time.sleep(2)
idempotency_token = uuid.uuid4()
print(idempotency_token)
result = DATABRICKS_JOB_UTILS.run_now(
DATABRICKS_JOB_UTILS.job_id_extraction(queue_config["queue"]),
{
"query_label_filter": chunk_split,
"start_date": "",
"look_back": "",
"end_date": "",
"cadence_filter": "All",
"queue_filter": queue_type,
"rerun_flag": "N",
"target_database": target_database,
"source_database": source_database,
"gab_base_path": gab_base_path,
},
idempotency_token=idempotency_token,
)
print(f"{result}\n")
================================================
FILE: assets/gab/notebooks/query_builder_helper.py
================================================
# Databricks notebook source
# MAGIC %md
# MAGIC # Import Utils
# COMMAND ----------
# MAGIC %run ../utils/query_builder_utils
# COMMAND ----------
QUERY_BUILDER_UTILS = QueryBuilderUtils()
# COMMAND ----------
# MAGIC %md
# MAGIC <h1>Use Case Setup
# COMMAND ----------
# MAGIC %md
# MAGIC
# MAGIC The Global Asset Builder (GAB) has been developed to help you automate the creation of aggregate tables for
# MAGIC dashboards on top of base fact tables. It reduces the effort and time to production for new aggregate tables.
# MAGIC Users don't need to create a separate pipeline for each such case.
# MAGIC
# MAGIC This notebook has been developed to help users create their use case configurations easily.
# MAGIC
# MAGIC There is some mandatory information that must be completed for the use case to work correctly:
# MAGIC
# MAGIC **Use case name:** This parameter must not contain spaces or special characters.
# MAGIC The suggestion is to use lowercase and underlined alphanumeric characters.
# MAGIC
# MAGIC **Market:** Related to the job schedule, for example GLOBAL starting at 07AM UTC.
# MAGIC It gets the complete coverage of the last day for the market.
# MAGIC - GLOBAL - 07AM UTC
# MAGIC
# MAGIC **Reference date:** Reference date of the use case. The parameter should be the column name.
# MAGIC The selected column should have the date/datetime format.
# MAGIC
# MAGIC **To date:** This parameter is used in the template; by default its value must be "to_date".
# MAGIC You can change it if you have managed this in your SQL files.
# MAGIC The values stored in this column depend on the use case behavior:
# MAGIC - if snapshots are enabled, it will contain the snapshot end day.
# MAGIC - If snapshot is not enabled, it will contain the last day of the cadence.
# MAGIC The snapshot behaviour is set in the reconciliation steps.
# MAGIC
# MAGIC **How many dimensions?** An integer input of the number of dimensions (columns) expected in the use case.
# MAGIC Do not consider the reference date or metrics here, as they have their own parameters.
# MAGIC
# MAGIC **Time Offset:** The time zone offset that you want to apply to the reference date column.
# MAGIC It should be a number to decrement or add to the date (e.g., -8 or 8). The default value is zero,
# MAGIC which means that no time zone transformation will be applied to the date.
# MAGIC
# MAGIC **Week start:** The start of the business week of the use case. Two options are available: SUNDAY or MONDAY.
# MAGIC
# MAGIC **Is Active:** Flag to make the use case active or not. Default value is "Y".
# MAGIC
# MAGIC **How many views?** Defines how many consumption views you want to have for the use case.
# MAGIC You can have as many as you want. However, they will have exactly the same structure
# MAGIC (metrics, columns, timelines, etc.); the only change will be the filter applied to them.
# MAGIC The default value is 1.
# MAGIC
# MAGIC **Complexity:** Defines the complexity of your use case. You should mainly consider the volume of data.
# MAGIC This parameter directly affects the number of workers that will be spun up to execute the use case.
# MAGIC - High
# MAGIC - Medium
# MAGIC - Low
# MAGIC
# MAGIC **SQL File Names:** Name of the SQL files used in the use case.
# MAGIC You can combine different layers of dependencies between them as shown in the example,
# MAGIC where the "2_combined.sql" file depends on "1_product_category.sql" file.
# MAGIC The file name should follow the pattern x_file_name (where x is an integer digit) and be separated by a comma
# MAGIC (e.g.: 1_first_query.sql, 2_second_query.sql).
# MAGIC
# MAGIC **DEV - Database Schema Name:** Refers to the name of the development environment database where the
# MAGIC "lkp_query_builder" table resides. This parameter is used at the end of the notebook to insert data into
# MAGIC the "lkp_query_builder" table.
# COMMAND ----------
dbutils.widgets.removeAll()
dbutils.widgets.text(name="usecase_name", defaultValue="", label="Use Case Name")
dbutils.widgets.dropdown(
name="market", defaultValue="GLOBAL", label="Market", choices=["APAC", "GLOBAL", "NAM", "NIGHTLY"]
)
dbutils.widgets.text(name="from_date", defaultValue="", label="Reference Date")
dbutils.widgets.text(name="to_date", defaultValue="to_date", label="Snapshot End Date")
dbutils.widgets.text(name="num_dimensions", defaultValue="", label="How many dimensions?")
dbutils.widgets.text(name="time_offset", defaultValue="0", label="Time Offset")
dbutils.widgets.dropdown(name="week_start", defaultValue="MONDAY", label="Week start", choices=["SUNDAY", "MONDAY"])
dbutils.widgets.dropdown(name="is_active", defaultValue="Y", label="Is Active", choices=["Y", "N"])
dbutils.widgets.text(name="num_of_views", defaultValue="1", label="How many views?")
dbutils.widgets.dropdown(
name="complexity", defaultValue="Medium", label="Complexity", choices=["Low", "Medium", "High"]
)
dbutils.widgets.text(name="sql_files", defaultValue="", label="SQL File Names")
dbutils.widgets.text(name="db_schema", defaultValue="", label="DEV - Database Schema Name")
# COMMAND ----------
# MAGIC %md
# MAGIC Set configurations and validate.
# COMMAND ----------
usecase_name = dbutils.widgets.get("usecase_name").lower().strip()
market = dbutils.widgets.get("market")
from_date = dbutils.widgets.get("from_date")
to_date = dbutils.widgets.get("to_date")
num_dimensions = dbutils.widgets.get("num_dimensions")
time_offset = dbutils.widgets.get("time_offset")
week_start = dbutils.widgets.get("week_start")
is_active = dbutils.widgets.get("is_active")
num_of_views = dbutils.widgets.get("num_of_views")
complexity = dbutils.widgets.get("complexity")
sql_files = dbutils.widgets.get("sql_files").replace(".sql", "")
db_schema = dbutils.widgets.get("db_schema")
num_of_metrics = ""
QUERY_BUILDER_UTILS.check_config_inputs(
usecase_name, from_date, num_dimensions, sql_files, num_of_views, to_date, time_offset, db_schema
)
# COMMAND ----------
# MAGIC %md
# MAGIC Set Dimensions.
# MAGIC
# MAGIC In this step you will have to map the dimension columns with their respective order.
# MAGIC The options available in the widgets to fill are based on the number of dimensions previously defined.
# MAGIC For example, if you have two dimensions to analyze, such as country and category,
# MAGIC values must be set to D1 and D2.
# MAGIC For example:
# MAGIC D1. Dimension name = country
# MAGIC D2. Dimension name = category
# COMMAND ----------
QUERY_BUILDER_UTILS.set_dimensions(num_dimensions)
# COMMAND ----------
dimensions = QUERY_BUILDER_UTILS.get_dimensions(num_dimensions)
# COMMAND ----------
QUERY_BUILDER_UTILS.print_definitions(
usecase_name=usecase_name,
market=market,
from_date=from_date,
to_date=to_date,
dimensions=dimensions,
time_offset=time_offset,
week_start=week_start,
is_active=is_active,
num_of_views=num_of_views,
complexity=complexity,
sql_files=sql_files,
db_schema=db_schema,
)
# COMMAND ----------
# MAGIC %md
# MAGIC <h1> 1 - Configure view(s) name(s) and filter(s)
# COMMAND ----------
# MAGIC %md
# MAGIC The filters defined in this step will be based on the dimensions defined in the previous step.
# MAGIC
# MAGIC So, if you have set the country as D1, the filter here should be D1 = "Germany".
# MAGIC The commands allowed for the filter step are the same as those used in the where clause in SQL language.
# COMMAND ----------
QUERY_BUILDER_UTILS.set_views(num_of_views)
# COMMAND ----------
dims_dict = QUERY_BUILDER_UTILS.get_view_information(num_of_views)
# COMMAND ----------
QUERY_BUILDER_UTILS.print_definitions(
usecase_name=usecase_name,
market=market,
from_date=from_date,
to_date=to_date,
dimensions=dimensions,
time_offset=time_offset,
week_start=week_start,
is_active=is_active,
num_of_views=num_of_views,
complexity=complexity,
sql_files=sql_files,
db_schema=db_schema,
dims_dict=dims_dict,
)
# COMMAND ----------
# MAGIC %md
# MAGIC # 2 - Configure Reconciliation
# COMMAND ----------
# MAGIC %md
# MAGIC The reconciliation configuration (recon) is mandatory.
# MAGIC In this section you will set the cadence, recon and snapshot behaviour of your use case.
# MAGIC
# MAGIC CADENCE - The cadence sets how often the data will be calculated. E.g: DAY, WEEK, MONTH, QUARTER, YEAR.
# MAGIC
# MAGIC RECON - The reconciliation for the cadence set.
# MAGIC
# MAGIC IS SNAPSHOT? - Set yes or no for the combination of cadence and reconciliation.
# MAGIC
# MAGIC Combination examples:
# MAGIC - DAILY CADENCE = DAY - This configuration means that only daily data will be refreshed.
# MAGIC - MONTHLY CADENCE - WEEKLY RECONCILIATION - WITHOUT SNAPSHOT = MONTH-WEEK-N -
# MAGIC This means after every week, the whole month's data is refreshed without a snapshot.
# MAGIC - WEEKLY CADENCE - DAY RECONCILIATION - WITH SNAPSHOT = WEEK-DAY-Y -
# MAGIC This means that every day, the entire week's data (week to date) is refreshed with snapshot.
# MAGIC It will generate a record for each day with the specific position of the value for the week.
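# MAGIC
# MAGIC For illustration, selecting WEEK-DAY-Y together with DAY produces a reconciliation configuration
# MAGIC equivalent to {'WEEK': {'recon_window': {'DAY': {'snapshot': 'Y'}}}, 'DAY': {}}.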
# COMMAND ----------
dbutils.widgets.removeAll()
dbutils.widgets.multiselect(
name="recon_cadence",
defaultValue="DAY",
label="Recon Cadence",
choices=QUERY_BUILDER_UTILS.get_recon_choices(),
)
# COMMAND ----------
recon_list = list(filter(None, dbutils.widgets.get(name="recon_cadence").split(",")))
print(f"List of chosen reconciliation values: {recon_list}")
# COMMAND ----------
recon_dict = QUERY_BUILDER_UTILS.get_recon_config(recon_list)
# COMMAND ----------
QUERY_BUILDER_UTILS.print_definitions(
usecase_name=usecase_name,
market=market,
from_date=from_date,
to_date=to_date,
dimensions=dimensions,
time_offset=time_offset,
week_start=week_start,
is_active=is_active,
num_of_views=num_of_views,
complexity=complexity,
sql_files=sql_files,
db_schema=db_schema,
dims_dict=dims_dict,
recon_dict=recon_dict,
)
# COMMAND ----------
# MAGIC %md
# MAGIC <h1> 3 - Configure METRICS
# COMMAND ----------
# MAGIC %md
# MAGIC Define how many metrics your SQL files contain. For example, if you have a sum(amount) as total_amount
# MAGIC and a count(*) as total_records, you will need to set 2 here.
# MAGIC
# MAGIC The metric columns must be configured in the same order as they appear in the sql files.
# MAGIC
# MAGIC For example:
# MAGIC 1. Metric name = total_amount
# MAGIC 2. Metric name = total_records
# COMMAND ----------
dbutils.widgets.removeAll()
dbutils.widgets.text(name="num_of_metrics", defaultValue="1", label="How many metrics?")
# COMMAND ----------
num_of_metrics = dbutils.widgets.get("num_of_metrics")
QUERY_BUILDER_UTILS.set_metric(num_of_metrics)
# COMMAND ----------
# MAGIC %md
# MAGIC Based on the metric setup, it is possible to derive 4 new columns from each metric.
# MAGIC Those new columns will be based on cadences like last_cadence, last_year_cadence and window_function.
# MAGIC You can also create a derived column, which is a SQL statement that you write on your own
# MAGIC by selecting the "derived_metric" option.
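# MAGIC
# MAGIC For example (illustrative): a derived metric with label "avg_order_value" and formula
# MAGIC "total_amount / total_records" adds an extra column computed from the two base metrics.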
# COMMAND ----------
metrics_dict = QUERY_BUILDER_UTILS.get_metric_configuration(num_of_metrics)
# COMMAND ----------
QUERY_BUILDER_UTILS.set_extra_metric_config(num_of_metrics, metrics_dict)
# COMMAND ----------
QUERY_BUILDER_UTILS.print_definitions(
usecase_name=usecase_name,
market=market,
from_date=from_date,
to_date=to_date,
dimensions=dimensions,
time_offset=time_offset,
week_start=week_start,
is_active=is_active,
num_of_views=num_of_views,
complexity=complexity,
sql_files=sql_files,
db_schema=db_schema,
dims_dict=dims_dict,
recon_dict=recon_dict,
metrics_dict=metrics_dict,
)
# COMMAND ----------
# MAGIC %md
# MAGIC <h1> 4 - Configure STAGES
# COMMAND ----------
# MAGIC %md
# MAGIC The parameters available for this step are:
# MAGIC
# MAGIC - Filter Date Column - This column will be used to filter the data of your use case.
# MAGIC This information will be replaced in the placeholder of the GAB template.
# MAGIC - Project Date Column - This column will be used as reference date for the query given.
# MAGIC This information will be replaced in the placeholder of the GAB template.
# MAGIC - Repartition Value - This parameter only has an effect when used with the Repartition Type parameter.
# MAGIC It sets the way the data is repartitioned while processing.
# MAGIC - Repartition Type - Type of repartitioning applied to the data of the query.
# MAGIC Available values are Key and Number. When using Key, it expects column names separated by a comma.
# MAGIC When set to Number, it expects an integer for how many partitions the user wants.
# MAGIC - Storage Level - Defines the type of spark persistence storage levels you want to define
# MAGIC (e.g. Memory Only, Memory and Disk etc).
# MAGIC - Table Alias - The alias name of the sql file that will run.
# MAGIC
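# MAGIC For example (illustrative): Repartition Type = KEY with Repartition Value = "country, order_date"
# MAGIC repartitions by those columns, while Repartition Type = NUMBER with Repartition Value = "200"
# MAGIC repartitions the data into 200 partitions.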
# COMMAND ----------
sql_files_list = QUERY_BUILDER_UTILS.set_stages(sql_files=sql_files)
# COMMAND ----------
# MAGIC %md
# MAGIC According to the number of sql files provided in the use case, a set of widgets will appear to be configured.
# MAGIC Remember that the configuration index matches the given sql file order.
# MAGIC
# MAGIC For example: 1_categories.sql, 2_fact_kpi.sql. Settings starting with index “1”
# MAGIC will be applied to the sql file 1_categories.sql. The same will happen with index “2”.
# COMMAND ----------
stages_dict = QUERY_BUILDER_UTILS.get_stages(sql_files_list, usecase_name)
# COMMAND ----------
# MAGIC %md
# MAGIC <h1> BUILD AND INSERT SQL INSTRUCTION
# COMMAND ----------
delete_sttmt, insert_sttmt = QUERY_BUILDER_UTILS.create_sql_statement(
usecase_name,
market,
stages_dict,
recon_dict,
time_offset,
week_start,
is_active,
complexity,
db_schema,
dims_dict,
dimensions,
from_date,
to_date,
metrics_dict,
)
print(delete_sttmt + "\n" + insert_sttmt)
# COMMAND ----------
# MAGIC %md
# MAGIC <h1> INSERT CONFIGURATION DATA
# MAGIC
# MAGIC **Note:** This insert only has an effect on dev/uat; to execute it on prod
# MAGIC you will need to use the Table/SQL Manager or another job.
# COMMAND ----------
QUERY_BUILDER_UTILS.insert_data_into_lkp_query_builder(delete_sttmt, insert_sttmt)
================================================
FILE: assets/gab/utils/databricks_job_utils.py
================================================
# Databricks notebook source
# imports
import enum
from typing import Tuple
from uuid import UUID
import requests
# COMMAND ----------
class BearerAuth:
"""Create authorisation object to be used in the requests header."""
def __init__(self, token):
"""Create auth object with personal access token."""
self.token = token
def __call__(self, r):
"""Add bearer token to header.
This function is internally called by get or post method of requests.
"""
r.headers["authorization"] = "Bearer " + self.token
return r
class ResultState(str, enum.Enum):
"""Possible values for result state of a job run."""
SUCCESS = "SUCCESS"
CANCELED = "CANCELED"
FAILED = "FAILED"
SKIPPED = "SKIPPED"
class DatabricksJobs:
"""Class with methods to execute databricks jobs API commands.
Refer documentation for details: https://docs.databricks.com/dev-tools/api/latest/jobs.html#.
"""
# api endpoints
RUN_NOW = "/2.1/jobs/run-now"
GET_OUTPUT = "/2.1/jobs/runs/get-output"
GET_JOB = "/2.1/jobs/runs/get"
GET_LIST_JOBS = "/2.1/jobs/list"
CANCEL_JOB = "/2.1/jobs/runs/cancel"
headers = {"Content-type": "application/json"}
def __init__(self, databricks_instance: str, auth: str):
"""
Construct a databricks jobs object using databricks instance and api token.
Parameters:
databricks_instance: domain name of databricks deployment. Use the form <account>.cloud.databricks.com
auth: personal access token
"""
self.databricks_instance = databricks_instance
self.auth = BearerAuth(auth)
@staticmethod
def _check_response(response):
if response.status_code != 200:
raise Exception(f"Response Code: {response.status_code} \n {response.content}")
def list_jobs(self, name: str = None, limit: int = 20, offset: int = 0, expand_tasks: bool = False) -> dict:
"""
        List the databricks jobs corresponding to the given `name`.
        For details, refer to the API documentation:
https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsList
Parameters:
name: optional, to filter jobs as per name (case-insensitive)
limit: optional, The number of jobs to return, valid range 0 to 25.
offset: The offset of the first job to return, relative to the most recently created job
expand_tasks: Whether to include task and cluster details in the response.
Returns:
A dictionary of job ids matching the name (if provided) else returns in chunks
"""
params = {"limit": limit, "offset": offset, "expand_tasks": expand_tasks}
if name:
params.update({"name": name})
response = requests.get(
f"https://{self.databricks_instance}/api{self.GET_LIST_JOBS}",
params=params,
headers=self.headers,
auth=self.auth,
)
self._check_response(response) # Raises exception if not successful
return response.json()
def run_now(self, job_id: int, notebook_params: dict, idempotency_token: UUID = None) -> dict:
"""
Trigger the job specified by the job id.
Note: currently it expects notebook tasks in a job, but can be extended for other tasks
Parameters:
job_id: databricks job identifier
notebook_params: key value pairs of the parameter name and its value to be passed to the job
idempotency_token: An optional token to guarantee the idempotency of job run requests,
it should have at most 64 characters
Returns:
A dictionary consisting of run_id and number_in_job
"""
data = {"job_id": job_id, "notebook_params": notebook_params}
if idempotency_token:
data.update({"idempotency_token": str(idempotency_token)})
response = requests.post(
f"https://{self.databricks_instance}/api{self.RUN_NOW}",
json=data,
headers=self.headers,
auth=self.auth,
)
self._check_response(response) # Raises exception if not successful
return response.json()
def get_output(self, run_id: int) -> dict:
"""
Fetch the single job run output and metadata for a single task.
Reference: https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsGetOutput
Parameters:
run_id: identifier for the job run
Returns:
A dictionary containing the output and metadata from task
"""
params = {}
if run_id:
params.update({"run_id": run_id})
response = requests.get(
f"https://{self.databricks_instance}/api{self.GET_OUTPUT}",
params=params,
headers=self.headers,
auth=self.auth,
)
self._check_response(response) # Raises exception if not successful
return response.json()
def get_job(self, run_id: int) -> dict:
"""
Retrieve the metadata of a job run identified by run_id.
Parameters:
run_id: identifier for the job run
Returns:
A dictionary containing the metadata of a job
"""
params = {}
if run_id:
params.update({"run_id": run_id})
response = requests.get(
f"https://{self.databricks_instance}/api{self.GET_JOB}", params=params, headers=self.headers, auth=self.auth
)
self._check_response(response) # Raises exception if not successful
return response.json()
def cancel_job(self, run_id: int) -> dict:
"""
Cancel job specified by run_id.
Parameters:
run_id: job run identifier
Returns:
Response received from endpoint
"""
response = requests.post(
f"https://{self.databricks_instance}/api{self.CANCEL_JOB}",
json={"run_id": run_id},
headers=self.headers,
auth=self.auth,
)
self._check_response(response) # Raises exception if not successful
return response.json()
def trigger_job_by_name(self, job_name: str, notebook_params: dict, idempotency_token: UUID = None) -> dict:
"""
Triggers a job as specified by the job name, if found.
Parameters:
job_name: name of the job
notebook_params: key value pairs of the parameter name and its value to be passed to the job
idempotency_token: Optional token to guarantee the idempotency of job run requests, 64 characters max
Returns:
A dictionary consisting of run_id and number_in_job
"""
result = self.list_jobs(name=job_name)
if result.get("jobs") is None:
raise Exception(f"job with name {job_name} not found.")
return self.run_now(int(result.get("jobs")[0].get("job_id")), notebook_params, idempotency_token)
def get_job_status(self, run_id: int) -> Tuple[bool, dict]:
"""
Fetch the status of the job run id.
Parameters:
run_id: identifier for the job run
Returns:
Tuple bool and dict containing whether the job run has succeeded and its state
"""
state = self.get_job(run_id)["state"]
result_state = state.get("result_state") or state.get("life_cycle_state")
return result_state == ResultState.SUCCESS, state
def job_id_extraction(self, job_name: str) -> int:
"""Extract the job id from the job run.
Args:
job_name: Job name.
Returns:
Job ID number.
"""
jobs_list = self.list_jobs(name=job_name)
if jobs_list.get("jobs") is None:
raise Exception("No jobs found.")
return int(jobs_list.get("jobs")[0].get("job_id"))
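# Example usage (hypothetical host, token and job name):
# jobs = DatabricksJobs(databricks_instance="myworkspace.cloud.databricks.com", auth="<personal-access-token>")
# run = jobs.trigger_job_by_name("gab_high_queue", {"query_label_filter": "my_use_case"})
# succeeded, state = jobs.get_job_status(run["run_id"])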
================================================
FILE: assets/gab/utils/query_builder_utils.py
================================================
# Databricks notebook source
import json
import re
from databricks.sdk.runtime import *
class QueryBuilderUtils:
"""Class with methods to create GAB use case configuration."""
def __init__(self):
"""Instantiate objects of the class QueryBuilderUtils."""
self.regex_no_special_characters = "^[a-zA-Z0-9]+(_[a-zA-Z0-9]+)*$"
self.cadences = ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"]
def check_config_inputs(
self,
usecase_name: str,
from_date: str,
num_dimensions: str,
sql_files: str,
num_of_views: str,
to_date: str,
time_offset: str,
db_schema: str
) -> str:
"""
        Check the input parameters.
Args:
usecase_name: The use case name.
from_date: The reference date of the use case.
num_dimensions: The number of dimensions chosen for analysis.
sql_files: Name of the SQL files that will be submitted for the framework
to process (e.g. file1.sql, file2.sql).
num_of_views: Number of views the use case has.
to_date: The end date of the snapshot configuration.
time_offset: Hours related to the timezone (e.g. 8, -8).
db_schema: Database name that lkp_query_builder is located.
Returns:
A message with the status of the validation.
"""
message = ""
if (
usecase_name.strip() == ""
or from_date.strip() == ""
or num_dimensions.strip() == ""
or sql_files.strip() == ""
or num_of_views.strip() == ""
or to_date.strip() == ""
or db_schema.strip() == ""
):
message = "WRONG CONFIGURATION:"
if usecase_name.strip() == "":
message += "\n\t - Please, add the Use Case Name."
if from_date.strip() == "":
message += "\n\t - Please, add the From Date."
if num_dimensions.strip() == "":
message += "\n\t - Please, add the Number of Dimensions."
if sql_files.strip() == "":
message += "\n\t - Please, add the SQL File Names."
if num_of_views.strip() == "":
message += "\n\t - Please, add the number of views."
if to_date.strip() == "":
message += "\n\t - Please, add the to date value. This information is mandatory. "
message += "Keep it as 'to_date' unless you change its name in your SQL files."
if db_schema.strip() == "":
message += "\n\t - Please, add the database schema where the lkp_query_builder table is located."
if time_offset.strip():
try:
                int(re.findall(r"-?\d+\.?\d*", time_offset.strip())[0])
except Exception:
if message:
message += "\n\t The timezone offset must be a number (e.g. 0, 12 or -8)."
else:
message = "WRONG CONFIGURATION:"
message += "\n\t - The timezone offset must be a number (e.g. 0, 12 or -8)."
if num_dimensions.strip():
try:
int(num_dimensions)
if int(num_dimensions) == 0:
message = "WRONG CONFIGURATION:"
message += "\n\t - The number of dimensions must be greater than zero."
except Exception:
if message:
message += "\n\t - The number of dimensions must be an integer."
else:
message = "WRONG CONFIGURATION:"
message += "\n\t - The number of dimensions must be an integer."
if sql_files.strip():
files_list = self._sort_files(sql_files)
for file in files_list:
sql_files_err = f"""\n\t - Check the SQL file name '{file}'. """
sql_files_err += "It must follow the pattern x_file_name (X is an integer digit)." ""
try:
int(re.match("(.*?)_", file).group()[:-1])
except Exception:
if message:
message += sql_files_err
else:
message = "WRONG CONFIGURATION:"
message += sql_files_err
if not message:
message = "Validation status: OK"
return print(message)
def create_sql_statement(
self,
usecase_name: str,
market: str,
stages_dict: dict,
recon_dict: dict,
time_offset: str,
week_start: str,
is_active: str,
complexity: str,
db_schema: str,
dims_dict: dict,
dimensions: str,
from_date: str,
to_date: str,
metrics_dict: dict,
) -> tuple[str, str]:
"""
Create the SQL statement to insert data into lkp_query_builder_table.
Args:
usecase_name: The name of use case.
market: The market used for the use case (APAC, GLOBAL, NAM, NIGHTLY).
            stages_dict: A dictionary of stages and its configurations.
recon_dict: A dictionary of reconciliation setup.
time_offset: Hours related to the timezone (e.g. 8, -8).
week_start: Day of the start of the week (e.g. Sunday, Monday)
is_active: If the use case is active or not. (e.g. Y, N)
complexity: The categories are directly related to the number of workers in each cluster.
That is, High = 10 workers, Medium = 6 workers and Low = 4 workers.
db_schema: Database name that lkp_query_builder is located.
            dims_dict: The dictionary of views and its setup.
dimensions: Store supporting information to the fact table.
from_date: Aggregating date column for the use case.
to_date: Contains the current date (default value is to_date).
Information used as template for the framework.
            metrics_dict: The dictionary of metrics and its setup.
Returns:
A tuple with a text formatted with the delete and insert statement.
"""
dbutils.widgets.removeAll()
mapping_dict = self._get_mapping(dims_dict, dimensions, from_date, to_date, metrics_dict)
query_id = self._generate_query_id(usecase_name)
query_label = f"'{usecase_name}'"
query_type = f"'{market}'"
mapping_str = json.dumps(mapping_dict, indent=4)
mappings = '"""' + mapping_str.replace('"', "'").replace("#+#-#", '\\"') + '"""'
steps_str = json.dumps(stages_dict, indent=4)
intermediate_stages = '"""' + steps_str.replace('"', "'") + '"""'
recon_str = json.dumps(recon_dict)
recon_window = '"""' + recon_str.replace('"', "'") + '"""'
col_time_offset = f"'{time_offset}'"
start_of_week = f"'{week_start}'"
col_is_active = f"'{is_active}'"
queue = f"'{complexity}'"
delete_sttmt = f"""DELETE FROM {db_schema}.lkp_query_builder WHERE QUERY_LABEL = {query_label};"""
insert_sttmt = f"""INSERT INTO {db_schema}.lkp_query_builder VALUES (
{query_id},
{query_label},
{query_type},
{mappings},
{intermediate_stages},
{recon_window},
{col_time_offset},
{start_of_week},
{col_is_active},
{queue},
current_timestamp());"""
return delete_sttmt, insert_sttmt
def get_dimensions(self, num_dimensions: str) -> str:
"""
Get the dimensions set on the widgets and validate.
Args:
num_dimensions: The number of dimensions set.
Returns:
A string with comma-separated dimensions names.
"""
dimensions = ""
list_status = []
for i in range(int(num_dimensions)):
i = i + 1
if re.match(self.regex_no_special_characters, dbutils.widgets.get(f"D{i}").strip()):
dimensions += "," + dbutils.widgets.get(f"D{i}").strip()
list_status.append("success")
else:
print("WRONG CONFIGURATION:")
print(f"\t- {dbutils.widgets.get(f'D{i}')} is empty of malformed!")
print(
"\t Names can contain only alphanumeric characters and must begin with "
"an alphabetic character or an underscore (_)."
)
list_status.append("fail")
if "fail" not in list_status:
print("Dimensions validation status: OK")
return dimensions[1:]
@classmethod
def get_recon_choices(cls) -> list:
"""
Return all possible combinations for cadences, reconciliations and the snapshot flag value (Y,N).
Returns:
List used to generate a multiselect widget for the users to interact with.
"""
return [
"DAY",
"DAY-WEEK-N",
"DAY-MONTH-N",
"DAY-QUARTER-N",
"DAY-YEAR-N",
"WEEK",
"WEEK-DAY-N",
"WEEK-DAY-Y",
"WEEK-MONTH-N",
"WEEK-QUARTER-N",
"WEEK-YEAR-N",
"MONTH",
"MONTH-DAY-N",
"MONTH-DAY-Y",
"MONTH-WEEK-Y",
"MONTH-WEEK-N",
"MONTH-QUARTER-N",
"MONTH-YEAR-N",
"QUARTER",
"QUARTER-DAY-N",
"QUARTER-DAY-Y",
"QUARTER-WEEK-N",
"QUARTER-WEEK-Y",
"QUARTER-MONTH-N",
"QUARTER-MONTH-Y",
"QUARTER-YEAR-N",
"YEAR",
"YEAR-DAY-N",
"YEAR-DAY-Y",
"YEAR-WEEK-N",
"YEAR-WEEK-Y",
"YEAR-MONTH-N",
"YEAR-MONTH-Y",
"YEAR-QUARTER-N",
"YEAR-QUARTER-Y",
]
@classmethod
def get_metric_configuration(cls, num_of_metrics: str) -> dict:
"""
Get metrics information based on the widget setup.
Args:
num_of_metrics: Number of metrics selected.
Returns:
metrics_dict: The dictionary of metrics and their setup.
"""
metrics_dict = {}
for i in range(int(num_of_metrics)):
i = i + 1
if dbutils.widgets.get(f"metric_name{i}"):
metrics_dict[f"m{i}"] = {
"metric_name": dbutils.widgets.get(f"metric_name{i}"),
"calculated_metric": {},
"derived_metric": {},
}
calculated_metric_list = list(filter(None, dbutils.widgets.get(f"calculated_metric{i}").split(",")))
for calc_metric in calculated_metric_list:
if calc_metric == "last_cadence":
metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}})
# add label and window for last_cadence
dbutils.widgets.text(
name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label"
)
dbutils.widgets.text(
name=f"{i}_{calc_metric}_window", defaultValue="", label=f"{i}_{calc_metric}.Window"
)
if calc_metric == "last_year_cadence":
metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}})
                        # add label for last_year_cadence
dbutils.widgets.text(
name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label"
)
if calc_metric == "window_function":
metrics_dict[f"m{i}"]["calculated_metric"].update({calc_metric: {}})
# add label and window for window_function
dbutils.widgets.text(
name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label"
)
dbutils.widgets.text(
name=f"{i}_{calc_metric}_window",
defaultValue="",
label=f"{i}_{calc_metric}.Window Interval",
)
dbutils.widgets.dropdown(
name=f"{i}_{calc_metric}_agg_func",
defaultValue="sum",
label=f"{i}_{calc_metric}.Agg Func",
choices=["sum", "avg", "max", "min", "count"],
)
                    # add label and formula for derived_metric
if calc_metric == "derived_metric":
dbutils.widgets.text(
name=f"{i}_{calc_metric}_label", defaultValue="", label=f"{i}_{calc_metric}.Label"
)
dbutils.widgets.text(
name=f"{i}_{calc_metric}_formula", defaultValue="", label=f"{i}_{calc_metric}.Formula"
)
print("Metric configuration status: OK")
else:
print("WRONG CONFIGURATION:")
print("\t- The metric name is mandatory!")
return metrics_dict
def get_recon_config(self, recon_list: list) -> dict:
"""
Get reconciliation information based on the widget setup.
Args:
recon_list: List of cadences setup for the reconciliation.
Returns:
A dictionary of reconciliation setup.
"""
cadence_list = []
# create a list with the distinct cadences values.
for cadence in recon_list:
cadence_name = cadence.split("-")[0]
cadence_list.append(cadence_name)
cadence_list = list(dict.fromkeys(cadence_list))
# create a dict with the structure of each cadence.
recon_dict = {}
for cad in cadence_list:
recon_dict[f"{cad}"] = {}
recon_dict[f"{cad}"]["recon_window"] = {}
# updates the dict of each cadence with the recon configurations selected.
for cadence in recon_list:
if cadence in self.cadences:
recon_dict[f"{cad}"]["recon_window"] = {}
else:
cadence_name = cadence.split("-")[0]
recon = cadence.split("-")[1]
snapshot = cadence.split("-")[2]
for cad in cadence_list:
if cadence_name == cad:
recon_dict[cad]["recon_window"].update({recon: {"snapshot": snapshot}})
        # remove empty recon_window when only the cadence was selected.
for cadence in recon_list:
if cadence in ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"]:
if recon_dict[f"{cadence}"]["recon_window"] == {}:
del recon_dict[f"{cadence}"]["recon_window"]
if recon_dict:
print("Reconciliation configuration status: OK")
else:
print("WRONG CONFIGURATION:")
print("\t- The recon information is mandatory!")
return recon_dict
def get_stages(self, sql_files_list: list, usecase_name: str) -> dict:
"""
Set stages based on the widget setup.
Args:
sql_files_list: A list of sql files and their setup.
usecase_name: The use case name.
Returns:
stages_dict: A dictionary of stages and their setup.
"""
stages_dict = {}
i = 0
list_status = []
for file in sql_files_list:
i = i + 1
if dbutils.widgets.get(name=f"{i}_script_table_alias"):
stages_dict[f"{i}"] = {
"file_path": usecase_name + "/" + file.strip() + ".sql",
"table_alias": dbutils.widgets.get(name=f"{i}_script_table_alias"),
"storage_level": dbutils.widgets.get(name=f"{i}_script_storage_level"),
"project_date_column": dbutils.widgets.get(name=f"{i}_script_project_dt_col"),
"filter_date_column": dbutils.widgets.get(name=f"{i}_script_filter_dt_col"),
}
repartition_value = self._format_keys_list(dbutils.widgets.get(name=f"{i}_script_repartition_value"))
stages_dict[f"{i}"]["repartition"] = {}
if dbutils.widgets.get(name=f"{i}_script_repartition_type") == "NUMBER":
try:
int(dbutils.widgets.get(name=f"{i}_script_repartition_value").split(",")[0])
stages_dict[f"{i}"]["repartition"] = {
"numPartitions": dbutils.widgets.get(name=f"{i}_script_repartition_value")
.split(",")[0]
.replace("'", "")
}
except Exception:
print("The repartition value must be INTEGER when the type is defined as NUMBER.")
list_status.append("fail")
elif dbutils.widgets.get(name=f"{i}_script_repartition_type") == "KEY":
stages_dict[f"{i}"]["repartition"] = {"keys": repartition_value}
else:
print(f"The field script alias is missing for {i}.Script Table Alias. This field is mandatory!")
stages_dict = {}
list_status.append("fail")
if "fail" not in list_status:
print("Stages configuration status: OK")
return stages_dict
def get_view_information(self, num_of_views: str) -> dict:
"""
Get the views information based on the widget setup.
Args:
num_of_views: Number of views selected.
Returns:
The dictionary of views and their setup.
"""
dims_dict = {}
for i in range(int(num_of_views)):
i = i + 1
if re.match(self.regex_no_special_characters, dbutils.widgets.get(f"view_name{i}")):
dims_dict[f"view_name{i}"] = {
"name": dbutils.widgets.get(f"view_name{i}"),
"filter": dbutils.widgets.get(f"view_filter{i}").replace("'", "#+#-#").replace('"', "#+#-#"),
}
print("Views validation status: OK")
else:
print("WRONG CONFIGURATION:")
print("\t- View name is empty of malformed!")
print(
"\t Names can contain only alphanumeric characters and must begin with "
"an alphabetic character or an underscore (_)."
)
return dims_dict
@classmethod
def insert_data_into_lkp_query_builder(cls, delete_sttmt: str, insert_sttmt: str):
"""
Insert data into the lkp query builder table.
Args:
delete_sttmt: The delete statement.
insert_sttmt: The insert statement.
"""
try:
spark.sql(f"{delete_sttmt}")
spark.sql(f"{insert_sttmt}")
print("CONFIGURATION INSERTED SUCCESSFULLY!")
except Exception as e:
print(e)
def print_definitions(
self,
usecase_name,
market,
from_date,
to_date,
dimensions,
time_offset,
week_start,
is_active,
num_of_views,
complexity,
sql_files,
db_schema,
dims_dict: dict = None,
recon_dict: dict = None,
metrics_dict: dict = None,
stages_dict: dict = None,
):
"""
Print the definitions set on widgets.
Args:
usecase_name: The name of use case.
market: The market used for the use case (APAC, GLOBAL, NAM, NIGHTLY).
from_date: Aggregating date column for the use case.
to_date: Contains the current date (default value is to_date).
Information used as template for the framework.
dimensions: Store supporting information to the fact table
time_offset: Hours related to the timezone (e.g. 8, -8).
week_start: Day of the start of the week (e.g. Sunday, Monday)
is_active: If the use case is active or not. (e.g. Y, N)
num_of_views: Number of views desired for the use case (e.g. 1, 2, 3).
complexity: The categories are directly related to the number of workers in each cluster.
That is, High = 10 workers, Medium = 6 workers and Low = 4 workers
sql_files: Name of the SQL files that will be submitted for the framework
to process (e.g. file1.sql, file2.sql).
            db_schema: Database name that lkp_query_builder is located.
dims_dict: A dictionary of dimensions.
recon_dict: A dictionary of reconciliation setup.
metrics_dict: The dictionary of metrics and their setup.
stages_dict: A dictionary of stages and their setup.
"""
print("USE CASE DEFINITIONS:")
print("Use Case Name:", usecase_name)
print("Market:", market)
print("From Date:", from_date)
print("To Date:", to_date)
print("Dimensions:", dimensions)
print("Time Offset:", time_offset)
print("Week Start:", week_start)
print("Is Active:", is_active)
print("How many views?", num_of_views)
print("Complexity:", complexity)
print("SQL Files:", sql_files)
print("Database Schema Name:", db_schema)
self._print_dims_dict(dims_dict)
self._print_recon_dict(recon_dict)
if metrics_dict:
print("METRICS CONFIGURED:")
for key_metrics in metrics_dict:
self._print_metrics_dict(key_metrics, metrics_dict)
self._print_stages_dict(stages_dict)
@classmethod
def set_dimensions(cls, num_dimensions: str):
"""
Set the dimension mappings based on the widget setup.
Args:
num_dimensions: Number of dimensions selected.
"""
dbutils.widgets.removeAll()
for i in range(int(num_dimensions)):
i = i + 1
dbutils.widgets.text(name=f"D{i}", defaultValue="", label=f"D{i}.Dimension Name")
print("Please, configure the dimensions using the widgets and proceed to the next cmd.")
def set_extra_metric_config(self, num_of_metrics: str, metrics_dict: dict):
"""
Set extra metrics information based on the widget setup.
Args:
            num_of_metrics: Number of metrics selected.
            metrics_dict: The dictionary of metrics and their setup.
        """
for i in range(int(num_of_metrics)):
i = i + 1
calculated_metric_list = list(filter(None, dbutils.widgets.get(f"calculated_metric{i}").split(",")))
if calculated_metric_list:
for calc_metric in calculated_metric_list:
self._validate_metrics_config(calc_metric, metrics_dict, i)
else:
print("Extra metrics configuration status: OK")
@classmethod
def set_metric(cls, num_of_metrics: str):
"""
Set metrics information based on the widget setup.
Args:
num_of_metrics: Number of metrics selected.
"""
dbutils.widgets.removeAll()
for i in range(1, int(num_of_metrics) + 1):
dbutils.widgets.text(name=f"metric_name{i}", defaultValue="", label=f"{i}.Metric Name")
dbutils.widgets.multiselect(
name=f"calculated_metric{i}",
defaultValue="",
label=f"{i}.Calculated Metric",
choices=["", "last_cadence", "last_year_cadence", "window_function", "derived_metric"],
)
print("Please, configure the metrics using the widgets and proceed to the next cmd.")
def set_stages(self, sql_files: list) -> list:
"""
Set stages based on the widget setup.
Args:
sql_files: The SQL file names that will be used in the use case.
Returns:
sql_files_list: A list of sql files and their setup.
"""
dbutils.widgets.removeAll()
sql_files_list = self._sort_files(sql_files)
for i in range(1, len(sql_files_list) + 1):
dbutils.widgets.dropdown(
name=f"{i}_script_storage_level",
defaultValue="MEMORY_ONLY",
label=f"{i}.Storage Level",
choices=[
"DISK_ONLY",
"DISK_ONLY_2",
"DISK_ONLY_3",
"MEMORY_AND_DISK",
"MEMORY_AND_DISK_2",
"MEMORY_AND_DISK_DESER",
"MEMORY_ONLY",
"MEMORY_ONLY_2",
"OFF_HEAP",
],
)
dbutils.widgets.text(name=f"{i}_script_table_alias", defaultValue="", label=f"{i}.Table Alias")
dbutils.widgets.text(name=f"{i}_script_project_dt_col", defaultValue="", label=f"{i}.Project Date Column")
dbutils.widgets.text(name=f"{i}_script_filter_dt_col", defaultValue="", label=f"{i}.Filter Date Column")
dbutils.widgets.dropdown(
name=f"{i}_script_repartition_type",
defaultValue="",
label=f"{i}.Repartition Type",
choices=["", "KEY", "NUMBER"],
)
dbutils.widgets.text(name=f"{i}_script_repartition_value", defaultValue="", label=f"{i}.Repartition Value")
print("Please, configure the stages using the widgets and proceed to the next cmd.")
return sql_files_list
@classmethod
def set_views(cls, num_of_views: str):
"""
Set views that will be used in the use case.
Args:
num_of_views: Number of views selected.
"""
dbutils.widgets.removeAll()
for i in range(1, int(num_of_views) + 1):
dbutils.widgets.text(name=f"view_name{i}", defaultValue="", label=f"{i}.View Name")
dbutils.widgets.text(name=f"view_filter{i}", defaultValue="", label=f"{i}.View Filter")
print("Please, configure the views using the widgets and proceed to the next cmd.")
@classmethod
def _format_keys_list(cls, key_str: str) -> list:
"""
Format the list of keys based on the widget keys data provided.
Args:
key_str: Input text with key column names.
Returns:
A formatted list with the keys selected for repartitioning.
"""
key_list = key_str.strip().split(",")
output_list = []
for key in key_list:
output_list.append(key.replace("'", "").replace('"', "").strip())
return output_list
@classmethod
def _generate_query_id(cls, usecase_name: str) -> int:
"""
Generate the query id for the lookup query builder table.
The logic to create the ID is a hash of the use case name converted to an integer.
Args:
usecase_name: The name of use case.
Returns:
The use case name hashed.
"""
hash_val = int(str(hash(usecase_name))[0:9])
return hash_val if hash_val > 0 else hash_val * -1
@classmethod
def _get_mapping(cls, dims_dict: dict, dimensions: str, from_date: str, to_date: str, metrics_dict: dict) -> dict:
"""
Get mappings based on the dimensions defined on the widget setup.
Args:
dims_dict: A dictionary of dimensions.
dimensions: Store supporting information to the fact table.
from_date: Aggregating date column for the use case.
to_date: Contains the current date (default value is to_date).
Information used as template for the framework.
metrics_dict: The dictionary of metrics and their setup.
Returns:
mapping_dict: A dictionary of mappings configuration.
"""
mapping_dict = {}
for key in dims_dict:
mapping_dict.update({dims_dict[key]["name"]: {"dimensions": {}, "metric": {}, "filter": {}}})
i = 0
for d in dimensions.split(","):
i = i + 1
mapping_dict[dims_dict[key]["name"]]["dimensions"].update(
{"from_date": from_date, "to_date": to_date, f"d{i}": d.strip()}
)
mapping_dict[dims_dict[key]["name"]]["metric"].update(metrics_dict)
if dims_dict[key]["filter"]:
mapping_dict[dims_dict[key]["name"]]["filter"] = dims_dict[key]["filter"]
return mapping_dict
@classmethod
def _print_dims_dict(cls, dims_dict: dict):
"""
Print the dictionary of dimensions and views formatted.
Args:
dims_dict: The dictionary of views and their setup.
"""
if dims_dict:
print("VIEWS CONFIGURED:")
for key in dims_dict:
print(f"{key}:")
keys = [k for k, v in dims_dict[key].items()]
for k in keys:
print(f"\t{k}:", dims_dict[key][k].replace("#+#-#", '"'))
@classmethod
def _print_derived_metrics(cls, key_metrics: str, derived_metric: str, metrics_dict: dict):
"""
Print the derived dict formatted.
Args:
key_metrics: The key name of each metric configured (e.g. m1, m2, m3).
derived_metric: The name of the derived metric configuration (e.g. last_cadence, last_year_cadence,
derived_metric, window_function).
metrics_dict: The dictionary of metrics and their setup.
"""
if derived_metric == "derived_metric":
if metrics_dict[key_metrics][derived_metric]:
print(f"\t- {derived_metric}:")
derived_metric_val_list = [k for k, v in metrics_dict[key_metrics][derived_metric][0].items()]
for derived_metric_val in derived_metric_val_list:
print(
f"\t - {derived_metric_val} = "
f"{metrics_dict[key_metrics][derived_metric][0][derived_metric_val]}"
)
def _print_metrics_dict(self, key_metrics: str, metrics_dict: dict):
"""
Print the metrics configured formatted.
Args:
key_metrics: The key name of each metric configured (e.g. m1, m2, m3).
metrics_dict: The dictionary of metrics and their setup.
"""
print(f"{key_metrics}:")
list_key_metrics = [k for k, v in metrics_dict[key_metrics].items()]
if list_key_metrics:
for metric in list_key_metrics:
if metric == "metric_name":
print(f" {metric} = {metrics_dict[key_metrics][metric]}")
else:
for derived_metric in metrics_dict[key_metrics][metric]:
if derived_metric in ["last_cadence", "last_year_cadence", "window_function"]:
print(f"\t- {derived_metric}:")
derived_metric_val_list = [
k for k, v in metrics_dict[key_metrics][metric][derived_metric][0].items()
]
for derived_metric_val in derived_metric_val_list:
print(
f"\t - {derived_metric_val} = "
f"{metrics_dict[key_metrics][metric][derived_metric][0][derived_metric_val]}"
)
else:
self._print_derived_metrics(key_metrics, metric, metrics_dict)
@classmethod
def _print_recon_dict(cls, recon_dict: dict):
"""
Print the recon dict formatted.
Args:
recon_dict: A dictionary of reconciliation setup.
"""
if recon_dict:
print("RECON CONFIGURED:")
for key_cadence in recon_dict:
if recon_dict[f"{key_cadence}"] == {}:
print(f"{key_cadence}")
else:
print(f"{key_cadence}:")
keys_recon = [k for k, v in recon_dict[key_cadence].items()]
if keys_recon:
for k_recon in keys_recon:
print(f" {k_recon}:")
keys_recon = [k for k, v in recon_dict[key_cadence][k_recon].items()]
for recon_val in keys_recon:
print(
f"\t- {recon_val}:snapshot = {recon_dict[key_cadence][k_recon][recon_val]['snapshot']}"
)
@classmethod
def _print_stages_dict(cls, stages_dict: dict):
"""
Print the dictionary of stages formatted.
Args:
stages_dict: A dictionary of stages and their setup.
"""
if stages_dict:
print("STEPS CONFIGURED:")
for key_stages in stages_dict:
print(f"step {key_stages}:")
keys_stages = [k for k, v in stages_dict[key_stages].items()]
for k_stages in keys_stages:
if k_stages != "repartition":
print(f" - {k_stages} = {stages_dict[key_stages][k_stages]}")
else:
repartition_stages = [k for k, v in stages_dict[key_stages][k_stages].items()]
for stg in repartition_stages:
print(" - repartition_type:")
print(f"\t {stg} = {stages_dict[key_stages][k_stages][stg]}")
@classmethod
def _sort_files(cls, sql_files: str) -> list:
"""
Create a list sorted alphabetically based on the sql files provided.
Args:
sql_files: Comma-separated names of the SQL files that will be sent to the framework
to process (e.g. file1.sql, file2.sql).
Returns:
A list of sql files sorted alphabetically.
"""
fileslist = sql_files.split(",")
# remove extra spaces from items in the list
fileslist = [x.strip() for x in fileslist]
for file in range(len(fileslist)):
fileslist[file] = fileslist[file].lower().strip()
# apply bubble sort to sort the words
for n in range(len(fileslist) - 1, 0, -1):
for i in range(n):
if fileslist[i] > fileslist[i + 1]:
# swap adjacent elements if the current element is greater than the next one
fileslist[i], fileslist[i + 1] = fileslist[i + 1], fileslist[i]
return fileslist
@classmethod
def _validate_metrics_config(cls, calc_metric: str, metrics_dict: dict, widget_index: int):
"""
Validate the metrics widgets setup.
Args:
calc_metric: Name of the metric calculation set (e.g. last_cadence, last_year_cadence).
metrics_dict: The dictionary of metrics and their setup.
widget_index: Index of the widget selected to be validated.
"""
if calc_metric == "last_cadence":
if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "":
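# the window value must parse as an integer, otherwise the except branch below reports a wrong configuration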
try:
int(dbutils.widgets.get(f"{widget_index}_{calc_metric}_window"))
metrics_dict[f"m{widget_index}"]["calculated_metric"].update(
{
f"{calc_metric}": [
{
"label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"),
"window": dbutils.widgets.get(f"{widget_index}_{calc_metric}_window"),
}
]
}
)
print(f"{calc_metric} configuration status: OK")
except Exception:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print(f"\t- The {calc_metric} window value must be INTEGER.")
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print(f"\t- The {calc_metric} label is mandatory.")
elif calc_metric == "last_year_cadence":
if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "":
metrics_dict[f"m{widget_index}"]["calculated_metric"].update(
{
f"{calc_metric}": [
{
"label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"),
"window": 1,
}
]
}
)
print(f"{calc_metric} configuration status: OK")
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print(f"\t- The {calc_metric} label is mandatory.")
elif calc_metric == "window_function":
if dbutils.widgets.get(f"{widget_index}_{calc_metric}_label").strip() != "":
window_list = dbutils.widgets.get(f"{widget_index}_{calc_metric}_window").split(",")
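# a window function requires at least two comma-separated integer values (e.g. 3,1)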
if len(window_list) > 1:
metrics_dict[f"m{widget_index}"]["calculated_metric"].update(
{
f"{calc_metric}": [
{
"label": dbutils.widgets.get(f"{widget_index}_{calc_metric}_label"),
"window": [int(x.strip()) for x in window_list],
"agg_func": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_agg_func"),
}
]
}
)
print(f"{calc_metric} configuration status: OK")
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print(
"\t- The window function must follow the pattern of "
"two integer digits separated with comma (e.g. 3,1)."
)
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print("\t- The window_function label is mandatory.")
elif calc_metric == "derived_metric":
if (
dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_label").strip() != ""
and dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_formula").strip() != ""
):
metrics_dict[f"m{widget_index}"].update(
{
f"{calc_metric}": [
{
"label": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_label"),
"formula": dbutils.widgets.get(name=f"{widget_index}_{calc_metric}_formula"),
}
]
}
)
print(f"{calc_metric} configuration status: OK")
else:
print(f"{calc_metric} - WRONG CONFIGURATION:")
print("\t- The derived_metric label and formula are mandatory.")
================================================
FILE: cicd/.bumpversion.cfg
================================================
[bumpversion]
current_version = 2.0.0
commit = False
tag = False
[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"
================================================
FILE: cicd/Dockerfile
================================================
ARG PYTHON_IMAGE=python:3.12-slim-bullseye
FROM $PYTHON_IMAGE
ARG USER_ID=1000
ARG GROUP_ID=1000
ARG CPU_ARCHITECTURE
# Install Prerequisites
RUN mkdir -p /usr/share/man/man1 && \
apt-get -y update && \
apt-get install -y wget=1.21* gnupg2=2.2* git=1:2* g++=4:10.2.1* rsync=3.2* && \
apt-get -y clean
# Install jdk
RUN mkdir -p /etc/apt/keyrings && \
wget -qO - https://packages.adoptium.net/artifactory/api/gpg/key/public | gpg --dearmor | tee /etc/apt/trusted.gpg.d/adoptium.gpg > /dev/null && \
echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list && \
apt-get -y update && \
apt-get -y install temurin-17-jdk && \
apt-get -y clean
ENV JAVA_HOME=/usr/lib/jvm/temurin-17-jdk-${CPU_ARCHITECTURE}
# useradd -l is necessary to avoid docker build hanging in export image phase when using large uids
RUN groupadd -g ${GROUP_ID} appuser && \
useradd -rm -l -u ${USER_ID} -d /home/appuser -s /bin/bash -g appuser appuser
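# Copy the locked requirements and install python dependencies as the non-root appuser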
COPY cicd/requirements_full.lock /tmp/requirements.txt
USER appuser
ENV PATH="/home/appuser/.local/bin:$PATH"
RUN python -m pip install --upgrade pip==25.2 setuptools==74.* --user
RUN python -m pip install --user -r /tmp/requirements.txt
RUN mkdir /home/appuser/.ssh/ && touch /home/appuser/.ssh/known_hosts
RUN echo Image built for $CPU_ARCHITECTURE with python image $PYTHON_IMAGE.
================================================
FILE: cicd/Jenkinsfile
================================================
@Library(['GlobalJenkinsLibrary']) _
pipeline {
options {
buildDiscarder(logRotator(numToKeepStr: '30', artifactNumToKeepStr: '30'))
timeout(time: 2, unit: 'HOURS')
disableConcurrentBuilds()
skipDefaultCheckout(true)
ansiColor('xterm')
timestamps()
}
agent {
node {
label 'lakehouse_base'
}
}
environment {
VERSION = env.BRANCH_NAME.replaceAll("[/-]", "_").toLowerCase()
GIT_CREDENTIALS_ID = "git-lakehouse-cicd"
}
stages {
stage('cleanup workspace') {
steps {
cleanWs(disableDeferredWipeout: true, deleteDirs: true)
}
}
stage('Clone') {
steps {
retry(3) {
script {
checkout([
$class : 'GitSCM',
branches : scm.branches,
userRemoteConfigs: [[url: 'https://bitbucket.tools.3stripes.net/scm/lak/lakehouse-engine.git', credentialsId: GIT_CREDENTIALS_ID]]
])
}
}
}
}
stage('Build Image') {
steps {
sh 'make build-image version=$VERSION'
}
}
stage('Create Docs') {
steps {
sh 'make docs version=$VERSION'
}
}
stage('Parallel') {
parallel {
stage('Lint') {
steps {
sh 'make lint version=$VERSION'
}
}
stage('Test Security') {
steps {
sh 'make test-security version=$VERSION'
}
}
stage('Audit Dependency Safety'){
steps{
catchError(message: "${STAGE_NAME} is unstable", buildResult: 'SUCCESS', stageResult: 'UNSTABLE') {
sh 'make audit-dep-safety version=$VERSION'
}
}
}
stage('Test dependencies') {
steps {
sh 'make test-deps version=$VERSION'
}
}
stage('Test') {
steps {
sh 'make test version=$VERSION'
}
}
}
}
stage('Sonar') {
steps {
script {
tools.sonar.run(env: 'COMMUNITY-PRD', version: '1.0', branch: env.BRANCH_NAME)
}
}
}
}
post {
always {
archiveArtifacts artifacts: 'artefacts/docs/**/*'
archiveArtifacts artifacts: 'artefacts/*.json'
junit 'artefacts/tests.xml'
step([$class: 'CoberturaPublisher', coberturaReportFile: 'artefacts/coverage.xml'])
}
}
}
================================================
FILE: cicd/Jenkinsfile_deploy
================================================
pipeline {
parameters {
string(name: 'BRANCH', defaultValue: 'master', description: 'Branch to use for the deployment process.')
string(name: 'VERSION', defaultValue: null, description: 'Version to deploy (git tag in master branch without the "v"). E.g., 0.2.0. If you are deploying to dev, from your branch, ignore this.')
booleanParam(name: 'SKIP_VALIDATIONS', defaultValue: false, description: 'Whether to skip the validations. Only applicable for feature releases to make them faster.')
booleanParam(name: 'SKIP_OS_DEPLOYMENT', defaultValue: false, description: 'Whether to skip the OS Deployment related stages or not.')
booleanParam(name: 'NOTIFY', defaultValue: true, description: 'Whether to notify the release or not.')
}
options {
buildDiscarder(logRotator(numToKeepStr: '100', artifactNumToKeepStr: '30'))
timeout(time: 2, unit: 'HOURS')
disableConcurrentBuilds()
skipDefaultCheckout(true)
ansiColor('xterm')
timestamps()
}
agent {
node {
label 'lakehouse_base'
}
}
environment {
PYPI_CREDENTIALS = credentials('pypi-credentials')
ARTIFACTORY_CREDENTIALS = credentials('artifactory-credentials')
GIT_CREDENTIALS_ID = "git-lakehouse-cicd"
GIT_CREDENTIALS_LAK = credentials('push-to-github-lak')
GIT_CREDENTIALS_LAK_DOCS = credentials('push-to-github-lak-docs')
DEPLOY_VERSION = getDeploymentVersion()
DEPLOY_GIT_OBJECT = getDeploymentGitObject()
}
stages {
stage('cleanup workspace') {
steps {
cleanWs(disableDeferredWipeout: true, deleteDirs: true)
}
}
stage('Clone') {
steps {
retry(3) {
script {
checkout([
$class : 'GitSCM',
branches : [['name': env.DEPLOY_GIT_OBJECT]],
userRemoteConfigs: [[url: 'https://bitbucket.tools.3stripes.net/scm/lak/lakehouse-engine.git', credentialsId: GIT_CREDENTIALS_ID]]
])
}
}
}
}
stage('Build Image') {
steps {
sh 'make build-image version=' + "${env.DEPLOY_VERSION}"
}
}
stage('Parallel') {
when {
expression {
(!params.SKIP_VALIDATIONS && params.BRANCH != 'master')
}
}
parallel {
stage('Lint') {
steps {
sh 'make lint version=' + "${env.DEPLOY_VERSION}"
}
}
stage('Test Security') {
steps {
sh 'make test-security version=' + "${env.DEPLOY_VERSION}"
}
}
stage('Audit Dependency Safety'){
steps{
catchError(message: "${STAGE_NAME} is unstable", buildResult: 'SUCCESS', stageResult: 'UNSTABLE') {
sh 'make audit-dep-safety version=' + "${env.DEPLOY_VERSION}"
}
}
}
stage('Test dependencies') {
steps {
sh 'make test-deps version=' + "${env.DEPLOY_VERSION}"
}
}
stage('Test') {
steps {
sh 'make test version=' + "${env.DEPLOY_VERSION}"
}
}
}
}
stage('Deploy') {
steps {
script {
sh 'make deploy version=' + "${env.DEPLOY_VERSION}" + ' artifactory_credentials_file=$ARTIFACTORY_CREDENTIALS'
}
}
}
stage('Open Source Deployment') {
when {
expression {
(params.BRANCH == 'master' && !params.SKIP_OS_DEPLOYMENT)
}
}
stages {
stage('Sync Code with GitHub') {
steps {
script {
sh 'make sync-to-github version=' + "${env.DEPLOY_VERSION}" + ' git_credentials_file=$GIT_CREDENTIALS_LAK repository=lakehouse-engine'
}
}
}
stage('Deploy Docs to Github') {
steps {
script {
sh 'make deploy-docs-to-github version=' + "${env.DEPLOY_VERSION}" + ' git_credentials_file=$GIT_CREDENTIALS_LAK_DOCS repository=lakehouse-engine-docs os_deployment=True'
}
}
}
stage('Deploy to Pypi') {
steps {
script {
// we are forcing make build as it was not happening sometimes, for no reason.
sh 'make build os_deployment=True'
sh 'make deploy-to-pypi-and-clean os_deployment=True version=' + "${env.DEPLOY_VERSION}" + ' pypi_credentials_file=$PYPI_CREDENTIALS'
}
}
}
}
}
stage('Notify') {
when {
expression {
params.BRANCH == 'master' && params.NOTIFY
}
}
steps {
script {
params = readYaml file: 'cicd/meta.yaml'
release_notes = sh(script:'cat CHANGELOG.md | cut -d ")" -f 2 | head -n 10', returnStdout: true).trim()
recipients = params["mail_recipients"].join(";")
emailext(
attachLog: false,
compressLog: true,
body: """
<BR>A new version <b>$env.DEPLOY_VERSION</b> of the <b>Lakehouse Engine</b> was deployed into Artifactory.<BR><BR>
You can install it just like any other Python library, either notebook-scoped with pip install or cluster-scoped
by specifying the library in the cluster configuration.
You can check the lakehouse-engine documentation here: ${params["engine_docs"]}.
Check the latest updates here:<BR>
<pre>
${release_notes}
</pre><BR>
For more details, please check the complete changelog and/or the additional resources listed below:
<ul>
<li>${params["changelog_url"]}</li>
<li>${params["code_url"]}</li>
<li>${params["confluence_url"]}</li>
</ul>
""",
mimeType: 'text/html',
replyTo: "${params['reply_to']}",
from: "${params['from']}",
to: recipients,
subject: "Lakehouse Engine Updates - $env.DEPLOY_VERSION"
)
}
}
}
}
}
/**
* Get deployment git object (branch name or tag reference) given certain Jenkins parameters and the team's deployment guidelines.
* @return git object (branch or tag)
*/
def String getDeploymentGitObject() {
gitObject = params.BRANCH
if (params.BRANCH == 'master') {
if (params.VERSION ==~ '[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}') {
// force the git object to checkout to be a version tag
gitObject = "refs/tags/v${params.VERSION}"
return gitObject
}
else {
throw new Exception("Version ${params.VERSION} does not match valid git version tag. It should be in the form of <major>.<minor>.<patch>.")
}
} else {
return gitObject
}
}
/**
* Get deployment version given certain Jenkins parameters and the team's deployment guidelines.
* @return deployment version
*/
def String getDeploymentVersion() {
version = params.VERSION
if (params.BRANCH == 'master') {
if (version ==~ '[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}') {
return version
}
else {
throw new Exception("Version ${version} does not match valid git version tag. It should be in the form of <major>.<minor>.<patch>.")
}
} else {
// force branch as the version to be deployed when we are dealing with feature branches.
return params.BRANCH.replaceAll("[/-]", "_").toLowerCase()
}
}
================================================
FILE: cicd/bandit.yaml
================================================
assert_used:
skips: ['*test*']
================================================
FILE: cicd/code_doc/content.css
================================================
/*
This CSS file contains all style definitions for documentation content.
All selectors are scoped with ".pdoc".
This makes sure that the pdoc styling doesn't leak to the rest of the page when pdoc is embedded.
*/
.pdoc {
color: var(--text);
/* enforce some styling even if bootstrap reboot is not included */
box-sizing: border-box;
line-height: 1.5;
/* override background from pygments */
/*unnecessary since pdoc 10, only left here to keep old custom templates working. */
background: none;
}
.pdoc .pdoc-button {
cursor: pointer;
display: inline-block;
border: solid black 1px;
border-radius: 2px;
font-size: .75rem;
padding: calc(0.5em - 1px) 1em;
transition: 100ms all;
}
/* Admonitions */
.pdoc .pdoc-alert {
padding: 1rem 1rem 1rem calc(1.5rem + 24px);
border: 1px solid transparent;
border-radius: .25rem;
background-repeat: no-repeat;
background-position: 1rem center;
margin-bottom: 1rem;
}
.pdoc .pdoc-alert > *:last-child {
margin-bottom: 0;
}
/* Admonitions are currently not stylable via theme.css */
.pdoc .pdoc-alert-note {
color: #000000;
background-color: #f1efef;
border-color: #f1f1f1;
background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/info-circle-fill.svg' %}{% endfilter %}");
}
.pdoc .pdoc-alert-warning {
color: #664d03;
background-color: #fff3cd;
border-color: #ffecb5;
background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/exclamation-triangle-fill.svg' %}{% endfilter %}");
}
.pdoc .pdoc-alert-danger {
color: #842029;
background-color: #f8d7da;
border-color: #f5c2c7;
background-image: url("data:image/svg+xml,{% filter urlencode %}{% include 'resources/lightning-fill.svg' %}{% endfilter %}");
}
.pdoc .visually-hidden {
position: absolute !important;
width: 1px !important;
height: 1px !important;
padding: 0 !important;
margin: -1px !important;
overflow: hidden !important;
clip: rect(0, 0, 0, 0) !important;
white-space: nowrap !important;
border: 0 !important;
}
.pdoc h1, .pdoc h2, .pdoc h3 {
font-weight: 300;
margin: .3em 0;
padding: .2em 0;
}
.pdoc > section:not(.module-info) h1 {
font-size: 1.5rem;
font-weight: 500;
}
.pdoc > section:not(.module-info) h2 {
font-size: 1.4rem;
font-weight: 500;
}
.pdoc > section:not(.module-info) h3 {
font-size: 1.3rem;
font-weight: 500;
}
.pdoc > section:not(.module-info) h4 {
font-size: 1.2rem;
}
.pdoc > section:not(.module-info) h5 {
font-size: 1.1rem;
}
.pdoc a {
text-decoration: none;
color: var(--link);
}
.pdoc a:hover {
color: var(--link-hover);
}
.pdoc blockquote {
margin-left: 2rem;
}
.pdoc pre {
border-top: 1px solid var(--accent2);
border-bottom: 1px solid var(--accent2);
margin-top: 0;
margin-bottom: 1em;
padding: .5rem 0 .5rem .5rem;
overflow-x: auto;
/*unnecessary since pdoc 10, only left here to keep old custom templates working. */
background-color: var(--code);
}
.pdoc code {
color: var(--text);
padding: .2em .4em;
margin: 0;
font-size: 85%;
background-color: var(--accent);
border-radius: 6px;
}
.pdoc a > code {
color: inherit;
}
.pdoc pre > code {
display: inline-block;
font-size: inherit;
background: none;
border: none;
padding: 0;
}
.pdoc > section:not(.module-info) {
/* this margin should collapse with docstring margin,
but not for the module docstr which is followed by view_source. */
margin-bottom: 1.5rem;
}
/* Page Heading */
.pdoc .modulename {
margin-top: 0;
font-weight: bold;
}
.pdoc .modulename a {
color: var(--link);
transition: 100ms all;
}
/* GitHub Button */
.pdoc .git-button {
float: right;
border: solid var(--link) 1px;
}
.pdoc .git-button:hover {
background-color: var(--link);
color: var(--pdoc-background);
}
.view-source-toggle-state,
.view-source-toggle-state ~ .pdoc-code {
display: none;
}
.view-source-toggle-state:checked ~ .pdoc-code {
display: block;
}
.view-source-button {
display: inline-block;
float: right;
font-size: .75rem;
line-height: 1.5rem;
color: var(--muted);
padding: 0 .4rem 0 1.3rem;
cursor: pointer;
/* odd hack to reduce space between "bullet" and text */
text-indent: -2px;
}
.view-source-button > span {
visibility: hidden;
}
.module-info .view-source-button {
float: none;
display: flex;
justify-content: flex-end;
margin: -1.2rem .4rem -.2rem 0;
}
.view-source-button::before {
/* somewhat awkward recreation of a <summary> element. ideally we'd just use `display: inline list-item`, but
that does not work in Chrome (yet), see https://crbug.com/995106. */
position: absolute;
content: "View Source";
display: list-item;
list-style-type: disclosure-closed;
}
.view-source-toggle-state:checked ~ .attr .view-source-button::before,
.view-source-toggle-state:checked ~ .view-source-button::before {
list-style-type: disclosure-open;
}
/* Docstrings */
.pdoc .docstring {
margin-bottom: 1.5rem;
}
.pdoc section:not(.module-info) .docstring {
margin-left: clamp(0rem, 5vw - 2rem, 1rem);
}
.pdoc .docstring .pdoc-code {
margin-left: 1em;
margin-right: 1em;
}
/* Highlight focused element */
.pdoc h1:target,
.pdoc h2:target,
.pdoc h3:target,
.pdoc h4:target,
.pdoc h5:target,
.pdoc h6:target,
.pdoc .pdoc-code > pre > span:target {
background-color: var(--active);
box-shadow: -1rem 0 0 0 var(--active);
}
.pdoc .pdoc-code > pre > span:target {
/* make the highlighted line full width so that the background extends */
display: block;
}
.pdoc div:target > .attr,
.pdoc section:target > .attr,
.pdoc dd:target > a {
background-color: var(--active);
}
.pdoc * {
scroll-margin: 2rem;
}
.pdoc .pdoc-code .linenos {
user-select: none;
}
.pdoc .attr:hover {
filter: contrast(0.95);
}
/* Header link */
.pdoc section, .pdoc .classattr {
position: relative;
}
.pdoc .headerlink {
--width: clamp(1rem, 3vw, 2rem);
position: absolute;
top: 0;
left: calc(0rem - var(--width));
transition: all 100ms ease-in-out;
opacity: 0;
}
.pdoc .headerlink::before {
content: "#";
display: block;
text-align: center;
width: var(--width);
height: 2.3rem;
line-height: 2.3rem;
font-size: 1.5rem;
}
.pdoc .attr:hover ~ .headerlink,
.pdoc *:target > .headerlink,
.pdoc .headerlink:hover {
opacity: 1;
}
/* Attributes */
.pdoc .attr {
display: block;
margin: .5rem 0 .5rem;
padding: .4rem .4rem .4rem 1rem;
background-color: var(--accent);
overflow-x: auto;
}
.pdoc .classattr {
margin-left: 2rem;
}
.pdoc .name {
color: var(--name);
font-weight: bold;
}
.pdoc .def {
color: var(--def);
font-weight: bold;
}
.pdoc .signature {
/* override pygments background color */
background-color: transparent;
}
.pdoc .param, .pdoc .return-annotation {
white-space: pre;
}
.pdoc .signature.multiline .param {
display: block;
}
.pdoc .signature.condensed .param {
display:inline-block;
}
.pdoc .annotation {
color: var(--annotation);
}
/* Show/Hide buttons for long default values */
.pdoc .view-value-toggle-state,
.pdoc .view-value-toggle-state ~ .default_value {
display: none;
}
.pdoc .view-value-toggle-state:checked ~ .default_value {
display: inherit;
}
.pdoc .view-value-button {
font-size: .5rem;
vertical-align: middle;
border-style: dashed;
margin-top: -0.1rem;
}
.pdoc .view-value-button:hover {
background: white;
}
.pdoc .view-value-button::before {
content: "show";
text-align: center;
width: 2.2em;
display: inline-block;
}
.pdoc .view-value-toggle-state:checked ~ .view-value-button::before {
content: "hide";
}
/* Inherited Members */
.pdoc .inherited {
margin-left: 2rem;
}
.pdoc .inherited dt {
font-weight: 700;
}
.pdoc .inherited dt, .pdoc .inherited dd {
display: inline;
margin-left: 0;
margin-bottom: .5rem;
}
.pdoc .inherited dd:not(:last-child):after {
content: ", ";
}
.pdoc .inherited .class:before {
content: "class ";
}
.pdoc .inherited .function a:after {
content: "()";
}
/* Search results */
.pdoc .search-result .docstring {
overflow: auto;
max-height: 25vh;
}
.pdoc .search-result.focused > .attr {
background-color: var(--active);
}
/* "built with pdoc" attribution */
.pdoc .attribution {
margin-top: 2rem;
display: block;
opacity: 0.5;
transition: all 200ms;
filter: grayscale(100%);
}
.pdoc .attribution:hover {
opacity: 1;
filter: grayscale(0%);
}
.pdoc .attribution img {
margin-left: 5px;
height: 35px;
vertical-align: middle;
width: 70px;
transition: all 200ms;
}
.pdoc table {
display: block;
width: max-content;
max-width: 150%;
overflow: auto;
margin-bottom: 1rem;
}
.pdoc table th, .pdoc table td {
padding: 12px 13px;
border: 1px solid var(--accent2);
}
.pdoc table th {
font-weight: 600;
}
================================================
FILE: cicd/code_doc/custom_example_macros.py
================================================
"""Macro methods to be used on Lakehouse Engine Docs."""
import warnings
import json
import pygments.formatters.html
from markupsafe import Markup
STACK_LEVEL = 2
def _search_files(file: str, search_string: str) -> int:
"""Searches for a string and outputs the line.
Search for a given string in a file and output the line where it is first
found.
Args:
file: path of the file to be searched.
search_string: string that will be searched for.
Returns:
The number of the first line where a given search_string appears.
"""
range_lines = []
with open(file) as f:
for num, line in enumerate(f, 1):
if search_string in line:
range_lines.append(num - 1)
return range_lines[0]
def _link_example(method_name: str) -> str or None:
"""Searches for a link in a dict.
Searches for the link of a given method_name, in a specific config file and
outputs it.
Args:
method_name: name of the method to be searched for.
Returns:
None or the example link for the given method_name.
"""
if method_name in list(lakehouse_engine_examples.keys()):
file_link = lakehouse_engine_examples[str(method_name)]
return lakehouse_engine_examples["base_link"] + file_link if file_link != "" else None
else:
warnings.warn(
"No entry provided for the following transformer: "
+ method_name,
RuntimeWarning,
STACK_LEVEL,
)
return None
def _get_dict_transformer(dict_to_search: dict, transformer: str) -> str:
"""Searches for a transformer and returns the first dictionary occurrence.
Search for a given transformer in a dictionary and return the first occurrence.
Args:
dict_to_search: path of the file to be searched.
transformer: string that will be searched for.
Returns:
A JSON string of the first dictionary where a given transformer is found.
"""
dict_transformer = []
for spec in dict_to_search["transform_specs"]:
for transformer_dict in spec["transformers"]:
if transformer_dict["function"] == transformer:
dict_transformer.append(transformer_dict)
return json.dumps(dict_transformer[0], indent=4)
def _highlight_examples(method_name: str) -> str or None:
"""Creates a code snippet.
Constructs and exposes the code snippet of a given method_name.
Args:
method_name: name of the module to be searched for.
Returns:
None or the code snippet wrapped in html tags.
"""
for key, item in lakehouse_engine_examples.items():
if method_name == key:
file_path = f"../../{item}"
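# an empty entry in examples.json means no example exists for this method, so warn and skip it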
if file_path == "../../":
warnings.warn(
"No unit testing for the following transformer: " + method_name,
RuntimeWarning,
STACK_LEVEL,
)
return None
first_line = _search_files(file_path, f'"function": "{method_name}"')
with open(file_path) as json_file:
acon_file = json.load(json_file)
code_snippet = _get_dict_transformer(acon_file, method_name)
# Defining the lexer which will parse through the snippet of code we want
# to highlight
lexer = pygments.lexers.JsonLexer()
# Defining the format that will be outputted by the pygments library
# (on our case it will output the code within html tags)
formatter = pygments.formatters.html.HtmlFormatter(
linenos="inline",
anchorlinenos=True,
)
formatter.linenostart = first_line
return Markup(pygments.highlight(code_snippet, lexer, formatter))
def get_example(method_name: str) -> str:
"""Get example based on given argument.
Args:
method_name: name of the module to be searched for.
Returns:
An example.
"""
example_link = _link_example(method_name=method_name)
json_example = _highlight_examples(method_name=method_name)
if example_link:
return (
"""<details class="example">\n"""
f"""<summary>View Example of {method_name} (See full example <a href="{example_link}">here</a>)</summary>"""
f"""<div class="language-json highlight"><pre><span></span><code>{json_example}</code></pre></div>\n"""
"""</details>"""
)
else:
return ""
with open("./examples.json") as json_file:
lakehouse_engine_examples = json.load(json_file)
def define_env(env):
"Declare environment for jinja2 templates for markdown"
for fn in [get_example]:
env.macro(fn)
# get mkdocstrings' Python handler
python_handler = env.conf["plugins"]["mkdocstrings"].get_handler("python")
# get the `update_env` method of the Python handler
update_env = python_handler.update_env
# override the `update_env` method of the Python handler
def patched_update_env(md, config):
update_env(md, config)
# get the `convert_markdown` filter of the env
convert_markdown = python_handler.env.filters["convert_markdown"]
# build a chimera made of macros+mkdocstrings
def render_convert(markdown: str, *args, **kwargs):
return convert_markdown(env.render(markdown), *args, **kwargs)
# patch the filter
python_handler.env.filters["convert_markdown"] = render_convert
# patch the method
python_handler.update_env = patched_update_env
================================================
FILE: cicd/code_doc/examples.json
================================================
{
"base_link":"https://github.com/adidas/lakehouse-engine/blob/master/",
"get_max_value": "tests/resources/feature/delta_load/merge_options/update_column_set/batch_delta.json",
"with_row_id": "tests/resources/feature/transformations/chain_transformations/acons/streaming_batch.json",
"with_auto_increment_id": "tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json",
"with_literals": "tests/resources/feature/transformations/column_creators/batch.json",
"cast": "tests/resources/feature/schema_evolution/delta_load/batch_delta_disabled.json",
"column_selector": "",
"flatten_schema": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json",
"explode_columns": "tests/resources/feature/transformations/column_reshapers/explode_arrays/batch.json",
"with_expressions": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json",
"rename": "tests/resources/feature/schema_evolution/append_load/batch_append_disabled.json",
"from_avro": "",
"from_avro_with_registry": "",
"from_json": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json",
"to_json": "tests/resources/feature/transformations/column_reshapers/flatten_schema/batch.json",
"condense_record_mode_cdc": "tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_init.json",
"group_and_rank": "tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json",
"hash_masker": "tests/resources/feature/transformations/data_maskers/hash_masking.json",
"column_dropper": "tests/resources/feature/transformations/data_maskers/drop_columns.json",
"add_current_date": "tests/resources/feature/transformations/date_transformers/streaming.json",
"convert_to_date": "tests/resources/feature/transformations/date_transformers/streaming.json",
"convert_to_timestamp": "tests/resources/feature/transformations/date_transformers/streaming.json",
"format_date": "tests/resources/feature/transformations/date_transformers/streaming.json",
"get_date_hierarchy": "tests/resources/feature/transformations/date_transformers/streaming.json",
"incremental_filter": "tests/resources/feature/delta_load/record_mode_cdc/backfill/batch_delta.json",
"expression_filter": "tests/resources/feature/full_load/with_filter/batch.json",
"column_filter_exp": "tests/resources/feature/transformations/multiple_transform/batch.json",
"join": "tests/resources/feature/transformations/joiners/batch.json",
"replace_nulls": "tests/resources/feature/transformations/null_handlers/replace_nulls_col_subset.json",
"with_regex_value": "tests/resources/feature/delta_load/group_and_rank/fail_with_duplicates_in_same_file/batch_delta.json",
"coalesce": "tests/resources/feature/writers/acons/write_batch_console.json",
"repartition": "tests/resources/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming_delta.json",
"get_transformer": "",
"with_watermark": "tests/resources/feature/transformations/watermarker/streaming_drop_duplicates_overall_watermark/streaming_drop_duplicates_overall_watermark.json"
}
================================================
FILE: cicd/code_doc/gen_ref_nav.py
================================================
"""Module to generate code reference docs."""
# Import necessary libraries
from pathlib import Path
import mkdocs_gen_files
# Create a new navigation structure
nav = mkdocs_gen_files.Nav()
# Define the root directory and the source directory
root = Path(__file__).parent
src = root / "mkdocs/lakehouse_engine"
print(f"Looking for files in {src}")
# Loop over all Python files in the source directory
for path in sorted(src.rglob("*.py")):
# Get the module path and the documentation path for each file
module_path = path.relative_to(src).with_suffix("")
doc_path = path.relative_to(src / "").with_suffix(".md")
full_doc_path = Path("reference", doc_path)
# Split the module path into parts
parts = tuple(module_path.parts)
# Skip files that start with an underscore or have no parts
if not parts:
continue
# If the file is an __init__.py file, remove the last part and rename the doc file to index.md
if parts[-1] == "__init__" and str(parts[:-1]) != "()":
parts = parts[:-1]
doc_path = doc_path.with_name("index.md")
full_doc_path = full_doc_path.with_name("index.md")
elif parts[-1].startswith("_"):
continue
# Skip the loop iteration if there is no doc path
if not doc_path:
continue
# If the doc path has at least one part, add it to the navigation
if len(doc_path.parts) >= 1:
nav_parts = [f"{part}" for part in parts]
nav[tuple(nav_parts)] = doc_path.as_posix()
# Open the full doc path and write the module identifier to it
with mkdocs_gen_files.open(full_doc_path, "w") as fd:
ident = ".".join(parts)
fd.write(f"::: {ident}")
# Set the edit path for the file
mkdocs_gen_files.set_edit_path(
full_doc_path, ".." / path.relative_to(root))
# Open the index.md file and write the built navigation to it
with mkdocs_gen_files.open("reference/index.md", "w") as nav_file:
nav_file.writelines(nav.build_literate_nav())
================================================
FILE: cicd/code_doc/index.html.jinja2
================================================
{% set root_module_name = "" %}
{% extends "default/index.html.jinja2" %}
{% block title %}Lakehouse Engine Documentation{% endblock %}
{% block nav %}
<img src="{{ logo }}" class="logo" alt="project logo"/>
<input type="search" placeholder="Search..." role="searchbox" aria-label="search"
pattern=".+" required>
<h2>Available Modules</h2>
<ul>
{% for submodule in all_modules if "." not in submodule and not submodule.startswith("_") %}
<li><a href="{{ submodule.replace(".","/") }}.html">{{ submodule.replace("_"," ").title() }}</a></li>
{% endfor %}
</ul>
{% endblock %}
{% block content %}
<header class="pdoc">
<h1>Lakehouse Engine Documentation</h1>
</header>
<main class="pdoc">
{% filter to_html %}
{% include "README.md" %}
{% endfilter %}
</main>
{% if search %}
{% include "search.html.jinja2" %}
{% endif %}
{% endblock %}
================================================
FILE: cicd/code_doc/mkdocs.yml
================================================
site_name: Lakehouse Engine Documentation
site_url: https://adidas.github.io/lakehouse-engine-docs
repo_url: https://github.com/adidas/lakehouse-engine
repo_name: lakehouse-engine
docs_dir: "mkdocs/docs"
nav:
- Lakehouse Engine: index.md
- How to use the Lakehouse Engine?:
- Overview: lakehouse_engine_usage/lakehouse_engine_usage.md
- Algorithms:
- Data Loader:
- Overview: lakehouse_engine_usage/data_loader/data_loader.md
- Scenarios:
- Append Load from JDBC with PERMISSIVE mode (default): lakehouse_engine_usage/data_loader/append_load_from_jdbc_with_permissive_mode/append_load_from_jdbc_with_permissive_mode.md
- Append Load with FAILFAST: lakehouse_engine_usage/data_loader/append_load_with_failfast/append_load_with_failfast.md
- Batch Delta Load Init, Delta and Backfill with Merge: lakehouse_engine_usage/data_loader/batch_delta_load_init_delta_backfill_with_merge/batch_delta_load_init_delta_backfill_with_merge.md
- Custom Transformer: lakehouse_engine_usage/data_loader/custom_transformer/custom_transformer.md
- Custom Transformer (SQL): lakehouse_engine_usage/data_loader/custom_transformer_sql/custom_transformer_sql.md
- Extract from SAP B4 ADSOs: lakehouse_engine_usage/data_loader/extract_from_sap_b4_adso/extract_from_sap_b4_adso.md
- Extract from SAP BW DSOs: lakehouse_engine_usage/data_loader/extract_from_sap_bw_dso/extract_from_sap_bw_dso.md
- Extract from SFTP: lakehouse_engine_usage/data_loader/extract_from_sftp/extract_from_sftp.md
- Extract using JDBC connection: lakehouse_engine_usage/data_loader/extract_using_jdbc_connection/extract_using_jdbc_connection.md
- Filtered Full Load: lakehouse_engine_usage/data_loader/filtered_full_load/filtered_full_load.md
- Filtered Full Load with Selective Replace: lakehouse_engine_usage/data_loader/filtered_full_load_with_selective_replace/filtered_full_load_with_selective_replace.md
- Flatten Schema and Explode Columns: lakehouse_engine_usage/data_loader/flatten_schema_and_explode_columns/flatten_schema_and_explode_columns.md
- Full Load: lakehouse_engine_usage/data_loader/full_load/full_load.md
- Read from Dataframe: lakehouse_engine_usage/data_loader/read_from_dataframe/read_from_dataframe.md
- Read from Sharepoint: lakehouse_engine_usage/data_loader/read_from_sharepoint/read_from_sharepoint.md
- Streaming Append Load with DROPMALFORMED: lakehouse_engine_usage/data_loader/streaming_append_load_with_malformed/streaming_append_load_with_malformed.md
- Streaming Append Load with Optimize Dataset Terminator: lakehouse_engine_usage/data_loader/streaming_append_load_with_terminator/streaming_append_load_with_terminator.md
- Streaming Delta Load with Group and Rank Condensation: lakehouse_engine_usage/data_loader/streaming_delta_load_with_group_and_rank_condensation/streaming_delta_load_with_group_and_rank_condensation.md
- Streaming Delta Load with Late Arriving and Out of Order Events (with and without watermarking): lakehouse_engine_usage/data_loader/streaming_delta_with_late_arriving_and_out_of_order_events/streaming_delta_with_late_arriving_and_out_of_order_events.md
- Write and Read Dataframe: lakehouse_engine_usage/data_loader/write_and_read_dataframe/write_and_read_dataframe.md
- Write to Console: lakehouse_engine_usage/data_loader/write_to_console/write_to_console.md
- Write to REST API: lakehouse_engine_usage/data_loader/write_to_rest_api/write_to_rest_api.md
- Write to Sharepoint: lakehouse_engine_usage/data_loader/write_to_sharepoint/write_to_sharepoint.md
- Data Quality:
- Overview: lakehouse_engine_usage/data_quality/data_quality.md
- Scenarios:
- Custom Expectations: lakehouse_engine_usage/data_quality/custom_expectations/custom_expectations.md
- Data Quality Validator: lakehouse_engine_usage/data_quality/data_quality_validator/data_quality_validator.md
- Minimal Example: lakehouse_engine_usage/data_quality/minimal_example/minimal_example.md
- Prisma: lakehouse_engine_usage/data_quality/prisma/prisma.md
- Result Sink: lakehouse_engine_usage/data_quality/result_sink/result_sink.md
- Row Tagging: lakehouse_engine_usage/data_quality/row_tagging/row_tagging.md
- Validations Failing: lakehouse_engine_usage/data_quality/validations_failing/validations_failing.md
- Reconciliator:
- Overview: lakehouse_engine_usage/reconciliator/reconciliator.md
- Sensors:
- Overview: lakehouse_engine_usage/sensors/sensors.md
- Sensor:
- Overview: lakehouse_engine_usage/sensors/sensor/sensor.md
- Supported Sources:
- Delta Table: lakehouse_engine_usage/sensors/sensor/delta_table/delta_table.md
- Sensor from other Sensor Delta Table: lakehouse_engine_usage/sensors/sensor/delta_upstream_sensor_table/delta_upstream_sensor_table.md
- Sensor from Files: lakehouse_engine_usage/sensors/sensor/file/file.md
- Sensor from JDBC: lakehouse_engine_usage/sensors/sensor/jdbc_table/jdbc_table.md
- Sensor from Kafka: lakehouse_engine_usage/sensors/sensor/kafka/kafka.md
- Sensor from SAP: lakehouse_engine_usage/sensors/sensor/sap_bw_b4/sap_bw_b4.md
- Update Sensor control Delta Table after processing the data: lakehouse_engine_usage/sensors/sensor/update_sensor_status/update_sensor_status.md
- Heartbeat Sensor:
- Overview: lakehouse_engine_usage/sensors/heartbeat/heartbeat.md
- Supported Sources:
- Delta Table: lakehouse_engine_usage/sensors/heartbeat/delta_table/delta_table.md
- Kafka: lakehouse_engine_usage/sensors/heartbeat/kafka/kafka.md
- Manual Table: lakehouse_engine_usage/sensors/heartbeat/manual_table/manual_table.md
- SAP BW/4HANA: lakehouse_engine_usage/sensors/heartbeat/sap_bw_b4/sap_bw_b4.md
- Trigger File: lakehouse_engine_usage/sensors/heartbeat/trigger_file/trigger_file.md
- Feed Heartbeat Sensor Control Delta Table: lakehouse_engine_usage/sensors/heartbeat/heartbeat_sensor_data_feed/heartbeat_sensor_data_feed.md
- Update Heartbeat Sensor control Delta Table after processing the data: lakehouse_engine_usage/sensors/heartbeat/update_heartbeat_sensor_status/update_heartbeat_sensor_status.md
- GAB:
- Overview: lakehouse_engine_usage/gab/gab.md
- Step-by-Step: lakehouse_engine_usage/gab/step_by_step/step_by_step.md
- Tools:
- Table & File Manager Helper: lakehouse_engine_usage/managerhelper/managerhelper.md
- API Documentation: reference/ # (1)!
theme:
name: material
language: en
logo: assets/img/lakehouse_engine_logo.png
favicon: assets/img/lakehouse_engine_logo_symbol_large.png
icon:
repo: fontawesome/brands/github-alt
palette:
- media: "(prefers-color-scheme: light)"
scheme: default
primary: blue
accent: yellow
toggle:
icon: material/toggle-switch
name: Switch to dark mode
- media: "(prefers-color-scheme: dark), (prefers-color-scheme: no-preference)"
scheme: slate
primary: blue
accent: yellow
toggle:
icon: material/toggle-switch-off
name: Switch to light mode
features:
- content.code.annotate
- content.code.annotation
- content.code.copy
- content.code.select
- content.tabs.link
- content.tooltips
- navigation.indexes
- navigation.path
- navigation.tabs
- navigation.tabs.instant
- navigation.tabs.sticky
- navigation.top
- navigation.sections
- toc.follow
- toc.integrate
- search.highlight
- search.suggest
extra:
social:
- icon: fontawesome/brands/github-alt
link: https://adidas.github.io/lakehouse-engine
version:
provider: mike
name: Version
plugins:
- search
- markdown-exec
- offline
- section-index
- mkdocstrings:
enabled: !ENV [ENABLE_MKDOCSTRINGS, true]
default_handler: python
handlers:
python:
paths: [mkdocs/lakehouse_engine]
options:
show_source: true
- macros:
module_name: mkdocs_macros
- gen-files:
scripts:
- gen_ref_nav.py
- literate-nav:
nav_file: SUMMARY.md
- mike:
alias_type: symlink
canonical_version: latest
extra:
social:
- icon: fontawesome/brands/github-alt
link: https://adidas.github.io/lakehouse-engine
markdown_extensions:
- admonition
- attr_list
- extra
- footnotes
- markdown_include.include:
base_path: mkdocs/docs
- md_in_html
- pymdownx.arithmatex:
generic: true
- pymdownx.details
- pymdownx.emoji:
emoji_index: !!python/name:materialx.emoji.twemoji
emoji_generator: !!python/name:materialx.emoji.to_svg
- pymdownx.highlight:
anchor_linenums: true
line_spans: __span
pygments_lang_class: true
- pymdownx.inlinehilite
- pymdownx.mark
- pymdownx.tabbed:
alternate_style: true
- pymdownx.snippets
- pymdownx.superfences:
custom_fences:
- name: mermaid
class: mermaid
format: !!python/name:pymdownx.superfences.fence_code_format ''
- toc:
permalink: true
copyright: |
© 2025 <a href="https://github.com/adidas" target="_blank" rel="noopener">adidas</a>
================================================
FILE: cicd/code_doc/mkdocs_macros.py
================================================
"""Macro methods to be used on Lakehouse Engine Docs."""
import warnings
import json
import pygments.formatters.html
from markupsafe import Markup
STACK_LEVEL = 2
def _search_files(file: str, search_string: str) -> int:
"""Searches for a string and outputs the line.
Search for a given string in a file and output the line where it is first
found.
Args:
file: path of the file to be searched.
search_string: string that will be searched for.
Returns:
The number of the first line where a given search_string appears.
"""
range_lines = []
with open(file) as f:
for num, line in enumerate(f, 1):
if search_string in line:
range_lines.append(num - 1)
return range_lines[0]
def _link_example(method_name: str) -> str or None:
"""Searches for a link in a dict.
Searches for the link of a given method_name, in a specific config file and
outputs it.
Args:
method_name: name of the method to be searched for.
Returns:
None or the example link for the given method_name.
"""
if method_name in list(lakehouse_engine_examples.keys()):
file_link = lakehouse_engine_examples[str(method_name)]
return lakehouse_engine_examples["base_link"] + file_link if file_link != "" else None
else:
warnings.warn(
"No entry provided for the following transformer: "
+ method_name,
RuntimeWarning,
STACK_LEVEL,
)
return None
def _get_dict_transformer(dict_to_search: dict, transformer: str) -> str:
"""Searches for a transformer and returns the first dictionary occurrence.
Search for a given transformer in a dictionary and return the first occurrence.
Args:
dict_to_search: path of the file to be searched.
transformer: string that will be searched for.
Returns:
A JSON string of the first dictionary where a given transformer is found.
"""
dict_transformer = []
for spec in dict_to_search["transform_specs"]:
for transformer_dict in spec["transformers"]:
if transformer_dict["function"] == transformer:
dict_transformer.append(transformer_dict)
return json.dumps(dict_transformer[0], indent=4)
def _highlight_examples(method_name: str) -> str or None:
"""Creates a code snippet.
Constructs and exposes the code snippet of a given method_name.
Args:
method_name: name of the module to be searched for.
Returns:
None or the code snippet wrapped in html tags.
"""
for key, item in lakehouse_engine_examples.items():
if method_name == key:
file_path = f"../../{item}"
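# an empty entry in examples.json means no example exists for this method, so warn and skip it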
if file_path == "../../":
warnings.warn(
"No unit testing for the following transformer: " + method_name,
RuntimeWarning,
STACK_LEVEL,
)
return None
first_line = _search_files(file_path, f'"function": "{method_name}"')
with open(file_path) as json_file:
acon_file = json.load(json_file)
code_snippet = _get_dict_transformer(acon_file, method_name)
# Defining the lexer which will parse through the snippet of code we want
# to highlight
lexer = pygments.lexers.JsonLexer()
# Defining the format that will be outputted by the pygments library
# (on our case it will output the code within html tags)
formatter = pygments.formatters.html.HtmlFormatter(
linenos="inline",
anchorlinenos=True,
)
formatter.linenostart = first_line
return Markup(pygments.highlight(code_snippet, lexer, formatter))
def get_example(method_name: str) -> str:
"""Get example based on given argument.
Args:
method_name: name of the module to be searched for.
Returns:
An example.
"""
example_link = _link_example(method_name=method_name)
json_example = _highlight_examples(method_name=method_name)
if example_link:
return (
"""<details class="example">\n"""
f"""<summary>View Example of {method_name} (See full example <a href="{example_link}">here</a>)</summary>"""
f"""<div class="language-json highlight"><pre><span></span><code>{json_example}</code></pre></div>\n"""
"""</details>"""
)
else:
return ""
with open("./examples.json") as json_file:
lakehouse_engine_examples = json.load(json_file)
def format_operations_table(operations_dict: dict) -> str:
"""Format operations dictionary into a markdown table.
Args:
operations_dict: Dictionary containing operations and their parameters.
Returns:
A markdown formatted table with operation details.
"""
if not operations_dict:
return ""
markdown_output = "\n\n**Available Operations:**\n\n"
markdown_output += "| Operation | Parameters | Type | Mandatory |\n"
markdown_output += "|-----------|------------|------|----------|\n"
for operation, params in sorted(operations_dict.items()):
if not params:
markdown_output += f"| `{operation}` | - | - | - |\n"
else:
first_param = True
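# show the operation name only on its first row; subsequent parameter rows leave that column blank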
for param_name, param_info in params.items():
if first_param:
markdown_output += f"| `{operation}` | `{param_name}` | {param_info.get('type', 'N/A')} | {param_info.get('mandatory', False)} |\n"
first_param = False
else:
markdown_output += f"| | `{param_name}` | {param_info.get('type', 'N/A')} | {param_info.get('mandatory', False)} |\n"
return markdown_output
def get_table_manager_operations() -> str:
"""Get formatted table of TableManager operations.
Returns:
A markdown formatted table with TableManager operations.
"""
from lakehouse_engine.core.definitions import TABLE_MANAGER_OPERATIONS
return format_operations_table(TABLE_MANAGER_OPERATIONS)
def get_file_manager_operations() -> str:
"""Get formatted table of FileManager operations.
Returns:
A markdown formatted table with FileManager operations.
"""
from lakehouse_engine.core.definitions import FILE_MANAGER_OPERATIONS
return format_operations_table(FILE_MANAGER_OPERATIONS)
def define_env(env):
"Declare environment for jinja2 templates for markdown"
for fn in [get_example, get_table_manager_operations, get_file_manager_operations]:
env.macro(fn)
# get mkdocstrings' Python handler
python_handler = env.conf["plugins"]["mkdocstrings"].get_handler("python")
# get the `update_env` method of the Python handler
update_env = python_handler.update_env
# override the `update_env` method of the Python handler
def patched_update_env(config):
update_env(config)
# get the `convert_markdown` filter of the env
convert_markdown = python_handler.env.filters["convert_markdown"]
# build a chimera made of macros+mkdocstrings
def render_convert(markdown: str, *args, **kwargs):
return convert_markdown(env.render(markdown), *args, **kwargs)
# patch the filter
python_handler.env.filters["convert_markdown"] = render_convert
# patch the method
python_handler.update_env = patched_update_env
================================================
FILE: cicd/code_doc/module.html.jinja2
================================================
{#
On this Jinja template we're extending a pre-existing template,
copying the block on which we would like to make changes and
adding both the "View Example" summary tag and the "View Full Acon" button.
#}
{% extends "default/module.html.jinja2" %}
{% block title %}{{ module.modulename }}{% endblock %}
{% block nav_submodules %}
{% if module.submodules %}
<h2>Submodules</h2>
<ul>
{% for submodule in module.submodules if is_public(submodule) | trim %}
<li><a href="./{{ module.name }}/{{ submodule.name }}.html">{{ submodule.name.replace("_"," ").title() }}</a></li>
{% endfor %}
</ul>
{% endif %}
{% endblock %}
{% block module_contents %}
{% for m in module.flattened_own_members if is_public(m) | trim %}
<section id="{{ m.qualname or m.name }}">
{{ member(m) }}
{% if m.type == "class" %}
{% for m in m.own_members if m.type != "class" and is_public(m) | trim %}
<div id="{{ m.qualname }}" class="classattr">
{{ member(m) }}
{% if m.fullname | highlight_examples %}
{{ view_example(m.fullname) }}
{% endif %}
{% if m.fullname | link_example %}
{{ view_full_acon(m.fullname) }}
{% endif %}
</div>
{% endfor %}
{% set inherited_members = inherited(m) | trim %}
{% if inherited_members %}
<div class="inherited">
<h5>Inherited Members</h5>
<dl>
{{ inherited_members }}
</dl>
</div>
{% endif %}
{% endif %}
</section>
{% endfor %}
{% endblock %}
{% block attribution %}
{% endblock %}
{% block module_info %}
<section class="module-info">
{% block edit_button %}
{% if edit_url %}
{% if "github.com" in edit_url %}
{% set edit_text = "Edit on GitHub" %}
{% elif "gitlab" in edit_url %}
{% set edit_text = "Edit on GitLab" %}
{% else %}
{% set edit_text = "Edit Source" %}
{% endif %}
<a class="pdoc-button git-button" href="{{ edit_url }}">{{ edit_text }}</a>
{% endif %}
{% endblock %}
{% if "lakehouse_engine" == module.modulename.split(".")[0] %}
{{ module_name() }}
{% endif %}
{{ docstring(module) }}
{% if "lakehouse_engine" == module.modulename.split(".")[0] %}
{{ view_source_state(module) }}
{{ view_source_button(module) }}
{{ view_source_code(module) }}
{% endif %}
</section>
{% endblock %}
{#
On this macro we're creating the "View Example" structure.
#}
{% defaultmacro view_example(doc) %}
<details>
<summary>View Example</summary>
{{ doc | highlight_examples }}
</details>
{% enddefaultmacro %}
{#
On this macro we're creating the "View Full Acon" structure.
#}
{% defaultmacro view_full_acon(doc) %}
<section>
{% set edit_text = "View Full Acon" %}
<a class="pdoc-button git-button" href="{{ doc | link_example }}" target="_blank">{{ edit_text }}</a>
</section>
</br>
</br>
{% enddefaultmacro %}
================================================
FILE: cicd/code_doc/render_doc.py
================================================
"""Module for customizing pdoc documentation."""
import json
import os
import shutil
import warnings
from pathlib import Path
import pygments.formatters.html
from markupsafe import Markup
from pdoc import pdoc, render
STACK_LEVEL = 2
logo_path = (
"https://github.com/adidas/lakehouse-engine/blob/master/assets/img/"
"lakehouse_engine_logo_no_bg_160.png?raw=true"
)
def _get_project_version() -> str:
version = (
os.popen(
"cat cicd/.bumpversion.cfg | grep 'current_version =' | cut -f 3 -d ' '"
)
.read()
.replace("\n", "")
)
return version
def _search_files(file: str, search_string: str) -> int:
"""Searches for a string and outputs the line.
Search for a given string in a file and output the line where it is first
found.
:param file: path of the file to be searched.
:param search_string: string that will be searched for.
:returns: the number of the first line where a given search_string appears.
"""
range_lines = []
with open(file) as f:
for num, line in enumerate(f, 1):
if search_string in line:
range_lines.append(num - 1)
return range_lines[0]
def _get_dict_transformer(dict_to_search: dict, transformer: str) -> str:
"""Searches for a transformer and returns the first dictionary occurrence.
Search for a given transformer in a dictionary and return the first occurrence.
:param dict_to_search: path of the file to be searched.
:param transformer: string that will be searched for.
:returns: a JSON string of the first dictionary where a given transformer is found.
"""
dict_transformer = []
for spec in dict_to_search["transform_specs"]:
for transformer_dict in spec["transformers"]:
if transformer_dict["function"] == transformer:
dict_transformer.append(transformer_dict)
return json.dumps(dict_transformer[0], indent=4)
def _link_example(module_name: str) -> str or None:
"""Searches for a link in a dict.
Searches for the link of a given module_name, in a specific config file and
outputs it.
:param module_name: name of the module to be searched for.
:returns: None or the example link for the given module_name.
"""
if module_name in list(link_dict.keys()):
file_link = link_dict[str(module_name)]
return link_dict["base_link"] + file_link if file_link != "" else None
else:
return None
def _highlight_examples(module_name: str) -> str or None:
"""Creates a code snippet.
Constructs and exposes the code snippet of a given module_name.
:param module_name: name of the module to be searched for.
:returns: None or the code snippet wrapped in html tags.
"""
transformers_to_ignore = [
"UNSUPPORTED_STREAMING_TRANSFORMERS",
"AVAILABLE_TRANSFORMERS",
"__init__",
]
if module_name.split(".")[1] == "transformers":
if m
SYMBOL INDEX (1206 symbols across 194 files)
FILE: assets/gab/metadata/tables/dim_calendar.sql
type `database` (line 2) | CREATE EXTERNAL TABLE `database`.dim_calendar (
FILE: assets/gab/metadata/tables/dummy_sales_kpi.sql
type `database` (line 2) | CREATE EXTERNAL TABLE `database`.`dummy_sales_kpi` (
FILE: assets/gab/metadata/tables/gab_log_events.sql
type `database` (line 2) | CREATE EXTERNAL TABLE `database`.`gab_log_events`
FILE: assets/gab/metadata/tables/gab_use_case_results.sql
type `database` (line 2) | CREATE EXTERNAL TABLE `database`.`gab_use_case_results`
FILE: assets/gab/metadata/tables/lkp_query_builder.sql
type `database` (line 2) | CREATE EXTERNAL TABLE `database`.`lkp_query_builder`
FILE: assets/gab/notebooks/gab.py
function flatten_extend (line 19) | def flatten_extend(list_to_flatten: list) -> list:
FILE: assets/gab/notebooks/gab_job_manager.py
function divide_chunks (line 56) | def divide_chunks(input_list: list, max_number_of_jobs: int) -> list:
function get_run_regions (line 81) | def get_run_regions(job_schedule: dict, job_info: dict) -> list:
FILE: assets/gab/utils/databricks_job_utils.py
class BearerAuth (line 12) | class BearerAuth:
method __init__ (line 15) | def __init__(self, token):
method __call__ (line 19) | def __call__(self, r):
class ResultState (line 28) | class ResultState(str, enum.Enum):
class DatabricksJobs (line 37) | class DatabricksJobs:
method __init__ (line 51) | def __init__(self, databricks_instance: str, auth: str):
method _check_response (line 63) | def _check_response(response):
method list_jobs (line 67) | def list_jobs(self, name: str = None, limit: int = 20, offset: int = 0...
method run_now (line 95) | def run_now(self, job_id: int, notebook_params: dict, idempotency_toke...
method get_output (line 123) | def get_output(self, run_id: int) -> dict:
method get_job (line 147) | def get_job(self, run_id: int) -> dict:
method cancel_job (line 166) | def cancel_job(self, run_id: int) -> dict:
method trigger_job_by_name (line 185) | def trigger_job_by_name(self, job_name: str, notebook_params: dict, id...
method get_job_status (line 202) | def get_job_status(self, run_id: int) -> Tuple[bool, dict]:
method job_id_extraction (line 215) | def job_id_extraction(self, job_name: str) -> int:
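
BearerAuth above follows the common callable-auth pattern for requests: the instance holds a token and __call__ attaches it to each outgoing request. A minimal sketch of that pattern, with a hypothetical workspace URL in the usage comment (not the repository's exact class):

import requests


class BearerAuth:
    """Callable auth object that adds a Bearer token to outgoing requests."""

    def __init__(self, token: str):
        self.token = token

    def __call__(self, r: requests.PreparedRequest) -> requests.PreparedRequest:
        # requests hands the prepared request to the auth callable; we set
        # the Authorization header and return the request otherwise unchanged.
        r.headers["Authorization"] = f"Bearer {self.token}"
        return r


# Usage sketch (hypothetical host and job id):
# requests.get(
#     "https://my-workspace.cloud.databricks.com/api/2.1/jobs/get",
#     params={"job_id": 123},
#     auth=BearerAuth("dapi-example-token"),
# )
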
FILE: assets/gab/utils/query_builder_utils.py
class QueryBuilderUtils (line 8) | class QueryBuilderUtils:
method __init__ (line 11) | def __init__(self):
method check_config_inputs (line 16) | def check_config_inputs(
method create_sql_statement (line 112) | def create_sql_statement(
method get_dimensions (line 188) | def get_dimensions(self, num_dimensions: str) -> str:
method get_recon_choices (line 219) | def get_recon_choices(cls) -> list:
method get_metric_configuration (line 266) | def get_metric_configuration(cls, num_of_metrics: str) -> dict:
method get_recon_config (line 335) | def get_recon_config(self, recon_list: list) -> dict:
method get_stages (line 385) | def get_stages(self, sql_files_list: list, usecase_name: str) -> dict:
method get_view_information (line 437) | def get_view_information(self, num_of_views: str) -> dict:
method insert_data_into_lkp_query_builder (line 467) | def insert_data_into_lkp_query_builder(cls, delete_sttmt: str, insert_...
method print_definitions (line 483) | def print_definitions(
method set_dimensions (line 549) | def set_dimensions(cls, num_dimensions: str):
method set_extra_metric_config (line 564) | def set_extra_metric_config(self, num_of_metrics: str, metrics_dict: d...
method set_metric (line 582) | def set_metric(cls, num_of_metrics: str):
method set_stages (line 601) | def set_stages(self, sql_files: list) -> list:
method set_views (line 647) | def set_views(cls, num_of_views: str):
method _format_keys_list (line 664) | def _format_keys_list(cls, key_str: str) -> list:
method _generate_query_id (line 682) | def _generate_query_id(cls, usecase_name: str) -> int:
method _get_mapping (line 699) | def _get_mapping(cls, dims_dict: dict, dimensions: str, from_date: str...
method _print_dims_dict (line 731) | def _print_dims_dict(cls, dims_dict: dict):
method _print_derived_metrics (line 747) | def _print_derived_metrics(cls, key_metrics: str, derived_metric: str,...
method _print_metrics_dict (line 767) | def _print_metrics_dict(self, key_metrics: str, metrics_dict: dict):
method _print_recon_dict (line 797) | def _print_recon_dict(cls, recon_dict: dict):
method _print_stages_dict (line 822) | def _print_stages_dict(cls, stages_dict: dict):
method _sort_files (line 844) | def _sort_files(cls, sql_files: str) -> list:
method _validate_metrics_config (line 870) | def _validate_metrics_config(cls, calc_metric: str, metrics_dict: dict...
FILE: cicd/code_doc/custom_example_macros.py
function _search_files (line 10) | def _search_files(file: dict, search_string: str) -> list:
function _link_example (line 31) | def _link_example(method_name: str) -> str or None:
function _get_dict_transformer (line 58) | def _get_dict_transformer(dict_to_search: dict, transformer: str) -> dict:
function _highlight_examples (line 79) | def _highlight_examples(method_name: str) -> str or None:
function get_example (line 120) | def get_example(method_name: str) -> str:
function define_env (line 146) | def define_env(env):
FILE: cicd/code_doc/mkdocs_macros.py
function _search_files (line 10) | def _search_files(file: dict, search_string: str) -> list:
function _link_example (line 31) | def _link_example(method_name: str) -> str or None:
function _get_dict_transformer (line 58) | def _get_dict_transformer(dict_to_search: dict, transformer: str) -> dict:
function _highlight_examples (line 79) | def _highlight_examples(method_name: str) -> str or None:
function get_example (line 120) | def get_example(method_name: str) -> str:
function format_operations_table (line 147) | def format_operations_table(operations_dict: dict) -> str:
function get_table_manager_operations (line 178) | def get_table_manager_operations() -> str:
function get_file_manager_operations (line 188) | def get_file_manager_operations() -> str:
function define_env (line 198) | def define_env(env):
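
Both macro modules above expose their helpers to the docs build through define_env, the hook that the mkdocs-macros plugin calls at build time. A minimal sketch of that hook shape, with a placeholder macro body instead of the project's examples.json lookup:

def define_env(env):
    """Hook invoked by the mkdocs-macros plugin when the docs are built."""

    @env.macro
    def get_example(method_name: str) -> str:
        # Placeholder body: the real macro resolves the snippet for the given
        # method; here we only echo the requested name back into the page.
        return f"<!-- example for {method_name} -->"
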
FILE: cicd/code_doc/render_doc.py
function _get_project_version (line 21) | def _get_project_version() -> str:
function _search_files (line 32) | def _search_files(file: dict, search_string: str) -> list:
function _get_dict_transformer (line 51) | def _get_dict_transformer(dict_to_search: dict, transformer: str) -> dict:
function _link_example (line 69) | def _link_example(module_name: str) -> str or None:
function _highlight_examples (line 86) | def _highlight_examples(module_name: str) -> str or None:
FILE: cicd/code_doc/render_docs.py
function _copy_documentation (line 44) | def _copy_documentation(directories: list = "", files: list = ""):
FILE: lakehouse_engine/algorithms/algorithm.py
class Algorithm (line 14) | class Algorithm(Executable):
method __init__ (line 17) | def __init__(self, acon: dict):
method get_dq_spec (line 26) | def get_dq_spec(
method _get_dq_functions (line 120) | def _get_dq_functions(spec: dict, function_key: str) -> List[DQFunctio...
method _validate_dq_tag_strategy (line 144) | def _validate_dq_tag_strategy(spec: DQSpec) -> None:
FILE: lakehouse_engine/algorithms/data_loader.py
class DataLoader (line 33) | class DataLoader(Algorithm):
method __init__ (line 51) | def __init__(self, acon: dict):
method read (line 85) | def read(self) -> OrderedDict:
method transform (line 97) | def transform(self, data: OrderedDict) -> OrderedDict:
method process_dq (line 130) | def process_dq(
method write (line 183) | def write(self, data: OrderedDict) -> OrderedDict:
method terminate (line 214) | def terminate(self, data: OrderedDict) -> None:
method execute (line 227) | def execute(self) -> Optional[OrderedDict]:
method _get_input_specs (line 257) | def _get_input_specs(self) -> List[InputSpec]:
method _get_transform_specs (line 265) | def _get_transform_specs(self) -> List[TransformSpec]:
method _get_dq_specs (line 332) | def _get_dq_specs(self) -> List[DQSpec]:
method _get_output_specs (line 375) | def _get_output_specs(self) -> List[OutputSpec]:
method _get_streaming_transformer_plan (line 433) | def _get_streaming_transformer_plan(
method _get_terminate_specs (line 464) | def _get_terminate_specs(self) -> List[TerminatorSpec]:
method _move_to_streaming_micro_batch_transformers (line 472) | def _move_to_streaming_micro_batch_transformers(
method _move_to_streaming_micro_batch_dq_processors (line 499) | def _move_to_streaming_micro_batch_dq_processors(
method _get_input_read_types (line 529) | def _get_input_read_types(list_of_specs: List) -> dict:
method _get_transform_input_ids (line 541) | def _get_transform_input_ids(list_of_specs: List) -> dict:
method _get_previous_spec_read_types (line 553) | def _get_previous_spec_read_types(
method _verify_dq_rule_id_uniqueness (line 577) | def _verify_dq_rule_id_uniqueness(
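The DataLoader listing above shows its stage methods in the order the algorithm chains them (read, transform, process_dq, write, terminate), all wrapped by execute. A minimal, hedged sketch of that flow follows; `acon` stands in for a full algorithm configuration dict and is only a placeholder here, and the claim that execute() runs the same sequence is inferred from the method list, not confirmed by this extract.

```python
# Illustrative sketch only: a real ACON (input_specs, transform_specs,
# dq_specs, output_specs, terminate_specs) and a Spark session are required.
from lakehouse_engine.algorithms.data_loader import DataLoader

acon: dict = {}  # placeholder: fill with a complete algorithm configuration

loader = DataLoader(acon)

# The stages listed above, called in order; execute() presumably wraps
# this same sequence end to end.
data = loader.read()            # OrderedDict of spec_id -> DataFrame
data = loader.transform(data)   # apply transform_specs
data = loader.process_dq(data)  # apply dq_specs
data = loader.write(data)       # apply output_specs
loader.terminate(data)          # apply terminate_specs
```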
FILE: lakehouse_engine/algorithms/dq_validator.py
class DQValidator (line 20) | class DQValidator(Algorithm):
method __init__ (line 32) | def __init__(self, acon: dict):
method read (line 55) | def read(self) -> DataFrame:
method process_dq (line 65) | def process_dq(self, data: DataFrame) -> DataFrame:
method execute (line 87) | def execute(self) -> None:
method _get_dq_spec (line 138) | def _get_dq_spec(input_dq_spec: dict) -> DQSpec:
method _restore_prev_version (line 154) | def _restore_prev_version(self) -> None:
FILE: lakehouse_engine/algorithms/exceptions.py
class ReconciliationFailedException (line 4) | class ReconciliationFailedException(Exception):
class NoNewDataException (line 10) | class NoNewDataException(Exception):
class SensorAlreadyExistsException (line 16) | class SensorAlreadyExistsException(Exception):
class RestoreTypeNotFoundException (line 22) | class RestoreTypeNotFoundException(Exception):
FILE: lakehouse_engine/algorithms/gab.py
class GAB (line 32) | class GAB(Algorithm):
method __init__ (line 41) | def __init__(self, acon: dict):
method execute (line 49) | def execute(self) -> None:
method _process_use_case (line 98) | def _process_use_case(
method _set_use_case_stage_template_file (line 168) | def _set_use_case_stage_template_file(
method _process_use_case_query_cadence (line 192) | def _process_use_case_query_cadence(
method _process_reconciliation_cadence (line 253) | def _process_reconciliation_cadence(
method _process_use_case_query_step (line 374) | def _process_use_case_query_step(
method _get_filtered_cadences (line 455) | def _get_filtered_cadences(
method _get_latest_usecase_data (line 474) | def _get_latest_usecase_data(self, query_id: str) -> tuple[datetime, d...
method _get_latest_run_date (line 485) | def _get_latest_run_date(self, query_id: str) -> datetime:
method _get_latest_use_case_date (line 513) | def _get_latest_use_case_date(self, query_id: str) -> datetime:
method _set_week_configuration_by_uc_start_of_week (line 535) | def _set_week_configuration_by_uc_start_of_week(cls, start_of_week: st...
method _update_rendered_item_cadence (line 554) | def _update_rendered_item_cadence(
method _get_rendered_item_cadence (line 573) | def _get_rendered_item_cadence(
method _get_cadence_configuration (line 607) | def _get_cadence_configuration(
method _render_template_query (line 686) | def _render_template_query(
method _create_stage_view (line 734) | def _create_stage_view(
method _generate_view_statement (line 826) | def _generate_view_statement(
method _unpersist_cached_views (line 898) | def _unpersist_cached_views(cls, unpersist_list: list[str]) -> None:
method _generate_ddl (line 909) | def _generate_ddl(
FILE: lakehouse_engine/algorithms/reconciliator.py
class ReconciliationType (line 27) | class ReconciliationType(Enum):
class ReconciliationTransformers (line 34) | class ReconciliationTransformers(Enum):
class Reconciliator (line 43) | class Reconciliator(Executable):
method __init__ (line 84) | def __init__(self, acon: dict):
method get_source_of_truth (line 103) | def get_source_of_truth(self) -> DataFrame:
method get_current_results (line 116) | def get_current_results(self) -> DataFrame:
method execute (line 129) | def execute(self) -> None:
method _apply_preprocess_query_args (line 185) | def _apply_preprocess_query_args(
method _get_recon_results (line 221) | def _get_recon_results(
FILE: lakehouse_engine/algorithms/sensor.py
class Sensor (line 25) | class Sensor(Algorithm):
method __init__ (line 30) | def __init__(self, acon: dict):
method execute (line 44) | def execute(self) -> bool:
method _check_if_sensor_already_exists (line 74) | def _check_if_sensor_already_exists(self) -> bool:
method _run_streaming_sensor (line 91) | def _run_streaming_sensor(
method _run_batch_sensor (line 111) | def _run_batch_sensor(
method _validate_sensor_spec (line 149) | def _validate_sensor_spec(self) -> None:
FILE: lakehouse_engine/algorithms/sensors/heartbeat.py
class Heartbeat (line 42) | class Heartbeat(Algorithm):
method __init__ (line 47) | def __init__(self, acon: dict):
method execute (line 55) | def execute(self) -> None:
method _get_active_heartbeat_jobs (line 91) | def _get_active_heartbeat_jobs(
method generate_unique_column_values (line 114) | def generate_unique_column_values(cls, main_col: str, col_to_append: s...
method _get_sensor_acon_from_heartbeat (line 129) | def _get_sensor_acon_from_heartbeat(
method _enhance_sensor_acon_extra_options (line 189) | def _enhance_sensor_acon_extra_options(
method _get_all_kafka_options (line 272) | def _get_all_kafka_options(
method _execute_batch_of_sensor (line 340) | def _execute_batch_of_sensor(
method _get_heartbeat_sensor_condition (line 365) | def _get_heartbeat_sensor_condition(
method _update_heartbeat_status_with_sensor_info (line 384) | def _update_heartbeat_status_with_sensor_info(
method update_heartbeat_control_table (line 425) | def update_heartbeat_control_table(
method get_heartbeat_jobs_to_trigger (line 447) | def get_heartbeat_jobs_to_trigger(
method get_anchor_job_record (line 546) | def get_anchor_job_record(
method heartbeat_sensor_trigger_jobs (line 587) | def heartbeat_sensor_trigger_jobs(self) -> None:
method _read_heartbeat_sensor_data_feed_csv (line 646) | def _read_heartbeat_sensor_data_feed_csv(
method merge_control_table_data_feed_records (line 666) | def merge_control_table_data_feed_records(
method heartbeat_sensor_control_table_data_feed (line 727) | def heartbeat_sensor_control_table_data_feed(
method update_sensor_processed_status (line 748) | def update_sensor_processed_status(
method update_heartbeat_sensor_completion_status (line 781) | def update_heartbeat_sensor_completion_status(
FILE: lakehouse_engine/algorithms/sensors/sensor.py
class Sensor (line 25) | class Sensor(Algorithm):
method __init__ (line 30) | def __init__(self, acon: dict):
method execute (line 44) | def execute(self) -> bool:
method _check_if_sensor_already_exists (line 74) | def _check_if_sensor_already_exists(self) -> bool:
method _run_streaming_sensor (line 91) | def _run_streaming_sensor(
method _run_batch_sensor (line 109) | def _run_batch_sensor(
method _validate_sensor_spec (line 147) | def _validate_sensor_spec(self) -> None:
FILE: lakehouse_engine/core/dbfs_file_manager.py
function _dry_run (line 8) | def _dry_run(bucket: str, object_paths: list) -> dict:
function _list_objects (line 34) | def _list_objects(path: str, objects_list: list) -> list:
function _get_path (line 56) | def _get_path(bucket: str, path: str) -> str:
class DBFSFileManager (line 79) | class DBFSFileManager(FileManager):
method get_function (line 84) | def get_function(self) -> None:
method _delete_objects (line 102) | def _delete_objects(bucket: str, objects_paths: list) -> None:
method delete_objects (line 129) | def delete_objects(self) -> None:
method copy_objects (line 147) | def copy_objects(self) -> None:
method _copy_objects (line 173) | def _copy_objects(
method move_objects (line 206) | def move_objects(self) -> None:
method _move_objects (line 232) | def _move_objects(
FILE: lakehouse_engine/core/definitions.py
class CollectEngineUsage (line 22) | class CollectEngineUsage(Enum):
class EngineConfig (line 39) | class EngineConfig(object):
class EngineStats (line 80) | class EngineStats(object):
class InputFormat (line 109) | class InputFormat(Enum):
method values (line 128) | def values(cls): # type: ignore
method exists (line 137) | def exists(cls, input_format: str) -> bool:
class SharepointFile (line 163) | class SharepointFile:
method file_extension (line 175) | def file_extension(self) -> str:
method file_path (line 180) | def file_path(self) -> str:
method is_csv (line 187) | def is_csv(self) -> bool:
method is_excel (line 192) | def is_excel(self) -> bool:
method content_size (line 197) | def content_size(self) -> int:
class SharepointOptions (line 203) | class SharepointOptions(object):
method __post_init__ (line 283) | def __post_init__(self) -> None:
method _get_allowed_extensions (line 300) | def _get_allowed_extensions(self) -> set[str]:
method _validate_file_type (line 309) | def _validate_file_type(self, allowed_file_types: set[str]) -> None:
method _normalize_folder_relative_path (line 320) | def _normalize_folder_relative_path(self) -> None:
method _ends_with_supported_extension (line 325) | def _ends_with_supported_extension(
method _validate_single_file_mode_constraints_if_folder_is_file_path (line 336) | def _validate_single_file_mode_constraints_if_folder_is_file_path(
method _validate_file_name_extension (line 365) | def _validate_file_name_extension(self, allowed_extensions: set[str]) ...
method _validate_file_name_and_file_pattern_are_not_both_set (line 376) | def _validate_file_name_and_file_pattern_are_not_both_set(self) -> None:
method _validate_folder_relative_path_extension_if_looks_like_file (line 384) | def _validate_folder_relative_path_extension_if_looks_like_file(
method validate_for_reader (line 405) | def validate_for_reader(self) -> None:
method validate_for_writer (line 421) | def validate_for_writer(self) -> None:
class OutputFormat (line 430) | class OutputFormat(Enum):
method values (line 449) | def values(cls): # type: ignore
method exists (line 458) | def exists(cls, output_format: str) -> bool:
class NotifierType (line 480) | class NotifierType(Enum):
class NotificationRuntimeParameters (line 486) | class NotificationRuntimeParameters(Enum):
class ReadType (line 501) | class ReadType(Enum):
class ReadMode (line 512) | class ReadMode(Enum):
class DQDefaults (line 523) | class DQDefaults(Enum):
class WriteType (line 591) | class WriteType(Enum):
class InputSpec (line 604) | class InputSpec(object):
method __post_init__ (line 666) | def __post_init__(self) -> None:
class TransformerSpec (line 678) | class TransformerSpec(object):
class TransformSpec (line 692) | class TransformSpec(object):
class DQType (line 713) | class DQType(Enum):
class DQResultFormat (line 720) | class DQResultFormat(Enum):
class DQExecutionPoint (line 726) | class DQExecutionPoint(Enum):
class DQTableBaseParameters (line 733) | class DQTableBaseParameters(Enum):
class DQFunctionSpec (line 740) | class DQFunctionSpec(object):
class DQSpec (line 753) | class DQSpec(object):
class MergeOptions (line 881) | class MergeOptions(object):
class OutputSpec (line 910) | class OutputSpec(object):
class TerminatorSpec (line 978) | class TerminatorSpec(object):
class ReconciliatorSpec (line 995) | class ReconciliatorSpec(object):
class DQValidatorSpec (line 1043) | class DQValidatorSpec(object):
class SQLDefinitions (line 1059) | class SQLDefinitions(Enum):
class FileManagerAPIKeys (line 1072) | class FileManagerAPIKeys(Enum):
class SensorSpec (line 1083) | class SensorSpec(object):
method create_from_acon (line 1114) | def create_from_acon(cls, acon: dict): # type: ignore
class SensorStatus (line 1138) | class SensorStatus(Enum):
class SAPLogchain (line 1172) | class SAPLogchain(Enum):
class RestoreType (line 1180) | class RestoreType(Enum):
method values (line 1188) | def values(cls): # type: ignore
method exists (line 1197) | def exists(cls, restore_type: str) -> bool:
class RestoreStatus (line 1209) | class RestoreStatus(Enum):
class SQLParser (line 1224) | class SQLParser(Enum):
class GABDefaults (line 1244) | class GABDefaults(Enum):
class GABStartOfWeek (line 1253) | class GABStartOfWeek(Enum):
method get_start_of_week (line 1260) | def get_start_of_week(cls) -> dict:
method get_values (line 1271) | def get_values(cls) -> set[str]:
class GABSpec (line 1281) | class GABSpec(object):
method create_from_acon (line 1314) | def create_from_acon(cls, acon: dict): # type: ignore
class GABCadence (line 1355) | class GABCadence(Enum):
method get_ordered_cadences (line 1365) | def get_ordered_cadences(cls) -> dict:
method get_cadences (line 1377) | def get_cadences(cls) -> set[str]:
method order_cadences (line 1386) | def order_cadences(cls, cadences_to_order: list[str]) -> list[str]:
class GABKeys (line 1398) | class GABKeys:
class GABReplaceableKeys (line 1406) | class GABReplaceableKeys:
class GABCombinedConfiguration (line 1415) | class GABCombinedConfiguration(Enum):
class HeartbeatConfigSpec (line 1760) | class HeartbeatConfigSpec(object):
method create_from_acon (line 1806) | def create_from_acon(cls, acon: dict): # type: ignore
class HeartbeatSensorSource (line 1829) | class HeartbeatSensorSource(Enum):
method values (line 1840) | def values(cls): # type: ignore
class HeartbeatStatus (line 1849) | class HeartbeatStatus(Enum):
FILE: lakehouse_engine/core/exec_env.py
class ExecEnv (line 13) | class ExecEnv(object):
method set_default_engine_config (line 26) | def set_default_engine_config(
method get_or_create (line 55) | def get_or_create(
method get_for_each_batch_session (line 107) | def get_for_each_batch_session(cls, df: DataFrame) -> None:
method _set_spark_configs (line 117) | def _set_spark_configs(
method get_environment (line 149) | def get_environment(cls) -> str:
FILE: lakehouse_engine/core/executable.py
class Executable (line 7) | class Executable(ABC):
method execute (line 11) | def execute(self) -> Optional[Any]:
FILE: lakehouse_engine/core/file_manager.py
class FileManager (line 10) | class FileManager(ABC): # noqa: B024
method __init__ (line 16) | def __init__(self, configs: dict):
method delete_objects (line 26) | def delete_objects(self) -> None:
method copy_objects (line 35) | def copy_objects(self) -> None:
method move_objects (line 44) | def move_objects(self) -> None:
class FileManagerFactory (line 53) | class FileManagerFactory(ABC): # noqa: B024
method execute_function (line 57) | def execute_function(configs: dict) -> Any:
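FileManagerFactory.execute_function dispatches to the S3 or DBFS implementation based on the configs dict it receives. A hedged sketch of such a dict follows; the key names (function, bucket, object_paths, dry_run) are inferred from the helper signatures listed above (_delete_objects(bucket, objects_paths), _dry_run(bucket, object_paths)) and the concrete values are placeholders.

```python
from lakehouse_engine.core.file_manager import FileManagerFactory

# Key names are assumptions inferred from the helper signatures above;
# bucket and object paths are illustrative placeholders.
configs = {
    "function": "delete_objects",         # one of the delete/copy/move operations
    "bucket": "my-bucket",
    "object_paths": ["raw/sales/2024/"],
    "dry_run": True,                      # list affected objects without deleting
}

FileManagerFactory.execute_function(configs=configs)
```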
FILE: lakehouse_engine/core/gab_manager.py
class GABCadenceManager (line 17) | class GABCadenceManager(object):
method extended_window_calculator (line 22) | def extended_window_calculator(
method _get_reconciliation_cadence (line 83) | def _get_reconciliation_cadence(
method get_cadence_start_end_dates (line 121) | def get_cadence_start_end_dates(
method _get_cadence_calculated_date (line 187) | def _get_cadence_calculated_date(
method _get_cadence_base_date (line 211) | def _get_cadence_base_date(
method _get_calculated_week_date (line 234) | def _get_calculated_week_date(
method _get_calculated_month_date (line 257) | def _get_calculated_month_date(
method _get_calculated_quarter_or_year_date (line 283) | def _get_calculated_quarter_or_year_date(
class GABViewManager (line 305) | class GABViewManager(object):
method __init__ (line 310) | def __init__(
method generate_use_case_views (line 330) | def generate_use_case_views(self) -> None:
method _generate_use_case_view (line 366) | def _generate_use_case_view(
method _get_dimensions_and_metrics_from_use_case_view (line 434) | def _get_dimensions_and_metrics_from_use_case_view(
method _get_calculated_and_derived_metrics_from_use_case_view (line 484) | def _get_calculated_and_derived_metrics_from_use_case_view(
method _join_list_to_string_when_present (line 528) | def _join_list_to_string_when_present(
method _get_cadence_snapshot_status (line 546) | def _get_cadence_snapshot_status(cls, result: dict) -> dict:
method _split_cadence_by_snapshot (line 568) | def _split_cadence_by_snapshot(
method _get_calculated_metrics (line 588) | def _get_calculated_metrics(
method _get_derived_metrics (line 630) | def _get_derived_metrics(cls, derived_metric: dict) -> list[str]:
method _get_calculated_metric (line 646) | def _get_calculated_metric(
method _get_window_calculated_metric (line 713) | def _get_window_calculated_metric(
method _get_cadence_calculated_metric (line 774) | def _get_cadence_calculated_metric(
method _get_cadence_item_lag (line 825) | def _get_cadence_item_lag(
method _get_cadence_lag_statement (line 838) | def _get_cadence_lag_statement(
FILE: lakehouse_engine/core/gab_sql_generator.py
function _execute_sql (line 16) | def _execute_sql(func) -> Callable: # type: ignore
class GABSQLGenerator (line 34) | class GABSQLGenerator(ABC):
method generate_sql (line 38) | def generate_sql(self) -> Optional[str]:
class GABInsertGenerator (line 46) | class GABInsertGenerator(GABSQLGenerator):
method __init__ (line 55) | def __init__(
method generate_sql (line 81) | def generate_sql(self) -> Optional[str]:
method _insert_statement_generator (line 87) | def _insert_statement_generator(self) -> str:
method _get_mapping_columns (line 115) | def _get_mapping_columns(cls, mapping: dict) -> tuple[str, str]:
method _join_extracted_column_with_filled_columns (line 134) | def _join_extracted_column_with_filled_columns(
method _fill_empty_columns (line 159) | def _fill_empty_columns(
class GABViewGenerator (line 187) | class GABViewGenerator(GABSQLGenerator):
method __init__ (line 195) | def __init__(
method generate_sql (line 248) | def generate_sql(self) -> Optional[str]:
method _create_consumption_view (line 254) | def _create_consumption_view(self) -> str:
method _generate_consumption_view_statement (line 287) | def _generate_consumption_view_statement(
class GABDeleteGenerator (line 429) | class GABDeleteGenerator(GABSQLGenerator):
method __init__ (line 437) | def __init__(
method generate_sql (line 464) | def generate_sql(self) -> Optional[str]:
method _delete_statement_generator (line 473) | def _delete_statement_generator(self) -> str:
FILE: lakehouse_engine/core/s3_file_manager.py
function _dry_run (line 20) | def _dry_run(bucket: str, object_paths: list) -> dict:
function _list_objects (line 46) | def _list_objects(
function _list_objects_recursively (line 82) | def _list_objects_recursively(bucket: str, path: str) -> list:
function _check_directory (line 109) | def _check_directory(bucket: str, path: str) -> bool:
class S3FileManager (line 124) | class S3FileManager(FileManager):
method get_function (line 129) | def get_function(self) -> None:
method _delete_objects (line 150) | def _delete_objects(self, bucket: str, objects_paths: list) -> None:
method delete_objects (line 186) | def delete_objects(self) -> None:
method copy_objects (line 204) | def copy_objects(self) -> None:
method move_objects (line 224) | def move_objects(self) -> None:
method request_restore (line 232) | def request_restore(self) -> None:
method check_restore_status (line 248) | def check_restore_status(self) -> None:
method request_restore_to_destination_and_wait (line 267) | def request_restore_to_destination_and_wait(self) -> None:
method _copy_objects (line 308) | def _copy_objects(
class ArchiveFileManager (line 370) | class ArchiveFileManager(object):
method _get_archived_object (line 376) | def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]:
method _check_object_restore_status (line 398) | def _check_object_restore_status(
method check_restore_status (line 425) | def check_restore_status(source_bucket: str, source_object: str) -> dict:
method _request_restore_object (line 479) | def _request_restore_object(
method request_restore (line 515) | def request_restore(
method request_restore_and_wait (line 555) | def request_restore_and_wait(
FILE: lakehouse_engine/core/sensor_manager.py
class SensorControlTableManager (line 24) | class SensorControlTableManager(object):
method check_if_sensor_has_acquired_data (line 30) | def check_if_sensor_has_acquired_data(
method update_sensor_status (line 55) | def update_sensor_status(
method _update_sensor_control (line 101) | def _update_sensor_control(
method _convert_sensor_to_data (line 128) | def _convert_sensor_to_data(
method _get_sensor_update_set (line 169) | def _get_sensor_update_set(cls, **kwargs: Optional[str] | List[str]) -...
method read_sensor_table_data (line 190) | def read_sensor_table_data(
class SensorUpstreamManager (line 226) | class SensorUpstreamManager(object):
method generate_filter_exp_query (line 232) | def generate_filter_exp_query(
method generate_sensor_table_preprocess_query (line 307) | def generate_sensor_table_preprocess_query(
method read_new_data (line 331) | def read_new_data(cls, sensor_spec: SensorSpec) -> DataFrame:
method get_new_data (line 349) | def get_new_data(
method generate_sensor_sap_logchain_query (line 365) | def generate_sensor_sap_logchain_query(
class SensorJobRunManager (line 410) | class SensorJobRunManager(object):
method run_job (line 416) | def run_job(cls, job_id: str, token: str, host: str) -> Tuple[int, Opt...
FILE: lakehouse_engine/core/table_manager.py
class TableManager (line 16) | class TableManager(object):
method __init__ (line 22) | def __init__(self, configs: dict):
method get_function (line 32) | def get_function(self) -> None:
method create (line 62) | def create(self) -> None:
method create_many (line 85) | def create_many(self) -> None:
method compute_table_statistics (line 92) | def compute_table_statistics(self) -> None:
method drop_table (line 105) | def drop_table(self) -> None:
method drop_view (line 116) | def drop_view(self) -> None:
method truncate (line 127) | def truncate(self) -> None:
method vacuum (line 138) | def vacuum(self) -> None:
method describe (line 153) | def describe(self) -> None:
method optimize (line 164) | def optimize(self) -> None:
method execute_multiple_sql_files (line 193) | def execute_multiple_sql_files(self) -> None:
method execute_sql (line 216) | def execute_sql(self) -> None:
method show_tbl_properties (line 229) | def show_tbl_properties(self) -> DataFrame:
method get_tbl_pk (line 245) | def get_tbl_pk(self) -> List[str]:
method repair_table (line 263) | def repair_table(self) -> None:
method delete_where (line 277) | def delete_where(self) -> None:
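TableManager is also configuration driven: get_function reads the requested operation from the configs passed to __init__ and dispatches to one of the methods above; the engine facade exposes this through manage_table (listed below under engine.py). A minimal, hedged sketch follows; apart from "function", the key names and values are assumptions.

```python
from lakehouse_engine.engine import manage_table

# Hypothetical configuration: "function" selects one of the TableManager
# operations listed above; the remaining key is an illustrative placeholder.
acon = {
    "function": "describe",
    "table": "my_database.my_table",  # assumed key name, placeholder value
}

manage_table(acon)
```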
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b.py
class ColumnPairCustom (line 15) | class ColumnPairCustom(ColumnPairMapMetricProvider):
method _spark (line 32) | def _spark(
class ExpectColumnPairAToBeNotEqualToB (line 53) | class ExpectColumnPairAToBeNotEqualToB(ColumnPairMapExpectation):
method _validate (line 157) | def _validate(
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b.py
class ColumnPairCustom (line 15) | class ColumnPairCustom(ColumnPairMapMetricProvider):
method _spark (line 33) | def _spark(
class ExpectColumnPairAToBeSmallerOrEqualThanB (line 63) | class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):
method _validate (line 171) | def _validate(
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b.py
class ColumnPairDateAToBeGreaterOrEqualToDateB (line 17) | class ColumnPairDateAToBeGreaterOrEqualToDateB(ColumnPairMapMetricProvid...
method _spark (line 31) | def _spark(
class ExpectColumnPairDateAToBeGreaterThanOrEqualToDateB (line 52) | class ExpectColumnPairDateAToBeGreaterThanOrEqualToDateB(ColumnPairMapEx...
method _validate (line 169) | def _validate(
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_values_to_be_date_not_older_than.py
class ColumnValuesDateNotOlderThan (line 17) | class ColumnValuesDateNotOlderThan(ColumnMapMetricProvider):
method _spark (line 30) | def _spark(
class ExpectColumnValuesToBeDateNotOlderThan (line 67) | class ExpectColumnValuesToBeDateNotOlderThan(ColumnMapExpectation):
method _validate (line 178) | def _validate(
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_column_values_to_not_be_null_or_empty_string.py
class ColumnValuesNotNullOrEpmtyString (line 15) | class ColumnValuesNotNullOrEpmtyString(ColumnMapMetricProvider):
method _spark (line 29) | def _spark(
class ExpectColumnValuesToNotBeNullOrEmptyString (line 46) | class ExpectColumnValuesToNotBeNullOrEmptyString(ColumnMapExpectation):
method _validate (line 144) | def _validate(
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c.py
class MulticolumnCustomMetric (line 15) | class MulticolumnCustomMetric(MulticolumnMapMetricProvider):
method _spark (line 33) | def _spark(
class ExpectMulticolumnColumnAMustEqualBOrC (line 57) | class ExpectMulticolumnColumnAMustEqualBOrC(MulticolumnMapExpectation):
method _validate (line 159) | def _validate(
FILE: lakehouse_engine/dq_processors/custom_expectations/expect_queried_column_agg_value_to_be.py
class ExpectQueriedColumnAggValueToBe (line 15) | class ExpectQueriedColumnAggValueToBe(QueryExpectation):
method validate_configuration (line 53) | def validate_configuration(
method _validate_between (line 68) | def _validate_between(
method _validate_lesser (line 99) | def _validate_lesser(x: str, y: int, expected_max_value: int) -> dict:
method _validate_greater (line 125) | def _validate_greater(x: str, y: int, expected_min_value: int) -> dict:
method _validate_condition (line 150) | def _validate_condition(self, query_result: dict, template_dict: dict)...
method _generate_dict (line 177) | def _generate_dict(query_result: list) -> dict:
method _validate (line 210) | def _validate(
method _validate_template_dict (line 238) | def _validate_template_dict(self: Any) -> dict:
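The custom expectations above extend Great Expectations and are referenced by their snake_case names inside the dq_functions of a dq_spec, like the built-in expectations. A hedged sketch of such an entry follows; the surrounding keys follow the ACON convention used throughout this repository, and the argument names are assumptions.

```python
# Hedged sketch: a dq_spec entry that calls one of the custom expectations
# listed above by name. Spec keys and argument names are assumptions.
dq_spec = {
    "spec_id": "sales_quality",
    "input_id": "sales",
    "dq_type": "validator",
    "dq_functions": [
        {
            "function": "expect_column_values_to_not_be_null_or_empty_string",
            "args": {"column": "article_id"},
        },
    ],
}
```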
FILE: lakehouse_engine/dq_processors/dq_factory.py
class DQFactory (line 59) | class DQFactory(object):
method _add_critical_function_tag (line 66) | def _add_critical_function_tag(cls, args: dict) -> dict:
method _configure_checkpoint (line 105) | def _configure_checkpoint(
method _check_row_condition (line 178) | def _check_row_condition(
method _add_suite (line 207) | def _add_suite(
method _check_expectation_result (line 258) | def _check_expectation_result(cls, result_dict: dict) -> dict:
method run_dq_process (line 280) | def run_dq_process(cls, dq_spec: DQSpec, data: DataFrame) -> DataFrame:
method _check_critical_functions_tags (line 381) | def _check_critical_functions_tags(cls, failed_expectations: dict) -> ...
method _check_chunk_usage (line 398) | def _check_chunk_usage(cls, results_dict: dict, dq_spec: DQSpec) -> bool:
method _explode_results (line 423) | def _explode_results(
method _get_data_context_config (line 530) | def _get_data_context_config(cls, dq_spec: DQSpec) -> DataContextConfig:
method _get_data_source_defaults (line 563) | def _get_data_source_defaults(cls, dq_spec: DQSpec) -> dict:
method _get_failed_expectations (line 595) | def _get_failed_expectations(
method _get_unexpected_rows_pk (line 652) | def _get_unexpected_rows_pk(cls, dq_spec: DQSpec) -> Optional[list]:
method _log_or_fail (line 675) | def _log_or_fail(
method _transform_checkpoint_results (line 722) | def _transform_checkpoint_results(
method _process_chunk (line 774) | def _process_chunk(
method _cast_columns_to_string (line 836) | def _cast_columns_to_string(cls, df: DataFrame) -> DataFrame:
method _generate_chunks (line 851) | def _generate_chunks(cls, results_dict: dict, dq_spec: DQSpec) -> list:
method _split_into_chunks (line 881) | def _split_into_chunks(cls, results_dict: dict, dq_spec: DQSpec) -> list:
method _write_to_location (line 922) | def _write_to_location(
method split_into_chunks (line 979) | def split_into_chunks(lst: list, chunk_size: int) -> list:
FILE: lakehouse_engine/dq_processors/exceptions.py
class DQValidationsFailedException (line 4) | class DQValidationsFailedException(Exception):
class DQCheckpointsResultsException (line 10) | class DQCheckpointsResultsException(Exception):
class DQSpecMalformedException (line 16) | class DQSpecMalformedException(Exception):
class DQDuplicateRuleIdException (line 22) | class DQDuplicateRuleIdException(Exception):
FILE: lakehouse_engine/dq_processors/validator.py
class Validator (line 24) | class Validator(object):
method get_dq_validator (line 30) | def get_dq_validator(
method tag_source_with_dq (line 73) | def tag_source_with_dq(
method _add_critical_function_tag (line 112) | def _add_critical_function_tag(cls, args: dict) -> dict:
method _get_row_tagged_fail_df (line 136) | def _get_row_tagged_fail_df(
method _join_complementary_data (line 230) | def _join_complementary_data(
FILE: lakehouse_engine/engine.py
function load_data (line 29) | def load_data(
function execute_reconciliation (line 56) | def execute_reconciliation(
function execute_dq_validation (line 85) | def execute_dq_validation(
function manage_table (line 116) | def manage_table(
function execute_manager (line 140) | def execute_manager(
function manage_files (line 170) | def manage_files(
function execute_sensor (line 194) | def execute_sensor(
function execute_sensor_heartbeat (line 220) | def execute_sensor_heartbeat(
function trigger_heartbeat_sensor_jobs (line 272) | def trigger_heartbeat_sensor_jobs(
function execute_heartbeat_sensor_data_feed (line 284) | def execute_heartbeat_sensor_data_feed(
function update_heartbeat_sensor_status (line 303) | def update_heartbeat_sensor_status(
function update_sensor_status (line 326) | def update_sensor_status(
function generate_sensor_query (line 360) | def generate_sensor_query(
function generate_sensor_sap_logchain_query (line 409) | def generate_sensor_sap_logchain_query(
function send_notification (line 438) | def send_notification(args: dict) -> None:
function execute_gab (line 452) | def execute_gab(
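engine.py is the public facade: each function wraps one of the algorithms or managers above and is typically driven by an ACON (algorithm configuration) dict. A minimal, hedged load_data sketch follows; the spec keys mirror the ACON convention used by this engine, but the locations, formats, and spec ids are placeholders.

```python
from lakehouse_engine.engine import load_data

# Placeholder ACON: one batch file input written to a Delta location.
acon = {
    "input_specs": [
        {
            "spec_id": "sales_source",
            "read_type": "batch",
            "data_format": "csv",
            "location": "s3://my-bucket/raw/sales/",
        }
    ],
    "output_specs": [
        {
            "spec_id": "sales_bronze",
            "input_id": "sales_source",
            "write_type": "append",
            "data_format": "delta",
            "location": "s3://my-bucket/bronze/sales/",
        }
    ],
}

load_data(acon=acon)
```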
FILE: lakehouse_engine/io/exceptions.py
class IncrementalFilterInputNotFoundException (line 4) | class IncrementalFilterInputNotFoundException(Exception):
class WrongIOFormatException (line 15) | class WrongIOFormatException(Exception):
class NotSupportedException (line 21) | class NotSupportedException(RuntimeError):
class InputNotFoundException (line 27) | class InputNotFoundException(Exception):
class EndpointNotFoundException (line 33) | class EndpointNotFoundException(Exception):
class LocalPathNotFoundException (line 39) | class LocalPathNotFoundException(Exception):
class WriteToLocalException (line 45) | class WriteToLocalException(Exception):
class SharePointAPIError (line 51) | class SharePointAPIError(Exception):
class InvalidSharepointPathException (line 57) | class InvalidSharepointPathException(Exception):
FILE: lakehouse_engine/io/reader.py
class Reader (line 11) | class Reader(ABC):
method __init__ (line 14) | def __init__(self, input_spec: InputSpec):
method read (line 24) | def read(self) -> DataFrame:
FILE: lakehouse_engine/io/reader_factory.py
class ReaderFactory (line 19) | class ReaderFactory(ABC): # noqa: B024
method get_data (line 23) | def get_data(cls, spec: InputSpec) -> DataFrame:
FILE: lakehouse_engine/io/readers/dataframe_reader.py
class DataFrameReader (line 9) | class DataFrameReader(Reader):
method __init__ (line 12) | def __init__(self, input_spec: InputSpec):
method read (line 20) | def read(self) -> DataFrame:
FILE: lakehouse_engine/io/readers/file_reader.py
class FileReader (line 11) | class FileReader(Reader):
method __init__ (line 14) | def __init__(self, input_spec: InputSpec):
method read (line 22) | def read(self) -> DataFrame:
FILE: lakehouse_engine/io/readers/jdbc_reader.py
class JDBCReader (line 15) | class JDBCReader(Reader):
method __init__ (line 18) | def __init__(self, input_spec: InputSpec):
method read (line 26) | def read(self) -> DataFrame:
FILE: lakehouse_engine/io/readers/kafka_reader.py
class KafkaReader (line 10) | class KafkaReader(Reader):
method __init__ (line 13) | def __init__(self, input_spec: InputSpec):
method read (line 21) | def read(self) -> DataFrame:
FILE: lakehouse_engine/io/readers/query_reader.py
class QueryReader (line 10) | class QueryReader(Reader):
method __init__ (line 13) | def __init__(self, input_spec: InputSpec):
method read (line 21) | def read(self) -> DataFrame:
FILE: lakehouse_engine/io/readers/sap_b4_reader.py
class SAPB4Reader (line 19) | class SAPB4Reader(Reader):
method __init__ (line 24) | def __init__(self, input_spec: InputSpec):
method read (line 33) | def read(self) -> DataFrame:
method _get_jdbc_utils (line 42) | def _get_jdbc_utils(self) -> SAPB4ExtractionUtils:
method _get_options (line 142) | def _get_options(self) -> Tuple[dict, dict]:
FILE: lakehouse_engine/io/readers/sap_bw_reader.py
class SAPBWReader (line 18) | class SAPBWReader(Reader):
method __init__ (line 23) | def __init__(self, input_spec: InputSpec):
method read (line 32) | def read(self) -> DataFrame:
method _get_jdbc_utils (line 41) | def _get_jdbc_utils(self) -> SAPBWExtractionUtils:
method _get_options (line 147) | def _get_options(self) -> Tuple[dict, dict]:
FILE: lakehouse_engine/io/readers/sftp_reader.py
class SFTPReader (line 23) | class SFTPReader(Reader):
method __init__ (line 28) | def __init__(self, input_spec: InputSpec):
method read (line 36) | def read(self) -> DataFrame:
method _append_files (line 92) | def _append_files(cls, pdf: PandasDataFrame, dfs: List) -> List:
method _read_files (line 107) | def _read_files(
FILE: lakehouse_engine/io/readers/sharepoint_reader.py
class SharepointReader (line 26) | class SharepointReader(Reader):
method __init__ (line 29) | def __init__(self, input_spec: InputSpec):
method read (line 88) | def read(self) -> DataFrame:
method _get_sharepoint_utils (line 110) | def _get_sharepoint_utils(self) -> SharepointUtils:
class SharepointCsvReader (line 134) | class SharepointCsvReader(SharepointReader):
method read (line 141) | def read(self, file_path: str = None, pattern: str = None) -> DataFrame:
method _load_and_archive_file (line 183) | def _load_and_archive_file(self, sp_file: SharepointFile) -> DataFrame:
method _get_csv_files_in_folder (line 251) | def _get_csv_files_in_folder(
method _load_csv_to_spark (line 293) | def _load_csv_to_spark(
method read_csv_folder (line 340) | def read_csv_folder(self, folder_path: str, pattern: str = None) -> Da...
method _validate_and_read_file (line 401) | def _validate_and_read_file(
method _handle_file_error (line 436) | def _handle_file_error(
method detect_delimiter (line 470) | def detect_delimiter(
method resolve_spark_csv_options (line 535) | def resolve_spark_csv_options(self, file_content: bytes) -> dict:
class SharepointExcelReader (line 599) | class SharepointExcelReader(SharepointReader):
method read (line 602) | def read(self) -> DataFrame:
class SharepointReaderFactory (line 614) | class SharepointReaderFactory:
method get_reader (line 621) | def get_reader(input_spec: InputSpec) -> SharepointReader:
FILE: lakehouse_engine/io/readers/table_reader.py
class TableReader (line 10) | class TableReader(Reader):
method __init__ (line 13) | def __init__(self, input_spec: InputSpec):
method read (line 21) | def read(self) -> DataFrame:
FILE: lakehouse_engine/io/writer.py
class Writer (line 14) | class Writer(ABC):
method __init__ (line 17) | def __init__(
method write (line 33) | def write(self) -> Optional[OrderedDict]:
method write_transformed_micro_batch (line 38) | def write_transformed_micro_batch(**kwargs: Any) -> Callable:
method get_transformed_micro_batch (line 59) | def get_transformed_micro_batch(
method get_streaming_trigger (line 90) | def get_streaming_trigger(cls, output_spec: OutputSpec) -> Dict:
method run_micro_batch_dq_process (line 117) | def run_micro_batch_dq_process(df: DataFrame, dq_spec: List[DQSpec]) -...
FILE: lakehouse_engine/io/writer_factory.py
class WriterFactory (line 26) | class WriterFactory(ABC): # noqa: B024
method _get_writer_name (line 42) | def _get_writer_name(cls, spec: OutputSpec) -> str:
method get_writer (line 64) | def get_writer(cls, spec: OutputSpec, df: DataFrame, data: OrderedDict...
FILE: lakehouse_engine/io/writers/console_writer.py
class ConsoleWriter (line 13) | class ConsoleWriter(Writer):
method __init__ (line 18) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 28) | def write(self) -> None:
method _show_df (line 43) | def _show_df(df: DataFrame, output_spec: OutputSpec) -> None:
method _show_streaming_df (line 57) | def _show_streaming_df(output_spec: OutputSpec) -> Callable:
method _write_to_console_in_streaming_mode (line 75) | def _write_to_console_in_streaming_mode(
method _write_transformed_micro_batch (line 103) | def _write_transformed_micro_batch( # type: ignore
FILE: lakehouse_engine/io/writers/dataframe_writer.py
class DataFrameWriter (line 17) | class DataFrameWriter(Writer):
method __init__ (line 22) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 33) | def write(self) -> Optional[OrderedDict]:
method _get_prefixed_view_name (line 61) | def _get_prefixed_view_name(self, stream_df_view_name: str) -> str:
method _create_temp_view (line 65) | def _create_temp_view(self, df: DataFrame, stream_df_view_name: str) -...
method _write_streaming_df (line 80) | def _write_streaming_df(self, stream_df_view_name: str) -> Callable:
method _write_to_dataframe_in_streaming_mode (line 96) | def _write_to_dataframe_in_streaming_mode(
method _table_exists (line 158) | def _table_exists(self, table_name: str) -> bool:
method _write_transformed_micro_batch (line 172) | def _write_transformed_micro_batch(
FILE: lakehouse_engine/io/writers/delta_merge_writer.py
class DeltaMergeWriter (line 14) | class DeltaMergeWriter(Writer):
method __init__ (line 17) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 28) | def write(self) -> None:
method _get_delta_table (line 53) | def _get_delta_table(output_spec: OutputSpec) -> DeltaTable:
method _insert (line 75) | def _insert(
method _merge (line 110) | def _merge(delta_table: DeltaTable, output_spec: OutputSpec, df: DataF...
method _update (line 142) | def _update(
method _write_transformed_micro_batch (line 177) | def _write_transformed_micro_batch( # type: ignore
FILE: lakehouse_engine/io/writers/file_writer.py
class FileWriter (line 12) | class FileWriter(Writer):
method __init__ (line 15) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 25) | def write(self) -> None:
method _write_to_files_in_batch_mode (line 35) | def _write_to_files_in_batch_mode(df: DataFrame, output_spec: OutputSp...
method _write_to_files_in_streaming_mode (line 51) | def _write_to_files_in_streaming_mode(
method _write_transformed_micro_batch (line 87) | def _write_transformed_micro_batch( # type: ignore
FILE: lakehouse_engine/io/writers/jdbc_writer.py
class JDBCWriter (line 12) | class JDBCWriter(Writer):
method __init__ (line 15) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 25) | def write(self) -> None:
method _write_to_jdbc_in_batch_mode (line 49) | def _write_to_jdbc_in_batch_mode(df: DataFrame, output_spec: OutputSpe...
method _write_transformed_micro_batch (line 65) | def _write_transformed_micro_batch( # type: ignore
FILE: lakehouse_engine/io/writers/kafka_writer.py
class KafkaWriter (line 12) | class KafkaWriter(Writer):
method __init__ (line 15) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 25) | def write(self) -> None:
method _write_to_kafka_in_batch_mode (line 35) | def _write_to_kafka_in_batch_mode(df: DataFrame, output_spec: OutputSp...
method _write_to_kafka_in_streaming_mode (line 47) | def _write_to_kafka_in_streaming_mode(
method _write_transformed_micro_batch (line 81) | def _write_transformed_micro_batch( # type: ignore
FILE: lakehouse_engine/io/writers/rest_api_writer.py
class RestApiWriter (line 20) | class RestApiWriter(Writer):
method __init__ (line 25) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 35) | def write(self) -> None:
method _get_func_to_send_payload_to_rest_api (line 45) | def _get_func_to_send_payload_to_rest_api(output_spec: OutputSpec) -> ...
method _write_to_rest_api_in_batch_mode (line 142) | def _write_to_rest_api_in_batch_mode(
method _write_to_rest_api_in_streaming_mode (line 162) | def _write_to_rest_api_in_streaming_mode(
method _write_transformed_micro_batch (line 186) | def _write_transformed_micro_batch( # type: ignore
FILE: lakehouse_engine/io/writers/sharepoint_writer.py
class SharepointWriter (line 19) | class SharepointWriter(Writer):
method __init__ (line 29) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 41) | def write(self) -> None:
method _get_sharepoint_utils (line 54) | def _get_sharepoint_utils(self) -> SharepointUtils:
method _write_to_sharepoint_in_batch_mode (line 72) | def _write_to_sharepoint_in_batch_mode(self, df: DataFrame) -> None:
FILE: lakehouse_engine/io/writers/table_writer.py
class TableWriter (line 12) | class TableWriter(Writer):
method __init__ (line 15) | def __init__(self, output_spec: OutputSpec, df: DataFrame, data: Order...
method write (line 25) | def write(self) -> None:
method _write_to_table_in_batch_mode (line 74) | def _write_to_table_in_batch_mode(df: DataFrame, output_spec: OutputSp...
method _write_to_table_in_streaming_mode (line 99) | def _write_to_table_in_streaming_mode(
method _write_transformed_micro_batch (line 133) | def _write_transformed_micro_batch( # type: ignore
FILE: lakehouse_engine/terminators/cdf_processor.py
class CDFProcessor (line 24) | class CDFProcessor(object):
method expose_cdf (line 30) | def expose_cdf(cls, spec: TerminatorSpec) -> None:
method _write_cdf_to_external (line 65) | def _write_cdf_to_external(
method _get_table_cdf_input_specs (line 90) | def _get_table_cdf_input_specs(spec: TerminatorSpec) -> InputSpec:
method delete_old_data (line 115) | def delete_old_data(cls, spec: TerminatorSpec) -> None:
method vacuum_cdf_data (line 134) | def vacuum_cdf_data(cls, spec: TerminatorSpec) -> None:
FILE: lakehouse_engine/terminators/dataset_optimizer.py
class DatasetOptimizer (line 12) | class DatasetOptimizer(object):
method optimize_dataset (line 18) | def optimize_dataset(
method _compute_table_stats (line 79) | def _compute_table_stats(cls, db_table: str) -> None:
method _vacuum (line 93) | def _vacuum(cls, db_table: str, location: str, hours: int) -> None:
method _optimize (line 115) | def _optimize(
FILE: lakehouse_engine/terminators/notifier.py
class Notifier (line 19) | class Notifier(ABC):
method __init__ (line 24) | def __init__(self, notification_spec: TerminatorSpec):
method create_notification (line 34) | def create_notification(self) -> None:
method send_notification (line 39) | def send_notification(self) -> None:
method _render_notification_field (line 43) | def _render_notification_field(self, template_field: str) -> str:
method check_if_notification_is_failure_notification (line 70) | def check_if_notification_is_failure_notification(
FILE: lakehouse_engine/terminators/notifier_factory.py
class NotifierFactory (line 9) | class NotifierFactory(object):
method get_notifier (line 15) | def get_notifier(cls, spec: TerminatorSpec) -> Notifier:
method generate_failure_notification (line 35) | def generate_failure_notification(spec: list, exception: Exception) ->...
FILE: lakehouse_engine/terminators/notifiers/email_notifier.py
class EmailNotifier (line 24) | class EmailNotifier(Notifier):
method __init__ (line 29) | def __init__(self, notification_spec: TerminatorSpec):
method create_notification (line 37) | def create_notification(self) -> None:
method send_notification (line 68) | def send_notification(self) -> None:
method _authenticate_and_send_office365 (line 89) | def _authenticate_and_send_office365(self) -> None:
method _authenticate_and_send_simple_smtp (line 118) | def _authenticate_and_send_simple_smtp(self) -> None:
method _validate_email_notification (line 181) | def _validate_email_notification(self) -> None:
method _get_importance (line 204) | def _get_importance(self, importance: str) -> Any:
method _create_graph_api_email_body (line 229) | def _create_graph_api_email_body(self) -> Any:
method _set_graph_api_recipients (line 293) | def _set_graph_api_recipients(self, recipient_type: str) -> list:
FILE: lakehouse_engine/terminators/notifiers/exceptions.py
class NotifierNotFoundException (line 4) | class NotifierNotFoundException(Exception):
class NotifierConfigException (line 10) | class NotifierConfigException(Exception):
class NotifierTemplateNotFoundException (line 16) | class NotifierTemplateNotFoundException(Exception):
class NotifierTemplateConfigException (line 22) | class NotifierTemplateConfigException(Exception):
FILE: lakehouse_engine/terminators/notifiers/notification_templates.py
class NotificationsTemplates (line 4) | class NotificationsTemplates(object):
FILE: lakehouse_engine/terminators/sensor_terminator.py
class SensorTerminator (line 11) | class SensorTerminator(object):
method update_sensor_status (line 17) | def update_sensor_status(
FILE: lakehouse_engine/terminators/spark_terminator.py
class SparkTerminator (line 7) | class SparkTerminator(object):
method terminate_spark (line 13) | def terminate_spark(cls) -> None:
FILE: lakehouse_engine/terminators/terminator_factory.py
class TerminatorFactory (line 13) | class TerminatorFactory(object):
method execute_terminator (line 19) | def execute_terminator(
FILE: lakehouse_engine/transformers/aggregators.py
class Aggregators (line 11) | class Aggregators(object):
method get_max_value (line 17) | def get_max_value(input_col: str, output_col: str = "latest") -> Calla...
FILE: lakehouse_engine/transformers/column_creators.py
class ColumnCreators (line 15) | class ColumnCreators(object):
method with_row_id (line 21) | def with_row_id(
method with_auto_increment_id (line 47) | def with_auto_increment_id(
method with_literals (line 86) | def with_literals(
FILE: lakehouse_engine/transformers/column_reshapers.py
class ColumnReshapers (line 24) | class ColumnReshapers(object):
method cast (line 30) | def cast(cls, cols: Dict[str, str]) -> Callable:
method column_selector (line 54) | def column_selector(cls, cols: OrderedDict) -> Callable:
method flatten_schema (line 72) | def flatten_schema(
method explode_columns (line 115) | def explode_columns(
method _get_columns (line 160) | def _get_columns(
method with_expressions (line 181) | def with_expressions(cls, cols_and_exprs: Dict[str, str]) -> Callable:
method rename (line 207) | def rename(cls, cols: Dict[str, str], escape_col_names: bool = True) -...
method from_avro (line 238) | def from_avro(
method from_avro_with_registry (line 285) | def from_avro_with_registry(
method from_json (line 349) | def from_json(
method to_json (line 412) | def to_json(
method _explode_arrays (line 441) | def _explode_arrays(cls, df: DataFrame, cols_to_explode: List[str]) ->...
method _explode_maps (line 460) | def _explode_maps(cls, df: DataFrame, cols_to_explode: List[str]) -> D...
FILE: lakehouse_engine/transformers/condensers.py
class Condensers (line 15) | class Condensers(object):
method condense_record_mode_cdc (line 21) | def condense_record_mode_cdc(
method group_and_rank (line 91) | def group_and_rank(
FILE: lakehouse_engine/transformers/custom_transformers.py
class CustomTransformers (line 8) | class CustomTransformers(object):
method custom_transformation (line 12) | def custom_transformation(custom_transformer: Callable) -> Callable:
method sql_transformation (line 43) | def sql_transformation(sql: str) -> Callable:
FILE: lakehouse_engine/transformers/data_maskers.py
class DataMaskers (line 12) | class DataMaskers(object):
method hash_masker (line 18) | def hash_masker(
method column_dropper (line 55) | def column_dropper(cls, cols: List[str]) -> Callable:
FILE: lakehouse_engine/transformers/date_transformers.py
class DateTransformers (line 12) | class DateTransformers(object):
method add_current_date (line 18) | def add_current_date(output_col: str) -> Callable:
method convert_to_date (line 38) | def convert_to_date(
method convert_to_timestamp (line 66) | def convert_to_timestamp(
method format_date (line 95) | def format_date(cols: List[str], target_format: Optional[str] = None) ...
method get_date_hierarchy (line 121) | def get_date_hierarchy(cols: List[str], formats: Optional[dict] = None...
FILE: lakehouse_engine/transformers/exceptions.py
class WrongArgumentsException (line 4) | class WrongArgumentsException(Exception):
class UnsupportedStreamingTransformerException (line 10) | class UnsupportedStreamingTransformerException(Exception):
FILE: lakehouse_engine/transformers/filters.py
class Filters (line 12) | class Filters(object):
method incremental_filter (line 18) | def incremental_filter(
method expression_filter (line 89) | def expression_filter(exp: str) -> Callable:
method column_filter_exp (line 107) | def column_filter_exp(exp: List[str]) -> Callable:
method drop_duplicate_rows (line 125) | def drop_duplicate_rows(
FILE: lakehouse_engine/transformers/joiners.py
class Joiners (line 14) | class Joiners(object):
method join (line 20) | def join(
FILE: lakehouse_engine/transformers/null_handlers.py
class NullHandlers (line 10) | class NullHandlers(object):
method replace_nulls (line 16) | def replace_nulls(
FILE: lakehouse_engine/transformers/optimizers.py
class Optimizers (line 11) | class Optimizers(object):
method cache (line 17) | def cache(cls) -> Callable:
method persist (line 34) | def persist(cls, storage_level: str = None) -> Callable:
method unpersist (line 58) | def unpersist(cls, blocking: bool = False) -> Callable:
FILE: lakehouse_engine/transformers/regex_transformers.py
class RegexTransformers (line 11) | class RegexTransformers(object):
method with_regex_value (line 17) | def with_regex_value(
FILE: lakehouse_engine/transformers/repartitioners.py
class Repartitioners (line 11) | class Repartitioners(object):
method coalesce (line 17) | def coalesce(cls, num_partitions: int) -> Callable:
method repartition (line 35) | def repartition(
FILE: lakehouse_engine/transformers/transformer_factory.py
class TransformerFactory (line 24) | class TransformerFactory(object):
method get_transformer (line 80) | def get_transformer(spec: TransformerSpec, data: OrderedDict = None) -...
method _get_spec_args_copy (line 129) | def _get_spec_args_copy(spec_args: dict) -> dict:
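TransformerFactory.get_transformer resolves the function names referenced in a transform spec to the transformer implementations listed above (aggregators, column_creators, filters, and so on). A hedged sketch of a transform spec entry follows, using expression_filter, whose `exp` argument is visible in the Filters listing; the remaining key names should be checked against the ACON documentation.

```python
# Hedged sketch: a transform_spec entry resolved by TransformerFactory.
# "function" must match a transformer name; "args" are passed through to it.
transform_spec = {
    "spec_id": "filtered_sales",
    "input_id": "sales_source",
    "transformers": [
        {
            "function": "expression_filter",
            "args": {"exp": "sales_amount > 0"},
        },
    ],
}
```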
FILE: lakehouse_engine/transformers/unions.py
class Unions (line 11) | class Unions(object):
method union (line 17) | def union(
method union_by_name (line 42) | def union_by_name(
FILE: lakehouse_engine/transformers/watermarker.py
class Watermarker (line 10) | class Watermarker(object):
method with_watermark (line 16) | def with_watermark(watermarker_column: str, watermarker_time: str) -> ...
FILE: lakehouse_engine/utils/acon_utils.py
function validate_manager_list (line 17) | def validate_manager_list(acon: dict) -> list:
function validate_and_resolve_acon (line 38) | def validate_and_resolve_acon(acon: dict, execution_point: str = "") -> ...
function validate_readers (line 62) | def validate_readers(acon: dict) -> None:
function validate_writers (line 82) | def validate_writers(acon: dict) -> None:
function validate_managers (line 99) | def validate_managers(acon: dict, error_list: list = None) -> None:
function validate_mandatory_parameters (line 155) | def validate_mandatory_parameters(acon: dict, expected_params: dict) -> ...
function validate_parameter_types (line 173) | def validate_parameter_types(acon: dict, expected_params: dict) -> list:
function resolve_dq_functions (line 207) | def resolve_dq_functions(acon: dict, execution_point: str) -> dict:
FILE: lakehouse_engine/utils/configs/config_utils.py
class ConfigUtils (line 13) | class ConfigUtils(object):
method get_acon (line 29) | def get_acon(
method get_config (line 52) | def get_config(package: str = "lakehouse_engine.configs") -> Any:
method get_config_from_file (line 68) | def get_config_from_file(config_file_path: str) -> Any:
method get_engine_version (line 83) | def get_engine_version(cls) -> str:
method read_json_acon (line 97) | def read_json_acon(path: str, disable_dbfs_retry: bool = False) -> Any:
method read_sql (line 110) | def read_sql(path: str, disable_dbfs_retry: bool = False) -> Any:
method remove_sensitive_info (line 123) | def remove_sensitive_info(cls, dict_to_replace: dict | list) -> dict |...
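ConfigUtils provides helpers for loading ACONs and SQL from files, which pairs naturally with the engine facade above. A small hedged sketch using read_json_acon, whose signature appears in the listing; the path is a placeholder.

```python
from lakehouse_engine.utils.configs.config_utils import ConfigUtils
from lakehouse_engine.engine import load_data

# Placeholder path: read an ACON stored as JSON and feed it to load_data.
acon = ConfigUtils.read_json_acon("s3://my-bucket/acons/load_sales.json")
load_data(acon=acon)
```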
FILE: lakehouse_engine/utils/databricks_utils.py
class DatabricksUtils (line 15) | class DatabricksUtils(object):
method is_serverless_workload (line 21) | def is_serverless_workload() -> bool:
method get_db_utils (line 33) | def get_db_utils(spark: SparkSession) -> Any:
method get_databricks_job_information (line 56) | def get_databricks_job_information(spark: SparkSession) -> Tuple[str, ...
method _get_dp_name (line 80) | def _get_dp_name(job_name: str) -> str:
method get_spark_conf_values (line 96) | def get_spark_conf_values(usage_stats: dict, spark_confs: dict) -> None:
method get_usage_context_for_serverless (line 137) | def get_usage_context_for_serverless(cls, usage_stats: dict) -> None:
FILE: lakehouse_engine/utils/dq_utils.py
class DQUtils (line 15) | class DQUtils:
method import_dq_rules_from_table (line 19) | def import_dq_rules_from_table(
method validate_dq_functions (line 117) | def validate_dq_functions(
class PrismaUtils (line 166) | class PrismaUtils:
method build_prisma_dq_spec (line 170) | def build_prisma_dq_spec(spec: dict, execution_point: str) -> dict:
method validate_rule_id_duplication (line 240) | def validate_rule_id_duplication(
FILE: lakehouse_engine/utils/engine_usage_stats.py
class EngineUsageStats (line 15) | class EngineUsageStats(object):
method store_engine_usage (line 21) | def store_engine_usage(
method _should_collect_usage (line 75) | def _should_collect_usage(cls, collect_engine_usage: str) -> bool:
method _prepare_usage_stats (line 84) | def _prepare_usage_stats(cls, acon: dict, spark_confs: dict) -> dict:
method _select_usage_path (line 93) | def _select_usage_path(
method _add_metadata_to_stats (line 103) | def _add_metadata_to_stats(
FILE: lakehouse_engine/utils/expectations_utils.py
function validate_result (line 6) | def validate_result(
function _get_example_unexpected_index_list (line 36) | def _get_example_unexpected_index_list(expectation_configuration: Any) -...
function _get_test_unexpected_index_list (line 68) | def _get_test_unexpected_index_list(metric_name: str, metrics: Dict) -> ...
FILE: lakehouse_engine/utils/extraction/jdbc_extraction_utils.py
class JDBCExtractionType (line 14) | class JDBCExtractionType(Enum):
class JDBCExtraction (line 22) | class JDBCExtraction(object):
class JDBCExtractionUtils (line 101) | class JDBCExtractionUtils(object):
method __init__ (line 104) | def __init__(self, jdbc_extraction: Any):
method get_additional_spark_options (line 115) | def get_additional_spark_options(
method get_predicates (line 163) | def get_predicates(self, predicates_query: str) -> List:
method get_spark_jdbc_options (line 207) | def get_spark_jdbc_options(self) -> Tuple[dict, dict]:
method get_spark_jdbc_optimal_upper_bound (line 248) | def get_spark_jdbc_optimal_upper_bound(self) -> Any:
method _get_extraction_partition_opts (line 302) | def _get_extraction_partition_opts(
method _get_max_timestamp (line 322) | def _get_max_timestamp(self, max_timestamp_query: str) -> str:
method _get_delta_query (line 361) | def _get_delta_query(self) -> Tuple[str, str]:
method _get_init_query (line 366) | def _get_init_query(self) -> Tuple[str, str]:
FILE: lakehouse_engine/utils/extraction/sap_b4_extraction_utils.py
class ADSOTypes (line 18) | class ADSOTypes(Enum):
class SAPB4Extraction (line 27) | class SAPB4Extraction(JDBCExtraction):
class SAPB4ExtractionUtils (line 81) | class SAPB4ExtractionUtils(JDBCExtractionUtils):
method __init__ (line 84) | def __init__(self, sap_b4_extraction: SAPB4Extraction):
method get_data_target (line 104) | def get_data_target(input_spec_opt: dict) -> str:
method _get_init_query (line 126) | def _get_init_query(self) -> Tuple[str, str]:
method _get_init_extraction_query (line 143) | def _get_init_extraction_query(self) -> str:
method _get_delta_query (line 166) | def _get_delta_query(self) -> Tuple[str, str]:
method _get_req_status_tbl_filter (line 261) | def _get_req_status_tbl_filter(self) -> Any:
FILE: lakehouse_engine/utils/extraction/sap_bw_extraction_utils.py
class SAPBWExtraction (line 18) | class SAPBWExtraction(JDBCExtraction):
class SAPBWExtractionUtils (line 78) | class SAPBWExtractionUtils(JDBCExtractionUtils):
method __init__ (line 81) | def __init__(self, sap_bw_extraction: SAPBWExtraction):
method get_changelog_table (line 99) | def get_changelog_table(self) -> str:
method get_odsobject (line 175) | def get_odsobject(input_spec_opt: dict) -> str:
method get_logsys_cond (line 194) | def get_logsys_cond(self) -> str:
method _get_init_query (line 208) | def _get_init_query(self) -> Tuple[str, str]:
method _get_init_extraction_query (line 236) | def _get_init_extraction_query(self) -> str:
method _get_init_extraction_query_act_req_timestamp (line 265) | def _get_init_extraction_query_act_req_timestamp(self) -> str:
method _get_delta_query (line 287) | def _get_delta_query(self) -> Tuple[str, str]:
FILE: lakehouse_engine/utils/extraction/sftp_extraction_utils.py
class SFTPInputFormat (line 19) | class SFTPInputFormat(Enum):
class SFTPExtractionFilter (line 28) | class SFTPExtractionFilter(Enum):
class SFTPExtractionUtils (line 38) | class SFTPExtractionUtils(object):
method get_files_list (line 44) | def get_files_list(
method get_sftp_client (line 103) | def get_sftp_client(
method validate_format (line 228) | def validate_format(cls, files_format: str) -> str:
method validate_location (line 252) | def validate_location(cls, location: str) -> str:
method _file_has_pattern (line 264) | def _file_has_pattern(cls, item: SFTPAttributes, options_args: dict) -...
method _file_in_date_interval (line 287) | def _file_in_date_interval(
method _get_earliest_latest_file (line 345) | def _get_earliest_latest_file(
method _get_folder_items (line 389) | def _get_folder_items(
method _get_host_keys (line 421) | def _get_host_keys(cls, pkey: str, key_type: str) -> PKey:
method _is_compressed (line 442) | def _is_compressed(cls, filename: str) -> Any:
method _validate_date (line 454) | def _validate_date(cls, date_text: str) -> datetime:
FILE: lakehouse_engine/utils/file_utils.py
function get_file_names_without_file_type (line 8) | def get_file_names_without_file_type(
function get_directory_path (line 33) | def get_directory_path(path: str) -> str:
FILE: lakehouse_engine/utils/gab_utils.py
class GABUtils (line 18) | class GABUtils(object):
method logger (line 23) | def logger(
method _escape_quote (line 90) | def _escape_quote(cls, to_escape: str) -> str:
method get_json_column_as_dict (line 99) | def get_json_column_as_dict(
method extract_columns_from_mapping (line 125) | def extract_columns_from_mapping(
method _extract_column_with_alias (line 182) | def _extract_column_with_alias(
method _get_column_format_without_alias (line 211) | def _get_column_format_without_alias(
method get_cadence_configuration_at_end_date (line 236) | def get_cadence_configuration_at_end_date(cls, end_date: datetime) -> ...
method get_reconciliation_cadences (line 283) | def get_reconciliation_cadences(
method _get_cadences_to_execute (line 307) | def _get_cadences_to_execute(
method _sort_cadences_to_execute (line 337) | def _sort_cadences_to_execute(
method _get_configured_cadences_by_snapshot (line 367) | def _get_configured_cadences_by_snapshot(
method _generate_reconciliation_by_snapshot (line 410) | def _generate_reconciliation_by_snapshot(
method _add_cadence_snapshot_to_cadence_snapshot_config (line 454) | def _add_cadence_snapshot_to_cadence_snapshot_config(
method format_datetime_to_default (line 474) | def format_datetime_to_default(cls, date_to_format: datetime) -> str:
class GABPartitionUtils (line 483) | class GABPartitionUtils(object):
method get_years (line 489) | def get_years(cls, start_date: str, end_date: str) -> list[str]:
method get_partition_condition (line 509) | def get_partition_condition(cls, start_date: str, end_date: str) -> str:
method _get_multiple_years_partition (line 526) | def _get_multiple_years_partition(
method _get_single_year_partition (line 588) | def _get_single_year_partition(cls, start_date: str, end_date: str) ->...
method _extract_date_part_from_date (line 641) | def _extract_date_part_from_date(cls, part: str, date: str) -> str:
FILE: lakehouse_engine/utils/logging_handler.py
class FilterSensitiveData (line 32) | class FilterSensitiveData(logging.Filter):
method filter (line 35) | def filter(self, record: logging.LogRecord) -> bool: # noqa: A003
class LoggingHandler (line 53) | class LoggingHandler(object):
method __init__ (line 56) | def __init__(self, class_name: str):
method get_logger (line 72) | def get_logger(self) -> logging.Logger:
FILE: lakehouse_engine/utils/rest_api.py
class RestMethods (line 16) | class RestMethods(Enum):
class RestStatusCodes (line 24) | class RestStatusCodes(Enum):
class RESTApiException (line 31) | class RESTApiException(requests.RequestException):
method __init__ (line 34) | def __init__(self, message: str) -> None:
function get_basic_auth (line 43) | def get_basic_auth(username: str, password: str) -> requests.auth.HTTPBa...
function get_configured_session (line 56) | def get_configured_session(
function execute_api_request (line 97) | def execute_api_request(
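The rest_api helpers listed above (`get_basic_auth`, `get_configured_session`, `execute_api_request`) suggest a thin wrapper around `requests`. The sketch below builds a retrying session, which is the kind of object `get_configured_session` presumably returns; the retry count, backoff factor and status codes are assumptions, not the engine's defaults.

```python
# Illustrative sketch only: a requests session with retry/backoff, roughly the
# kind of thing a helper like get_configured_session presumably builds.
# The retried status codes and methods below are assumptions.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def get_configured_session(retries: int = 3, backoff: float = 0.5) -> requests.Session:
    session = requests.Session()
    retry = Retry(
        total=retries,
        backoff_factor=backoff,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST", "PUT"],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session
```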
FILE: lakehouse_engine/utils/schema_utils.py
class SchemaUtils (line 15) | class SchemaUtils(object):
method from_file (line 21) | def from_file(file_path: str, disable_dbfs_retry: bool = False) -> Str...
method from_file_to_dict (line 37) | def from_file_to_dict(file_path: str, disable_dbfs_retry: bool = False...
method from_dict (line 51) | def from_dict(struct_type: dict) -> StructType:
method from_table_schema (line 64) | def from_table_schema(table: str) -> StructType:
method from_input_spec (line 76) | def from_input_spec(cls, input_spec: InputSpec) -> Optional[StructType]:
method _get_prefix_alias (line 110) | def _get_prefix_alias(num_chars: int, prefix: str, shorten_names: bool...
method schema_flattener (line 121) | def schema_flattener(
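`SchemaUtils.from_dict(struct_type: dict) -> StructType` above implies converting a JSON-like schema dict into a Spark `StructType`. The sketch below uses PySpark's own `StructType.fromJson`, a plausible building block for such a helper; whether the engine adds validation or extra handling on top is an assumption.

```python
# Illustrative sketch only, assuming a helper like SchemaUtils.from_dict wraps
# Spark's StructType.fromJson to turn a JSON-like dict into a Spark schema.
from pyspark.sql.types import StructType

schema_dict = {
    "type": "struct",
    "fields": [
        {"name": "id", "type": "integer", "nullable": True, "metadata": {}},
        {"name": "name", "type": "string", "nullable": True, "metadata": {}},
    ],
}

# StructType.fromJson is a genuine PySpark API; any extra behavior in the
# engine's helper is not reproduced here.
schema = StructType.fromJson(schema_dict)
print(schema.simpleString())  # struct<id:int,name:string>
```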
FILE: lakehouse_engine/utils/sharepoint_utils.py
class SharepointUtils (line 30) | class SharepointUtils(object):
method __init__ (line 33) | def __init__(
method _get_token (line 98) | def _get_token(self) -> None:
method _create_app (line 107) | def _create_app(self) -> None:
method _make_request (line 129) | def _make_request(
method _parse_json (line 185) | def _parse_json(self, response: requests.Response, context: str) -> Di...
method _get_site_id (line 219) | def _get_site_id(self) -> str:
method _get_drive_id (line 261) | def _get_drive_id(self) -> str:
method check_if_endpoint_exists (line 302) | def check_if_endpoint_exists(
method check_if_local_path_exists (line 344) | def check_if_local_path_exists(self, local_path: str) -> None:
method save_to_staging_area (line 358) | def save_to_staging_area(self, sp_file: SharepointFile) -> str:
method download_file_streaming (line 390) | def download_file_streaming(self, sp_file: SharepointFile) -> str:
method write_bytes_to_local_file (line 427) | def write_bytes_to_local_file(self, sp_file: SharepointFile) -> str:
method write_to_local_path (line 457) | def write_to_local_path(self, df: DataFrame) -> None:
method _rename_local_file (line 486) | def _rename_local_file(self, local_path: str, file_name: str) -> None:
method write_to_sharepoint (line 504) | def write_to_sharepoint(self) -> None:
method delete_local_path (line 566) | def delete_local_path(self) -> None:
method staging_area (line 581) | def staging_area(self) -> Generator[str, None, None]:
method list_items_in_path (line 598) | def list_items_in_path(self, path: str) -> list[Any]:
method get_file_metadata (line 664) | def get_file_metadata(self, file_path: str) -> SharepointFile:
method archive_sharepoint_file (line 723) | def archive_sharepoint_file(
method _rename_sharepoint_file (line 767) | def _rename_sharepoint_file(self, sp_file: SharepointFile) -> str:
method _move_file_in_sharepoint (line 827) | def _move_file_in_sharepoint(self, sp_file: SharepointFile, to_path: s...
method _create_folder_in_sharepoint (line 900) | def _create_folder_in_sharepoint(self, folder_path: str) -> None:
FILE: lakehouse_engine/utils/spark_utils.py
class SparkUtils (line 8) | class SparkUtils(object):
method create_temp_view (line 12) | def create_temp_view(
FILE: lakehouse_engine/utils/sql_parser_utils.py
class SQLParserUtils (line 6) | class SQLParserUtils(object):
method split_sql_commands (line 9) | def split_sql_commands(
method _split_sql_commands (line 34) | def _split_sql_commands(self) -> list[str]:
method _get_substring (line 109) | def _get_substring(self, first_char: int = None, last_char: int = None...
method _validate_command_is_closed (line 121) | def _validate_command_is_closed(self, index: int, dependencies: int) -...
method _character_validation (line 149) | def _character_validation(self, value: str | list) -> bool:
method _add_new_command (line 165) | def _add_new_command(self, sql_command: str) -> None:
method _update_value (line 173) | def _update_value(self, value: int, operation: str, condition: bool = ...
FILE: lakehouse_engine/utils/storage/dbfs_storage.py
class DBFSStorage (line 11) | class DBFSStorage(FileStorage):
method get_file_payload (line 18) | def get_file_payload(cls, url: ParseResult) -> Any:
method write_payload_to_file (line 36) | def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
FILE: lakehouse_engine/utils/storage/file_storage.py
class FileStorage (line 8) | class FileStorage(ABC):
method get_file_payload (line 13) | def get_file_payload(cls, url: ParseResult) -> Any:
method write_payload_to_file (line 26) | def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
FILE: lakehouse_engine/utils/storage/file_storage_functions.py
class FileStorageFunctions (line 15) | class FileStorageFunctions(ABC): # noqa: B024
method read_json (line 19) | def read_json(cls, path: str, disable_dbfs_retry: bool = False) -> Any:
method read_sql (line 50) | def read_sql(cls, path: str, disable_dbfs_retry: bool = False) -> Any:
method write_payload (line 81) | def write_payload(
method is_boto3_configured (line 108) | def is_boto3_configured() -> bool:
FILE: lakehouse_engine/utils/storage/local_fs_storage.py
class LocalFSStorage (line 11) | class LocalFSStorage(FileStorage):
method get_file_payload (line 17) | def get_file_payload(cls, url: ParseResult) -> TextIO:
method write_payload_to_file (line 30) | def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
FILE: lakehouse_engine/utils/storage/s3_storage.py
class S3Storage (line 12) | class S3Storage(FileStorage):
method get_file_payload (line 18) | def get_file_payload(cls, url: ParseResult) -> Any:
method write_payload_to_file (line 36) | def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
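The storage modules above follow an abstract-base pattern: `FileStorage` declares `get_file_payload`/`write_payload_to_file`, and DBFS, local-FS and S3 subclasses implement them. The sketch below reproduces that shape with only a local backend and a hypothetical scheme dispatcher; the return types and dispatch rules are assumptions for illustration.

```python
# Minimal sketch of the storage abstraction suggested by the listing above:
# an abstract FileStorage with per-backend subclasses keyed by URL scheme.
# Method names follow the listing; the dispatch logic is an assumption.
from abc import ABC, abstractmethod
from urllib.parse import ParseResult, urlparse


class FileStorage(ABC):
    @classmethod
    @abstractmethod
    def get_file_payload(cls, url: ParseResult) -> str:
        """Read the content behind the given URL."""

    @classmethod
    @abstractmethod
    def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
        """Write content to the location behind the given URL."""


class LocalFSStorage(FileStorage):
    @classmethod
    def get_file_payload(cls, url: ParseResult) -> str:
        with open(url.path, "r", encoding="utf-8") as handle:
            return handle.read()

    @classmethod
    def write_payload_to_file(cls, url: ParseResult, content: str) -> None:
        with open(url.path, "w", encoding="utf-8") as handle:
            handle.write(content)


def storage_for(path: str) -> type[FileStorage]:
    # Hypothetical dispatch: a real implementation would also map s3:// and dbfs:/.
    url = urlparse(path)
    if url.scheme in ("", "file"):
        return LocalFSStorage
    raise NotImplementedError(f"No storage backend for scheme: {url.scheme!r}")
```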
FILE: lakehouse_engine_usage/managerhelper/operations-script.js
constant TABLE_OPERATIONS (line 94) | const TABLE_OPERATIONS = {
constant FILE_OPERATIONS (line 231) | const FILE_OPERATIONS = {
function initializeTabs (line 320) | function initializeTabs() {
function switchTab (line 333) | function switchTab(tabId) {
function initializeEventListeners (line 355) | function initializeEventListeners() {
function handleTableOperationChange (line 378) | function handleTableOperationChange() {
function handleFileOperationChange (line 393) | function handleFileOperationChange() {
function showNoOperationSelected (line 408) | function showNoOperationSelected(container) {
function renderDynamicFields (line 423) | function renderDynamicFields(container, operationDef, type) {
function renderField (line 448) | function renderField(field, type) {
function validateField (line 495) | function validateField(input) {
function clearFieldValidation (line 538) | function clearFieldValidation(input) {
function updateAddButtonState (line 555) | function updateAddButtonState() {
function addCurrentOperation (line 565) | function addCurrentOperation() {
function removeOperation (line 630) | function removeOperation(id) {
function clearAllOperations (line 641) | function clearAllOperations() {
function renderOperationsList (line 657) | function renderOperationsList() {
function updateGenerateButtonState (line 699) | function updateGenerateButtonState() {
function generateJSON (line 711) | function generateJSON() {
function displayJSON (line 747) | function displayJSON(config) {
function highlightJSON (line 757) | function highlightJSON() {
function formatJSON (line 773) | function formatJSON() {
function validateJSON (line 793) | function validateJSON() {
function showValidationResult (line 834) | function showValidationResult(isValid, message) {
function copyToClipboard (line 843) | async function copyToClipboard() {
function downloadJSON (line 869) | function downloadJSON() {
function enableActionButtons (line 903) | function enableActionButtons() {
function showLoading (line 911) | function showLoading() {
function hideLoading (line 918) | function hideLoading() {
function showToast (line 927) | function showToast(message, type = 'success') {
function saveToLocalStorage (line 946) | function saveToLocalStorage() {
function loadFromLocalStorage (line 959) | function loadFromLocalStorage() {
FILE: samples/tpch_load_and_analysis_tutorial.py
function is_a_super_vip (line 40) | def is_a_super_vip(df: DataFrame) -> DataFrame:
FILE: tests/conftest.py
function patch_databricks_utils_job_info (line 23) | def patch_databricks_utils_job_info() -> Generator:
function pytest_addoption (line 33) | def pytest_addoption(parser: Any) -> Any:
function spark_driver_memory (line 43) | def spark_driver_memory(request: Any) -> Any:
function prepare_exec_env (line 49) | def prepare_exec_env(spark_driver_memory: str) -> None:
function before_each_test (line 59) | def before_each_test() -> Generator:
function test_session_closure (line 66) | def test_session_closure(request: Any) -> None:
FILE: tests/feature/custom_expectations/test_custom_expectations.py
function test_custom_expectation (line 171) | def test_custom_expectation(scenario: dict, caplog: Any) -> None:
function _clean_folders (line 218) | def _clean_folders(expectation_name: str) -> None:
function _generate_acon (line 228) | def _generate_acon(
function _generate_dataframe (line 268) | def _generate_dataframe(load_type: str, expectation_name: str) -> DataFr...
function _get_result_and_control_dfs (line 305) | def _get_result_and_control_dfs(
FILE: tests/feature/custom_expectations/test_expectation_validity.py
function test_expectation_validity (line 35) | def test_expectation_validity(expectation: str) -> None:
function _run_diagnostics (line 58) | def _run_diagnostics(expectation_name: str) -> tuple:
function _process_diagnostics_output (line 87) | def _process_diagnostics_output(diagnostics_output: str) -> None:
function _validate_metric_name_structure (line 104) | def _validate_metric_name_structure(metric_name: str) -> int:
FILE: tests/feature/data_loader_custom_transformer/test_data_loader_custom_transformer_calculate_kpi.py
function yet_another_kpi_calculator (line 26) | def yet_another_kpi_calculator(df: DataFrame) -> DataFrame:
function get_test_acon (line 47) | def get_test_acon() -> dict:
function test_calculate_kpi_and_merge (line 93) | def test_calculate_kpi_and_merge(scenario: str) -> None:
FILE: tests/feature/data_loader_custom_transformer/test_data_loader_custom_transformer_delta_load.py
function multiply_by_100 (line 26) | def multiply_by_100(df: DataFrame) -> DataFrame:
function get_test_acon (line 39) | def get_test_acon() -> dict:
function test_delta_load (line 140) | def test_delta_load(scenario: str) -> None:
function _create_table (line 191) | def _create_table(table_name: str, location: str) -> None:
FILE: tests/feature/data_loader_custom_transformer/test_data_loader_custom_transformer_sql_transformation.py
function get_test_acon (line 31) | def get_test_acon() -> dict:
function test_sql_transformation_and_merge (line 78) | def test_sql_transformation_and_merge(scenario: str) -> None:
FILE: tests/feature/delta_load/test_delta_load_group_and_rank.py
function test_delta_load_group_and_rank (line 37) | def test_delta_load_group_and_rank(scenario: List[str]) -> None:
function execute_loads (line 81) | def execute_loads(scenario: List[str], iteration: int) -> None:
function _create_table (line 122) | def _create_table(scenario: List[str]) -> None:
FILE: tests/feature/delta_load/test_delta_load_merge_options.py
function test_delta_load_merge_options (line 31) | def test_delta_load_merge_options(scenario: List[str]) -> None:
function execute_loads (line 72) | def execute_loads(scenario: List[str]) -> None:
FILE: tests/feature/delta_load/test_delta_load_record_mode_cdc.py
function test_batch_delta_load (line 35) | def test_batch_delta_load(scenario: List[str]) -> None:
function test_file_by_file (line 87) | def test_file_by_file(scenario: str) -> None:
function test_backfill (line 157) | def test_backfill(scenario: str) -> None:
function test_direct_silver_load (line 214) | def test_direct_silver_load(scenario: str) -> None:
function _create_table (line 273) | def _create_table(table_name: str, location: str) -> None:
FILE: tests/feature/test_append_load.py
function test_permissive_jdbc_append_load (line 27) | def test_permissive_jdbc_append_load(scenario: str) -> None:
function test_failfast_append_load (line 70) | def test_failfast_append_load(scenario: str) -> None:
function test_streaming_dropmalformed (line 98) | def test_streaming_dropmalformed(scenario: str) -> None:
function test_streaming_with_terminators (line 140) | def test_streaming_with_terminators(scenario: str, caplog: Any) -> None:
function _append_data_into_source (line 178) | def _append_data_into_source(scenario: str) -> None:
FILE: tests/feature/test_data_quality.py
function test_load_with_dq_validator (line 80) | def test_load_with_dq_validator(scenario: dict) -> None:
function test_load_with_dq_validator_table (line 243) | def test_load_with_dq_validator_table(scenario: dict) -> None:
function test_validator_dq_spec (line 528) | def test_validator_dq_spec(scenario: dict, caplog: Any) -> None:
function test_chunked_result_sink (line 816) | def test_chunked_result_sink(scenario: dict, caplog: Any) -> None:
function _test_result_structure (line 905) | def _test_result_structure(df: DataFrame) -> None:
function _prepare_validation_df (line 922) | def _prepare_validation_df(df: DataFrame) -> DataFrame:
FILE: tests/feature/test_dq_validator.py
function test_dq_rule_id_uniqueness (line 101) | def test_dq_rule_id_uniqueness(scenario: dict, caplog: Any) -> None:
function test_dq_validator (line 360) | def test_dq_validator(scenario: dict, caplog: Any) -> None:
function test_dq_validator_two_runs (line 507) | def test_dq_validator_two_runs(scenario: dict, caplog: Any) -> None:
function _clean_folders (line 585) | def _clean_folders() -> None:
function _create_table (line 596) | def _create_table(table_name: str) -> None:
function _execute_load (line 622) | def _execute_load(load_type: str) -> None:
function _generate_acon (line 644) | def _generate_acon(
function _generate_dataframe (line 706) | def _generate_dataframe(load_type: str) -> DataFrame:
function _get_result_and_control_dfs (line 734) | def _get_result_and_control_dfs(
FILE: tests/feature/test_engine_usage_stats.py
function custom_transformation (line 33) | def custom_transformation(df: DataFrame) -> DataFrame:
function _get_test_acon (line 45) | def _get_test_acon(scenario_name: str) -> dict:
function test_load_data (line 111) | def test_load_data(scenario: str) -> None:
function test_table_manager (line 139) | def test_table_manager(scenario: str) -> None:
function test_dq_validator (line 167) | def test_dq_validator(scenario: str) -> None:
function _prepare_and_compare_dfs (line 224) | def _prepare_and_compare_dfs(scenario: str) -> None:
function _prepare_df_comparison (line 252) | def _prepare_df_comparison(df: DataFrame) -> str:
FILE: tests/feature/test_extract_from_sap_b4.py
function test_extract_aq_dso (line 175) | def test_extract_aq_dso(scenario: dict) -> None:
function test_extract_cl_dso (line 195) | def test_extract_cl_dso(scenario: dict) -> None:
function _execute_and_validate (line 214) | def _execute_and_validate(scenario: dict, extra_params: dict) -> None:
function _execute_load (line 244) | def _execute_load(
function _get_test_acon (line 284) | def _get_test_acon(
function _prepare_files (line 362) | def _prepare_files(scenario: str, extra_params: dict) -> None:
function _load_test_table (line 397) | def _load_test_table(
function _validate (line 437) | def _validate(scenario: str, extra_params: dict, min_timestamp: bool) ->...
FILE: tests/feature/test_extract_from_sap_bw.py
function test_extract_dso (line 210) | def test_extract_dso(scenario: dict, caplog: LogCaptureFixture) -> None:
function test_extract_write_optimised_dso (line 232) | def test_extract_write_optimised_dso(scenario: dict, caplog: LogCaptureF...
function _execute_and_validate (line 253) | def _execute_and_validate(
function _execute_load (line 305) | def _execute_load(
function _get_test_acon (line 345) | def _get_test_acon(
function _prepare_files (line 446) | def _prepare_files(scenario: str, extra_params: dict) -> None:
function _load_test_table (line 491) | def _load_test_table(
function _validate (line 531) | def _validate(scenario: str, extra_params: dict, min_timestamp: bool) ->...
function test_changelog_table_name_derivation (line 578) | def test_changelog_table_name_derivation(scenario: dict) -> None:
FILE: tests/feature/test_file_manager.py
function test_file_manager (line 17) | def test_file_manager(caplog: Any) -> None:
function _test_file_manager_copy (line 52) | def _test_file_manager_copy(caplog: Any, s3_cli: Any) -> None:
function _test_file_manager_delete (line 80) | def _test_file_manager_delete(caplog: Any, s3_cli: Any) -> None:
function test_file_manager_restore_archive (line 113) | def test_file_manager_restore_archive(scenario: dict, caplog: Any) -> None:
function _test_file_manager_restore_check (line 146) | def _test_file_manager_restore_check(caplog: Any, s3_cli: Any, s3_res: A...
function _test_file_manager_restore_request (line 179) | def _test_file_manager_restore_request(caplog: Any, s3_cli: Any, s3_res:...
function test_file_manager_restore_sync (line 220) | def test_file_manager_restore_sync(scenario: dict, caplog: Any) -> None:
function _test_file_manager_restore_sync (line 253) | def _test_file_manager_restore_sync(caplog: Any, s3_cli: Any, s3_res: An...
function _test_file_manager_restore_sync_retrieval_tier_exception (line 299) | def _test_file_manager_restore_sync_retrieval_tier_exception(caplog: Any...
FILE: tests/feature/test_file_manager_dbfs.py
class FileInfoFixture (line 23) | class FileInfoFixture:
method isDir (line 30) | def isDir(self) -> bool:
method isFile (line 38) | def isFile(self) -> bool:
class DBUtilsFixture (line 47) | class DBUtilsFixture:
method __init__ (line 50) | def __init__(self) -> None:
method cp (line 55) | def cp(src: str, dest: str, recurse: bool = False) -> None:
method ls (line 71) | def ls(path: str) -> list:
method mkdirs (line 84) | def mkdirs(path: str) -> None:
method mv (line 93) | def mv(src: str, dest: str, recurse: bool = False) -> None:
method put (line 109) | def put(path: str, content: str, overwrite: bool = False) -> None:
method rm (line 125) | def rm(path: str, recurse: bool = False) -> None:
function dbutils_fixture (line 141) | def dbutils_fixture() -> Iterator[None]:
function test_file_manager_dbfs (line 152) | def test_file_manager_dbfs(_patch: Any, caplog: Any) -> None:
function _list_objects (line 184) | def _list_objects(path: str, objects_list: list, dbutils: Any) -> list:
function _test_file_manager_dbfs_copy (line 195) | def _test_file_manager_dbfs_copy(caplog: Any, dbutils: Any) -> None:
function _test_file_manager_dbfs_delete (line 229) | def _test_file_manager_dbfs_delete(caplog: Any, dbutils: Any) -> None:
function _test_file_manager_dbfs_move (line 261) | def _test_file_manager_dbfs_move(caplog: Any, dbutils: Any) -> None:
FILE: tests/feature/test_file_manager_s3.py
function test_get_caller_identity_with_default_credentials (line 18) | def test_get_caller_identity_with_default_credentials() -> None:
function test_file_manager_s3 (line 24) | def test_file_manager_s3(caplog: Any) -> None:
function _test_file_manager_s3_copy (line 60) | def _test_file_manager_s3_copy(caplog: Any, s3_cli: Any) -> None:
function _test_file_manager_s3_delete (line 93) | def _test_file_manager_s3_delete(caplog: Any, s3_cli: Any) -> None:
function test_file_manager_s3_restore_archive (line 128) | def test_file_manager_s3_restore_archive(scenario: dict, caplog: Any) ->...
function _test_file_manager_s3_restore_check (line 162) | def _test_file_manager_s3_restore_check(caplog: Any, s3_cli: Any, s3_res...
function _test_file_manager_s3_restore_request (line 195) | def _test_file_manager_s3_restore_request(
function test_file_manager_s3_restore_sync (line 238) | def test_file_manager_s3_restore_sync(scenario: dict, caplog: Any) -> None:
function _test_file_manager_s3_restore_sync (line 272) | def _test_file_manager_s3_restore_sync(caplog: Any, s3_cli: Any, s3_res:...
function _test_file_manager_s3_restore_sync_retrieval_tier_exception (line 318) | def _test_file_manager_s3_restore_sync_retrieval_tier_exception(caplog: ...
FILE: tests/feature/test_full_load.py
function test_batch_full_load (line 34) | def test_batch_full_load(scenario: List[str]) -> None:
FILE: tests/feature/test_gab.py
function _create_gab_tables (line 42) | def _create_gab_tables() -> None:
function _generate_calendar_test_dates (line 53) | def _generate_calendar_test_dates() -> list:
function _transform_dates_list_to_dataframe (line 65) | def _transform_dates_list_to_dataframe(dates: list) -> DataFrame:
function _feed_dim_calendar (line 79) | def _feed_dim_calendar(df: DataFrame) -> DataFrame:
function _feed_table_with_test_data (line 138) | def _feed_table_with_test_data(
function _create_and_load_source_data_for_use_case (line 192) | def _create_and_load_source_data_for_use_case(source_table: str) -> None:
function _import_use_case_sql (line 208) | def _import_use_case_sql(use_case_name: str) -> None:
function _setup_use_case (line 220) | def _setup_use_case(use_case_name: str) -> None:
function _gab_setup (line 231) | def _gab_setup() -> None:
function _run_setup_use_case (line 265) | def _run_setup_use_case(request: SubRequest) -> None:
function test_gold_asset_builder (line 340) | def test_gold_asset_builder(scenario: dict, caplog: Any) -> None:
FILE: tests/feature/test_heartbeat.py
function _create_heartbeat_table (line 36) | def _create_heartbeat_table(scenario_name: str, tables: dict) -> None:
function _test_heartbeat_sensor_data_feed (line 53) | def _test_heartbeat_sensor_data_feed(
function _test_execute_sensor_heartbeat (line 88) | def _test_execute_sensor_heartbeat(
function _test_update_heartbeat_sensor_status (line 128) | def _test_update_heartbeat_sensor_status(
function _trigger_heartbeat_sensor_jobs (line 183) | def _trigger_heartbeat_sensor_jobs(
function test_heartbeat (line 287) | def test_heartbeat(scenario: dict) -> None:
FILE: tests/feature/test_jdbc_reader.py
function test_jdbc_reader (line 56) | def test_jdbc_reader(scenario: List[str]) -> None:
FILE: tests/feature/test_materialize_cdf.py
function test_streaming_with_cdf (line 30) | def test_streaming_with_cdf(scenario: str, caplog: Any) -> None:
FILE: tests/feature/test_notification.py
function test_email_notification (line 133) | def test_email_notification(scenario: dict) -> None:
function test_email_notification_facade (line 273) | def test_email_notification_facade(scenario: dict) -> None:
function _parse_email_output (line 332) | def _parse_email_output(
FILE: tests/feature/test_reconciliation.py
function test_reconciliation (line 306) | def test_reconciliation(scenario: str, caplog: Any) -> None:
function test_nulls_and_zero_values_and_threshold (line 389) | def test_nulls_and_zero_values_and_threshold(
FILE: tests/feature/test_schema_evolution.py
function prepare_tests (line 30) | def prepare_tests() -> Generator:
function test_schema_evolution_delta_load (line 107) | def test_schema_evolution_delta_load(scenario: str) -> None:
function test_schema_evolution_append_load (line 308) | def test_schema_evolution_append_load(scenario: str) -> None:
function test_schema_evolution_full_load (line 463) | def test_schema_evolution_full_load(scenario: str) -> None:
function _create_table (line 531) | def _create_table(table_name: str, location: str) -> None:
FILE: tests/feature/test_sensors.py
function test_table_sensor (line 64) | def test_table_sensor(scenario: list) -> None:
function test_if_sensor_already_exists (line 179) | def test_if_sensor_already_exists(scenario: dict) -> None:
function test_jdbc_sensor (line 255) | def test_jdbc_sensor(scenario: str) -> None:
function test_files_sensor (line 331) | def test_files_sensor() -> None:
function test_update_sensor_status (line 366) | def test_update_sensor_status() -> None:
function _insert_data_into_upstream_table (line 402) | def _insert_data_into_upstream_table(
function _insert_files_sensor_test_data (line 428) | def _insert_files_sensor_test_data(files_location: str) -> StructType:
function _insert_into_jdbc_table (line 452) | def _insert_into_jdbc_table(
FILE: tests/feature/test_sftp_reader.py
function sftp_client (line 43) | def sftp_client(sftpserver: SFTPServer) -> Generator:
function test_sftp_reader_csv (line 86) | def test_sftp_reader_csv(
function test_sftp_reader_fwf (line 145) | def test_sftp_reader_fwf(
function test_sftp_reader_gz_file (line 194) | def test_sftp_reader_gz_file(
function test_sftp_reader_json (line 243) | def test_sftp_reader_json(
function test_sftp_reader_mult_files (line 292) | def test_sftp_reader_mult_files(
function test_sftp_reader_xml (line 348) | def test_sftp_reader_xml(
function test_sftp_reader_zip_file (line 404) | def test_sftp_reader_zip_file(
function test_sftp_server_available (line 442) | def test_sftp_server_available(sftpserver: SFTPServer) -> None:
function _execute_and_validate (line 453) | def _execute_and_validate(
function _get_test_acon (line 509) | def _get_test_acon(
function remote_location (line 548) | def remote_location() -> dict:
function rename_remote_files (line 589) | def rename_remote_files(sftp_client: SFTPClient) -> None:
FILE: tests/feature/test_sharepoint_reader.py
function _read_bytes (line 114) | def _read_bytes(path_value: str) -> bytes:
function _get_output_path_by_scenario (line 119) | def _get_output_path_by_scenario() -> Dict[str, str]:
function _setup_sharepoint_reader_mocks_for_success (line 157) | def _setup_sharepoint_reader_mocks_for_success(
function _assert_archive_calls_for_success (line 226) | def _assert_archive_calls_for_success(
function _assert_sharepoint_reader_success_output (line 291) | def _assert_sharepoint_reader_success_output(
function test_sharepoint_reader_success (line 338) | def test_sharepoint_reader_success(
function test_sharepoint_reader_failures (line 391) | def test_sharepoint_reader_failures(
function test_sharepoint_reader_exceptions (line 612) | def test_sharepoint_reader_exceptions(
FILE: tests/feature/test_sharepoint_writer.py
function test_sharepoint_writer_exceptions (line 104) | def test_sharepoint_writer_exceptions(
function test_sharepoint_writer (line 180) | def test_sharepoint_writer(
FILE: tests/feature/test_table_manager.py
function test_table_manager (line 86) | def test_table_manager(scenarios: dict, caplog: Any) -> None:
FILE: tests/feature/test_writers.py
function test_write_to_files (line 53) | def test_write_to_files(scenario: dict) -> None:
function test_write_to_rest_api (line 85) | def test_write_to_rest_api(scenario: dict) -> None:
function test_write_to_jdbc (line 112) | def test_write_to_jdbc(scenario: dict) -> None:
function test_write_to_table (line 147) | def test_write_to_table(scenario: dict) -> None:
function test_write_to_console (line 177) | def test_write_to_console(scenario: dict, capsys: Any) -> None:
function test_write_to_dataframe (line 206) | def test_write_to_dataframe(scenario: dict, capsys: Any) -> None:
function test_write_to_dataframe_checkpoints (line 259) | def test_write_to_dataframe_checkpoints(scenario: dict, capsys: Any) -> ...
function test_multiple_write_to_dataframe (line 320) | def test_multiple_write_to_dataframe(scenario: dict, capsys: Any) -> None:
function test_write_to_dataframe_exception (line 367) | def test_write_to_dataframe_exception(scenario: dict, capsys: Any) -> None:
function _generate_acon_from_source (line 418) | def _generate_acon_from_source(source: OrderedDict) -> dict:
function _prepare_files (line 458) | def _prepare_files(iteration: int = 0) -> None:
FILE: tests/feature/transformations/test_chain_transformations.py
function test_chain_transformations (line 37) | def test_chain_transformations(scenario: dict, caplog: Any) -> None:
function _prepare_files (line 100) | def _prepare_files() -> None:
FILE: tests/feature/transformations/test_column_creators.py
function test_column_creators (line 28) | def test_column_creators(scenario: str) -> None:
FILE: tests/feature/transformations/test_column_reshapers.py
function test_column_reshapers (line 34) | def test_column_reshapers(scenario: dict) -> None:
FILE: tests/feature/transformations/test_data_maskers.py
function test_data_maskers (line 29) | def test_data_maskers(scenario: str) -> None:
FILE: tests/feature/transformations/test_date_transformers.py
function test_date_transformers (line 29) | def test_date_transformers(scenario: str) -> None:
FILE: tests/feature/transformations/test_drop_duplicate_rows.py
function test_drop_duplicate_rows (line 31) | def test_drop_duplicate_rows(scenario: str) -> None:
FILE: tests/feature/transformations/test_joiners.py
function test_joiners (line 37) | def test_joiners(scenario: List[str]) -> None:
FILE: tests/feature/transformations/test_multiple_transformations.py
function test_multiple_transformations (line 28) | def test_multiple_transformations(scenario: str) -> None:
FILE: tests/feature/transformations/test_null_handlers.py
function test_replace_nulls (line 29) | def test_replace_nulls(scenario: str) -> None:
FILE: tests/feature/transformations/test_optimizers.py
function is_df_cached (line 15) | def is_df_cached(df: DataFrame) -> DataFrame:
function is_df_not_cached (line 30) | def is_df_not_cached(df: DataFrame) -> DataFrame:
function test_optimizer (line 46) | def test_optimizer(scenario: str) -> None:
function _get_test_acon (line 58) | def _get_test_acon(read_type: str) -> dict:
FILE: tests/feature/transformations/test_regex_transformers.py
function test_regex_transformers (line 29) | def test_regex_transformers(scenario: str) -> None:
FILE: tests/feature/transformations/test_unions.py
function test_unions (line 45) | def test_unions(scenario: List[str]) -> None:
function copy_data_files (line 116) | def copy_data_files(iteration: int) -> None:
FILE: tests/feature/transformations/test_watermarker.py
function test_drop_duplicates_with_watermark (line 33) | def test_drop_duplicates_with_watermark(scenario: dict) -> None:
function test_joins_with_watermark (line 93) | def test_joins_with_watermark(scenario: dict) -> None:
function _drop_and_create_table (line 159) | def _drop_and_create_table(table_name: str, location: str) -> None:
FILE: tests/resources/feature/materialize_cdf/data/table/streaming_with_cdf.sql
type test_db (line 1) | CREATE TABLE test_db.streaming_with_cdf (salesorder INT, item INT, date ...
FILE: tests/resources/feature/table_manager/create/table/test_table_complex_default_scenario.sql
type test_db (line 3) | CREATE TABLE test_db.DummyTableBronzeComplexDefaultScenario1
FILE: tests/resources/feature/table_manager/create/table/test_table_complex_different_delimiter_scenario.sql
type test_db (line 3) | CREATE TABLE test_db.DummyTableBronzeComplexDifferentDelimiterScenario1
FILE: tests/resources/feature/table_manager/create/table/test_table_simple_split_scenario.sql
type test_db (line 1) | CREATE TABLE test_db.DummyTableBronzeSimpleSplitScenario
FILE: tests/resources/feature/table_manager/create/view/test_view_complex_default_scenario.sql
type test_db (line 3) | CREATE VIEW test_db.DummyViewBronzeComplexDefaultScenario1 (id,col1,col2...
type test_db (line 8) | CREATE VIEW test_db.DummyViewBronzeComplexDefaultScenario2 (id,col1,col2...
FILE: tests/resources/feature/table_manager/create/view/test_view_complex_different_delimiter_scenario.sql
type test_db (line 3) | CREATE VIEW test_db.DummyViewBronzeComplexDifferentDelimiterScenario1 (i...
FILE: tests/resources/feature/table_manager/create/view/test_view_simple_split_scenario.sql
type test_db (line 1) | CREATE VIEW test_db.DummyViewBronzeSimpleSplitScenario (id,col1,col2,col...
FILE: tests/unit/test_acon_validation.py
function test_manager_validation (line 93) | def test_manager_validation(scenario: dict) -> None:
FILE: tests/unit/test_custom_configs.py
function test_custom_config (line 13) | def test_custom_config() -> None:
FILE: tests/unit/test_databricks_utils.py
function test_get_usage_context_for_serverless (line 27) | def test_get_usage_context_for_serverless() -> None:
FILE: tests/unit/test_failure_notification_creation.py
function test_failure_notification_creation (line 43) | def test_failure_notification_creation(scenario: dict) -> None:
function _parse_email_output (line 73) | def _parse_email_output(mail_content: str) -> str:
FILE: tests/unit/test_heartbeat_acon_creation.py
function _create_heartbeat_table (line 24) | def _create_heartbeat_table() -> None:
function _select_all (line 36) | def _select_all(table: str) -> DataFrame:
function _check_acon (line 45) | def _check_acon(heartbeat_table: str, acon: dict, acon_result_list: dict...
function test_get_sensor_acon (line 219) | def test_get_sensor_acon(mock_get_db_utils: Mock, scenario: dict) -> None:
FILE: tests/unit/test_heartbeat_anchor_job.py
function _create_heartbeat_table (line 22) | def _create_heartbeat_table() -> None:
function test_anchor_job (line 104) | def test_anchor_job(mock_run_job: Mock, scenario: dict) -> None:
FILE: tests/unit/test_log_filter_sensitive_data.py
function test_log_filter_sensitive_data (line 65) | def test_log_filter_sensitive_data(caplog: Any) -> None:
FILE: tests/unit/test_notification_creation.py
function test_notification_creation (line 74) | def test_notification_creation(scenario: dict) -> None:
function test_office365_notification_creation (line 132) | def test_office365_notification_creation(scenario: TerminatorSpec) -> None:
FILE: tests/unit/test_notification_factory.py
function test_notification_factory (line 48) | def test_notification_factory(scenario: dict) -> None:
FILE: tests/unit/test_prisma_dq_rule_id.py
function test_prisma_manual_function_definition (line 130) | def test_prisma_manual_function_definition(scenario: dict) -> None:
FILE: tests/unit/test_prisma_function_definition.py
function test_prisma_manual_function_definition (line 102) | def test_prisma_manual_function_definition(scenario: dict) -> None:
FILE: tests/unit/test_rest_api_functions.py
function test_send_payload_to_rest_api_simple_params (line 22) | def test_send_payload_to_rest_api_simple_params(_: Any, caplog: Any) -> ...
function test_send_payload_to_rest_api_with_file_params (line 54) | def test_send_payload_to_rest_api_with_file_params(_: Any, caplog: Any) ...
FILE: tests/unit/test_sensor.py
function test_create_sensor (line 82) | def test_create_sensor(scenario: dict, capsys: Any) -> None:
function test_sensor_already_exists (line 234) | def test_sensor_already_exists(scenario: dict, capsys: Any) -> None:
class TestExecuteSensor (line 263) | class TestExecuteSensor:
method setup_class (line 273) | def setup_class(cls) -> None:
method teardown_class (line 278) | def teardown_class(cls) -> None:
method test_execute_stream_sensor (line 303) | def test_execute_stream_sensor(self, scenario: dict, capsys: Any) -> N...
method test_execute_batch_sensor (line 355) | def test_execute_batch_sensor(self, scenario: dict, capsys: Any) -> None:
method test_execute_sensor_raise_no_input_spec_format_implemented (line 432) | def test_execute_sensor_raise_no_input_spec_format_implemented(
method test_execute_sensor_raise_no_new_data_exception (line 467) | def test_execute_sensor_raise_no_new_data_exception(
FILE: tests/unit/test_sensor_manager.py
function test_sensor_update_set (line 58) | def test_sensor_update_set(scenario: dict, capsys: Any) -> None:
function test_sensor_data (line 96) | def test_sensor_data(scenario: dict, capsys: Any) -> None:
function test_check_if_sensor_has_acquired_data (line 170) | def test_check_if_sensor_has_acquired_data(scenario: dict, capsys: Any) ...
function control_table_fixture (line 193) | def control_table_fixture() -> DataFrame:
function test_read_sensor_table_data (line 271) | def test_read_sensor_table_data(
function test_has_new_data (line 330) | def test_has_new_data(scenario: dict, capsys: Any) -> None:
function test_if_generate_filter_exp_preprocess_query (line 461) | def test_if_generate_filter_exp_preprocess_query(scenario: dict, capsys:...
function test_generate_sensor_table_preprocess_query (line 521) | def test_generate_sensor_table_preprocess_query(scenario: dict, capsys: ...
function dataframe_fixture (line 536) | def dataframe_fixture() -> DataFrame:
function test_read_new_data (line 562) | def test_read_new_data(
function test_generate_sensor_sap_logchain_query (line 680) | def test_generate_sensor_sap_logchain_query(scenario: dict, capsys: Any)...
function _prepare_new_data_tests (line 728) | def _prepare_new_data_tests(return_empty_df: bool = False) -> DataFrame:
FILE: tests/unit/test_sharepoint_csv_reader.py
class DummySharepointOptions (line 14) | class DummySharepointOptions:
method __init__ (line 22) | def __init__(self, local_options: Dict[str, Any]) -> None:
class DummyInputSpec (line 27) | class DummyInputSpec:
method __init__ (line 34) | def __init__(self, sharepoint_options: DummySharepointOptions) -> None:
function create_csv_reader (line 39) | def create_csv_reader(local_options: Dict[str, Any]) -> SharepointCsvRea...
function test_detect_delimiter_uses_user_provided_delimiter (line 55) | def test_detect_delimiter_uses_user_provided_delimiter() -> None:
function test_detect_delimiter_autodetects_semicolon (line 66) | def test_detect_delimiter_autodetects_semicolon() -> None:
function test_detect_delimiter_defaults_to_comma_on_decode_error (line 77) | def test_detect_delimiter_defaults_to_comma_on_decode_error() -> None:
function test_resolve_csv_options_prefers_sep_over_delimiter (line 88) | def test_resolve_csv_options_prefers_sep_over_delimiter() -> None:
function test_resolve_spark_csv_options_uses_delimiter_when_sep_missing (line 106) | def test_resolve_spark_csv_options_uses_delimiter_when_sep_missing() -> ...
function test_resolve_spark_csv_options_autodetects_when_no_delimiter_provided (line 118) | def test_resolve_spark_csv_options_autodetects_when_no_delimiter_provide...
function test_resolve_spark_csv_options_warns_when_expected_columns_names_mismatch (line 133) | def test_resolve_spark_csv_options_warns_when_expected_columns_names_mis...
function test_resolve_spark_csv_options_warns_when_expected_columns_validation_fails (line 160) | def test_resolve_spark_csv_options_warns_when_expected_columns_validatio...
FILE: tests/unit/test_spark_session.py
function test_spark_session (line 9) | def test_spark_session() -> None:
FILE: tests/unit/test_version.py
function test_version (line 8) | def test_version() -> None:
FILE: tests/utils/dataframe_helpers.py
class DataframeHelpers (line 26) | class DataframeHelpers(object):
method has_diff (line 32) | def has_diff(
method read_from_file (line 73) | def read_from_file(
method read_from_table (line 104) | def read_from_table(db_table: str, options: Optional[dict] = None) -> ...
method read_from_jdbc (line 123) | def read_from_jdbc(
method write_into_jdbc_table (line 145) | def write_into_jdbc_table(
method create_empty_dataframe (line 174) | def create_empty_dataframe(struct_type: StructType) -> DataFrame:
method create_dataframe (line 187) | def create_dataframe(data: list, schema: StructType) -> DataFrame:
method create_delta_table (line 201) | def create_delta_table(
FILE: tests/utils/dq_rules_table_utils.py
function _create_dq_functions_source_table (line 7) | def _create_dq_functions_source_table(
FILE: tests/utils/exec_env_helpers.py
class ExecEnvHelpers (line 6) | class ExecEnvHelpers(object):
method prepare_exec_env (line 10) | def prepare_exec_env(spark_driver_memory: str) -> None:
method set_exec_env_config (line 30) | def set_exec_env_config(cls, key: str, value: str) -> None:
method reset_default_spark_session_configs (line 35) | def reset_default_spark_session_configs(cls) -> None:
FILE: tests/utils/local_storage.py
class LocalStorage (line 13) | class LocalStorage(object):
method copy_file (line 17) | def copy_file(from_path: str, to_path: str) -> None:
method clean_folder (line 29) | def clean_folder(folder_path: str) -> None:
method delete_file (line 38) | def delete_file(file_path: str) -> None:
method read_file (line 48) | def read_file(file_path: str) -> str:
method copy_dir (line 59) | def copy_dir(source: str, destination: str) -> None:
FILE: tests/utils/mocks.py
class MockRESTResponse (line 9) | class MockRESTResponse:
method __init__ (line 12) | def __init__(
method json (line 30) | def json(self) -> Optional[dict[str, Any]]:
method __enter__ (line 37) | def __enter__(self) -> MockRESTResponse:
method __exit__ (line 41) | def __exit__(
FILE: tests/utils/smtp_server.py
class SMTPHandler (line 12) | class SMTPHandler(Message):
method __init__ (line 15) | def __init__(self) -> None:
method handle_message (line 20) | def handle_message(self, message: Any) -> None:
class SMTPServer (line 32) | class SMTPServer:
method __init__ (line 37) | def __init__(self, host: str, port: int) -> None:
method start (line 49) | def start(self) -> None:
method stop (line 57) | def stop(self) -> None:
method get_messages (line 63) | def get_messages(self) -> list:
method clear_messages (line 67) | def clear_messages(self) -> None:
method get_last_message (line 71) | def get_last_message(self) -> Any:
Condensed preview: 1183 files, each showing path, character count, and a content snippet (3,658K chars in the full structured content).
[
{
"path": ".github/ISSUE_TEMPLATE/bug_report.md",
"chars": 778,
"preview": "---\nname: Bug report\nabout: Create a Bug report to help us improve\ntitle: \"[BUG] Function X is raising error Y\"\nlabels: "
},
{
"path": ".github/ISSUE_TEMPLATE/feature_request.md",
"chars": 678,
"preview": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: \"[FEATURE] I would like to have the capability "
},
{
"path": ".github/pull_request_template.md",
"chars": 646,
"preview": "- [ ] Description of PR changes above includes a link to [an existing GitHub issue](https://github.com/adidas/lakehouse-"
},
{
"path": ".gitignore",
"chars": 2288,
"preview": "# mac os hidden files\n.DS_Store\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensio"
},
{
"path": "CONTRIBUTING.md",
"chars": 3348,
"preview": "# How to Contribute\n\n📖 Search algorithms, transformations and check implementation details & examples in our [documentat"
},
{
"path": "LICENSE.txt",
"chars": 11339,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "Makefile",
"chars": 14336,
"preview": "SHELL := /bin/bash -euxo pipefail\n\ncontainer_cli := docker\nimage_name := lakehouse-engine\ndeploy_env := dev\nproject_vers"
},
{
"path": "README.md",
"chars": 19166,
"preview": "<img align=\"right\" src=\"assets/img/lakehouse_engine_logo_symbol_small.png\" alt=\"Lakehouse Engine Logo\">\n\n# Lakehouse Eng"
},
{
"path": "assets/gab/metadata/gab/f_agg_dummy_sales_kpi/1_article_category.sql",
"chars": 538,
"preview": "SELECT\n \"category_a\" AS category_name\n ,\"article1\" AS article_id\nUNION\nSELECT\n \"category_a\" AS category_name\n "
},
{
"path": "assets/gab/metadata/gab/f_agg_dummy_sales_kpi/2_f_agg_dummy_sales_kpi.sql",
"chars": 689,
"preview": "SELECT\n {% if replace_offset_value == 0 %} {{ project_date_column }}\n {% else %} ({{ project_date_column }} + inte"
},
{
"path": "assets/gab/metadata/tables/dim_calendar.sql",
"chars": 1178,
"preview": "DROP TABLE IF EXISTS `database`.dim_calendar;\nCREATE EXTERNAL TABLE `database`.dim_calendar (\n calendar_date DATE COMM"
},
{
"path": "assets/gab/metadata/tables/dummy_sales_kpi.sql",
"chars": 513,
"preview": "DROP TABLE IF EXISTS `database`.`dummy_sales_kpi`;\nCREATE EXTERNAL TABLE `database`.`dummy_sales_kpi` (\n `order_date` D"
},
{
"path": "assets/gab/metadata/tables/gab_log_events.sql",
"chars": 1097,
"preview": "DROP TABLE IF EXISTS `database`.`gab_log_events`;\nCREATE EXTERNAL TABLE `database`.`gab_log_events`\n(\n`run_start_time` T"
},
{
"path": "assets/gab/metadata/tables/gab_use_case_results.sql",
"chars": 3568,
"preview": "DROP TABLE IF EXISTS `database`.`gab_use_case_results`;\nCREATE EXTERNAL TABLE `database`.`gab_use_case_results`\n(\n`query"
},
{
"path": "assets/gab/metadata/tables/lkp_query_builder.sql",
"chars": 1294,
"preview": "DROP TABLE IF EXISTS `database`.`lkp_query_builder`;\nCREATE EXTERNAL TABLE `database`.`lkp_query_builder`\n(\n`query_id` I"
},
{
"path": "assets/gab/notebooks/gab.py",
"chars": 4535,
"preview": "# Databricks notebook source\nfrom datetime import datetime, timedelta\n\nfrom lakehouse_engine.engine import execute_gab\nf"
},
{
"path": "assets/gab/notebooks/gab_dim_calendar.py",
"chars": 2823,
"preview": "# Databricks notebook source\n# MAGIC %md\n# MAGIC # This notebook holds the calendar used as part of the GAB framework.\n\n"
},
{
"path": "assets/gab/notebooks/gab_job_manager.py",
"chars": 5459,
"preview": "# Databricks notebook source\nimport os\n\nNOTEBOOK_CONTEXT = dbutils.notebook.entry_point.getDbutils().notebook().getConte"
},
{
"path": "assets/gab/notebooks/query_builder_helper.py",
"chars": 14394,
"preview": "# Databricks notebook source\n# MAGIC %md\n# MAGIC # Import Utils\n\n# COMMAND ----------\n\n# MAGIC %run ../utils/query_build"
},
{
"path": "assets/gab/utils/databricks_job_utils.py",
"chars": 8003,
"preview": "# Databricks notebook source\n# imports\nimport enum\nfrom typing import Tuple\nfrom uuid import UUID\n\nimport requests\n\n\n# C"
},
{
"path": "assets/gab/utils/query_builder_utils.py",
"chars": 40066,
"preview": "# Databricks notebook source\nimport json\nimport re\n\nfrom databricks.sdk.runtime import *\n\n\nclass QueryBuilderUtils:\n "
},
{
"path": "cicd/.bumpversion.cfg",
"chars": 175,
"preview": "[bumpversion]\ncurrent_version = 2.0.0\ncommit = False\ntag = False\n\n[bumpversion:file:pyproject.toml]\nsearch = version = \""
},
{
"path": "cicd/Dockerfile",
"chars": 1457,
"preview": "ARG PYTHON_IMAGE=python:3.12-slim-bullseye\n\nFROM $PYTHON_IMAGE\n\nARG USER_ID=1000\nARG GROUP_ID=1000\nARG CPU_ARCHITECTURE\n"
},
{
"path": "cicd/Jenkinsfile",
"chars": 3073,
"preview": "@Library(['GlobalJenkinsLibrary']) _\n\npipeline {\n options {\n buildDiscarder(logRotator(numToKeepStr: '30', art"
},
{
"path": "cicd/Jenkinsfile_deploy",
"chars": 8956,
"preview": "pipeline {\n parameters {\n string(name: 'BRANCH', defaultValue: 'master', description: 'Branch to use for the d"
},
{
"path": "cicd/bandit.yaml",
"chars": 32,
"preview": "assert_used:\n skips: ['*test*']"
},
{
"path": "cicd/code_doc/content.css",
"chars": 9181,
"preview": "/*\nThis CSS file contains all style definitions for documentation content.\n\nAll selectors are scoped with \".pdoc\".\nThis "
},
{
"path": "cicd/code_doc/custom_example_macros.py",
"chars": 5648,
"preview": "\"\"\"Macro methods to be used on Lakehouse Engine Docs.\"\"\"\nimport warnings\nimport json\nimport pygments.formatters.html\nfro"
},
{
"path": "cicd/code_doc/examples.json",
"chars": 3183,
"preview": "{\n \"base_link\":\"https://github.com/adidas/lakehouse-engine/blob/master/\",\n \"get_max_value\": \"tests/resources/feature/d"
},
{
"path": "cicd/code_doc/gen_ref_nav.py",
"chars": 2028,
"preview": "\"\"\"Module to generate code reference docs.\"\"\"\n\n# Import necessary libraries\nfrom pathlib import Path\nimport mkdocs_gen_f"
},
{
"path": "cicd/code_doc/index.html.jinja2",
"chars": 959,
"preview": "{% set root_module_name = \"\" %}\n{% extends \"default/index.html.jinja2\" %}\n{% block title %}Lakehouse Engine Documentatio"
},
{
"path": "cicd/code_doc/mkdocs.yml",
"chars": 9636,
"preview": "site_name: Lakehouse Engine Documentation\nsite_url: https://adidas.github.io/lakehouse-engine-docs\nrepo_url: https://git"
},
{
"path": "cicd/code_doc/mkdocs_macros.py",
"chars": 7551,
"preview": "\"\"\"Macro methods to be used on Lakehouse Engine Docs.\"\"\"\nimport warnings\nimport json\nimport pygments.formatters.html\nfro"
},
{
"path": "cicd/code_doc/module.html.jinja2",
"chars": 3549,
"preview": "{#\nOn this Jinja template we're extending a pre-existing template,\ncopying the block on which we would like to make chan"
},
{
"path": "cicd/code_doc/render_doc.py",
"chars": 5900,
"preview": "\"\"\"Module for customizing pdoc documentation.\"\"\"\n\nimport json\nimport os\nimport shutil\nimport warnings\nfrom pathlib impor"
},
{
"path": "cicd/code_doc/render_docs.py",
"chars": 2167,
"preview": "\"\"\"Module for customizing mkdocs documentation.\"\"\"\n\n# Import necessary libraries\nimport os\nimport shutil\nfrom pathlib im"
},
{
"path": "cicd/flake8.conf",
"chars": 306,
"preview": "[flake8]\nmax-line-length = 88\nextend-ignore = E203\ninline-quotes=double\ndocstring-quotes=\"\"\"\nmax-expression-complexity=1"
},
{
"path": "cicd/meta.yaml",
"chars": 316,
"preview": "dev_deploy_bucket: s3://sample-dev-bucket\nprod_deploy_bucket: s3://sample-prod-bucket\narm_python_image: arm64v8/python:3"
},
{
"path": "cicd/requirements.txt",
"chars": 378,
"preview": "# The main dependencies without which the core functionalities of the project will not work.\n# These dependencies are no"
},
{
"path": "cicd/requirements_azure.txt",
"chars": 567,
"preview": "# Dependencies necessary for azure related features to work (ex: mail notifications using o365).\n#\n# ! Do not forget run"
},
{
"path": "cicd/requirements_cicd.txt",
"chars": 1578,
"preview": "# Dependencies necessary for the Lakehouse Engine CICD (tests, linting, deployment,...).\n#\n# ! Do not forget running `ma"
},
{
"path": "cicd/requirements_dq.txt",
"chars": 460,
"preview": "# Dependencies necessary for the Data Quality features to work.\n#\n# ! Do not forget running `make build-lock-files` afte"
},
{
"path": "cicd/requirements_os.txt",
"chars": 342,
"preview": "# Special requirements from which the project depends on, but for which some use cases might use environments with\n# the"
},
{
"path": "cicd/requirements_sftp.txt",
"chars": 116,
"preview": "#\n# ! Do not forget running `make build-lock-files` after updating dependency list !\n#\nparamiko==4.0.0\npynacl==1.6.2"
},
{
"path": "cicd/requirements_sharepoint.txt",
"chars": 134,
"preview": "#\n# ! Do not forget running `make build-lock-files` after updating dependency list !\n#\ntenacity==9.0.0\nmsal==1.32.3\nazur"
},
{
"path": "lakehouse_engine/__init__.py",
"chars": 70,
"preview": "\"\"\"Lakehouse engine package containing all the system subpackages.\"\"\"\n"
},
{
"path": "lakehouse_engine/algorithms/__init__.py",
"chars": 62,
"preview": "\"\"\"Package containing all the lakehouse engine algorithms.\"\"\"\n"
},
{
"path": "lakehouse_engine/algorithms/algorithm.py",
"chars": 5835,
"preview": "\"\"\"Module containing the Algorithm class.\"\"\"\n\nfrom typing import List, Tuple\n\nfrom lakehouse_engine.core.definitions imp"
},
{
"path": "lakehouse_engine/algorithms/data_loader.py",
"chars": 25954,
"preview": "\"\"\"Module to define DataLoader class.\"\"\"\n\nfrom collections import OrderedDict\nfrom copy import deepcopy\nfrom logging imp"
},
{
"path": "lakehouse_engine/algorithms/dq_validator.py",
"chars": 6814,
"preview": "\"\"\"Module to define Data Validator class.\"\"\"\n\nfrom delta.tables import DeltaTable\nfrom pyspark.sql import DataFrame\nfrom"
},
{
"path": "lakehouse_engine/algorithms/exceptions.py",
"chars": 539,
"preview": "\"\"\"Package defining all the algorithm custom exceptions.\"\"\"\n\n\nclass ReconciliationFailedException(Exception):\n \"\"\"Exc"
},
{
"path": "lakehouse_engine/algorithms/gab.py",
"chars": 34402,
"preview": "\"\"\"Module to define Gold Asset Builder algorithm behavior.\"\"\"\n\nimport copy\nfrom datetime import datetime, timedelta\n\nimp"
},
{
"path": "lakehouse_engine/algorithms/reconciliator.py",
"chars": 12866,
"preview": "\"\"\"Module containing the Reconciliator class.\"\"\"\n\nfrom enum import Enum\nfrom typing import List\n\nimport pyspark.sql.func"
},
{
"path": "lakehouse_engine/algorithms/sensor.py",
"chars": 6073,
"preview": "\"\"\"Module to define Sensor algorithm behavior.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.algorithms.a"
},
{
"path": "lakehouse_engine/algorithms/sensors/__init__.py",
"chars": 79,
"preview": "\"\"\"Package containing all the lakehouse engine Sensor Heartbeat algorithms.\"\"\"\n"
},
{
"path": "lakehouse_engine/algorithms/sensors/heartbeat.py",
"chars": 33218,
"preview": "\"\"\"Module to define Heartbeat Sensor algorithm behavior.\"\"\"\n\nimport re\nfrom typing import Optional\n\nfrom delta import De"
},
{
"path": "lakehouse_engine/algorithms/sensors/sensor.py",
"chars": 5991,
"preview": "\"\"\"Module to define Sensor algorithm behavior.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.algorithms.a"
},
{
"path": "lakehouse_engine/configs/__init__.py",
"chars": 73,
"preview": "\"\"\"This module receives a config file which is included in the wheel.\"\"\"\n"
},
{
"path": "lakehouse_engine/configs/engine.yaml",
"chars": 745,
"preview": "dq_bucket: s3://sample-dq-bucket\ndq_dev_bucket: s3://sample-dq-dev-bucket\ndq_functions_column_list:\n - dq_rule_id\n - e"
},
{
"path": "lakehouse_engine/core/__init__.py",
"chars": 63,
"preview": "\"\"\"Package with the core behaviour of the lakehouse engine.\"\"\"\n"
},
{
"path": "lakehouse_engine/core/dbfs_file_manager.py",
"chars": 8751,
"preview": "\"\"\"File manager module using dbfs.\"\"\"\n\nfrom lakehouse_engine.core.file_manager import FileManager\nfrom lakehouse_engine."
},
{
"path": "lakehouse_engine/core/definitions.py",
"chars": 75076,
"preview": "\"\"\"Definitions of standard values and structures for core components.\"\"\"\n\nfrom dataclasses import dataclass\nfrom datetim"
},
{
"path": "lakehouse_engine/core/exec_env.py",
"chars": 6475,
"preview": "\"\"\"Module to take care of creating a singleton of the execution environment class.\"\"\"\n\nfrom dataclasses import replace\n\n"
},
{
"path": "lakehouse_engine/core/executable.py",
"chars": 446,
"preview": "\"\"\"Module representing an executable lakehouse engine component.\"\"\"\n\nfrom abc import ABC, abstractmethod\nfrom typing imp"
},
{
"path": "lakehouse_engine/core/file_manager.py",
"chars": 2487,
"preview": "\"\"\"Module for abstract representation of a file manager system.\"\"\"\n\nfrom abc import ABC, abstractmethod\nfrom typing impo"
},
{
"path": "lakehouse_engine/core/gab_manager.py",
"chars": 32395,
"preview": "\"\"\"Module to define GAB Manager classes.\"\"\"\n\nimport calendar\nfrom datetime import datetime, timedelta\nfrom typing import"
},
{
"path": "lakehouse_engine/core/gab_sql_generator.py",
"chars": 19841,
"preview": "\"\"\"Module to define GAB SQL classes.\"\"\"\n\nimport ast\nimport json\nfrom abc import ABC, abstractmethod\nfrom typing import A"
},
{
"path": "lakehouse_engine/core/s3_file_manager.py",
"chars": 21479,
"preview": "\"\"\"File manager module using boto3.\"\"\"\n\nimport time\nfrom typing import Any, Optional, Tuple\n\nimport boto3\n\nfrom lakehous"
},
{
"path": "lakehouse_engine/core/sensor_manager.py",
"chars": 15029,
"preview": "\"\"\"Module to define Sensor Manager classes.\"\"\"\n\nimport json\nfrom datetime import datetime\nfrom typing import List, Optio"
},
{
"path": "lakehouse_engine/core/table_manager.py",
"chars": 10659,
"preview": "\"\"\"Table manager module.\"\"\"\n\nfrom typing import List\n\nfrom delta.tables import DeltaTable\nfrom pyspark.sql import DataFr"
},
{
"path": "lakehouse_engine/dq_processors/__init__.py",
"chars": 82,
"preview": "\"\"\"Package to define data quality processes available in the lakehouse engine.\"\"\"\n"
},
{
"path": "lakehouse_engine/dq_processors/custom_expectations/__init__.py",
"chars": 83,
"preview": "\"\"\"Package containing custom DQ expectations available in the lakehouse engine.\"\"\"\n"
},
{
"path": "lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_a_to_be_not_equal_to_b.py",
"chars": 6491,
"preview": "\"\"\"Expectation to check if column 'a' is not equal to column 'b'.\"\"\"\n\nfrom typing import Any, Dict, Optional\n\nfrom great"
},
{
"path": "lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_a_to_be_smaller_or_equal_than_b.py",
"chars": 7007,
"preview": "\"\"\"Expectation to check if column 'a' is lower or equal than column 'b'.\"\"\"\n\nfrom typing import Any, Dict, Optional\n\nfro"
},
{
"path": "lakehouse_engine/dq_processors/custom_expectations/expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b.py",
"chars": 7158,
"preview": "\"\"\"Expectation to check if date column 'a' is greater or equal to date column 'b'.\"\"\"\n\nimport datetime\nfrom typing impor"
},
{
"path": "lakehouse_engine/dq_processors/custom_expectations/expect_column_values_to_be_date_not_older_than.py",
"chars": 7863,
"preview": "\"\"\"Expectation to check if column value is a date within a timeframe.\"\"\"\n\nimport datetime\nfrom datetime import timedelta"
},
{
"path": "lakehouse_engine/dq_processors/custom_expectations/expect_column_values_to_not_be_null_or_empty_string.py",
"chars": 6202,
"preview": "\"\"\"Expectation to check if column value is not null or empty string.\"\"\"\n\nfrom typing import Any, Dict, Optional\n\nfrom gr"
},
{
"path": "lakehouse_engine/dq_processors/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c.py",
"chars": 6705,
"preview": "\"\"\"Expectation to check if column 'a' equals 'b', or 'c'.\"\"\"\n\nfrom typing import Any, Dict, Literal, Optional\n\nfrom grea"
},
{
"path": "lakehouse_engine/dq_processors/custom_expectations/expect_queried_column_agg_value_to_be.py",
"chars": 14393,
"preview": "\"\"\"Expectation to check if aggregated column satisfy the condition.\"\"\"\n\nfrom typing import Any, Dict, Optional\n\nfrom gre"
},
{
"path": "lakehouse_engine/dq_processors/dq_factory.py",
"chars": 37990,
"preview": "\"\"\"Module containing the class definition of the Data Quality Factory.\"\"\"\n\nimport importlib\nimport json\nimport random\nfr"
},
{
"path": "lakehouse_engine/dq_processors/exceptions.py",
"chars": 528,
"preview": "\"\"\"Package defining all the DQ custom exceptions.\"\"\"\n\n\nclass DQValidationsFailedException(Exception):\n \"\"\"Exception f"
},
{
"path": "lakehouse_engine/dq_processors/validator.py",
"chars": 11065,
"preview": "\"\"\"Module containing the definition of a data quality validator.\"\"\"\n\nfrom typing import Any, List\n\nfrom great_expectatio"
},
{
"path": "lakehouse_engine/engine.py",
"chars": 17935,
"preview": "\"\"\"Contract of the lakehouse engine with all the available functions to be executed.\"\"\"\n\nfrom typing import List, Option"
},
{
"path": "lakehouse_engine/io/__init__.py",
"chars": 85,
"preview": "\"\"\"Input and Output package responsible for the behaviour of reading and writing.\"\"\"\n"
},
{
"path": "lakehouse_engine/io/exceptions.py",
"chars": 1583,
"preview": "\"\"\"Package defining all the io custom exceptions.\"\"\"\n\n\nclass IncrementalFilterInputNotFoundException(Exception):\n \"\"\""
},
{
"path": "lakehouse_engine/io/reader.py",
"chars": 798,
"preview": "\"\"\"Defines abstract reader behaviour.\"\"\"\n\nfrom abc import ABC, abstractmethod\n\nfrom pyspark.sql import DataFrame\n\nfrom l"
},
{
"path": "lakehouse_engine/io/reader_factory.py",
"chars": 2690,
"preview": "\"\"\"Module for reader factory.\"\"\"\n\nfrom abc import ABC\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.core.def"
},
{
"path": "lakehouse_engine/io/readers/__init__.py",
"chars": 51,
"preview": "\"\"\"Readers package to define reading behaviour.\"\"\"\n"
},
{
"path": "lakehouse_engine/io/readers/dataframe_reader.py",
"chars": 718,
"preview": "\"\"\"Module to define behaviour to read from dataframes.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.core"
},
{
"path": "lakehouse_engine/io/readers/file_reader.py",
"chars": 2337,
"preview": "\"\"\"Module to define behaviour to read from files.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.core.defi"
},
{
"path": "lakehouse_engine/io/readers/jdbc_reader.py",
"chars": 2449,
"preview": "\"\"\"Module to define behaviour to read from JDBC sources.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.co"
},
{
"path": "lakehouse_engine/io/readers/kafka_reader.py",
"chars": 864,
"preview": "\"\"\"Module to define behaviour to read from Kafka.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.core.defi"
},
{
"path": "lakehouse_engine/io/readers/query_reader.py",
"chars": 735,
"preview": "\"\"\"Module to define behaviour to read from a query.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.core.de"
},
{
"path": "lakehouse_engine/io/readers/sap_b4_reader.py",
"chars": 7462,
"preview": "\"\"\"Module to define behaviour to read from SAP B4 sources.\"\"\"\n\nfrom logging import Logger\nfrom typing import Tuple\n\nfrom"
},
{
"path": "lakehouse_engine/io/readers/sap_bw_reader.py",
"chars": 7802,
"preview": "\"\"\"Module to define behaviour to read from SAP BW sources.\"\"\"\n\nfrom logging import Logger\nfrom typing import Tuple\n\nfrom"
},
{
"path": "lakehouse_engine/io/readers/sftp_reader.py",
"chars": 5445,
"preview": "\"\"\"Module to define behaviour to read from SFTP.\"\"\"\n\nimport gzip\nfrom datetime import datetime\nfrom io import TextIOWrap"
},
{
"path": "lakehouse_engine/io/readers/sharepoint_reader.py",
"chars": 24568,
"preview": "\"\"\"Module to define the behaviour to read from Sharepoint.\"\"\"\n\nimport csv\nimport fnmatch\nimport time\nfrom functools impo"
},
{
"path": "lakehouse_engine/io/readers/table_reader.py",
"chars": 1301,
"preview": "\"\"\"Module to define behaviour to read from tables.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.core.def"
},
{
"path": "lakehouse_engine/io/writer.py",
"chars": 4721,
"preview": "\"\"\"Defines abstract writer behaviour.\"\"\"\n\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Callable, Dict, Li"
},
{
"path": "lakehouse_engine/io/writer_factory.py",
"chars": 3140,
"preview": "\"\"\"Module for writer factory.\"\"\"\n\nfrom abc import ABC\nfrom typing import OrderedDict\n\nfrom pyspark.sql import DataFrame\n"
},
{
"path": "lakehouse_engine/io/writers/__init__.py",
"chars": 67,
"preview": "\"\"\"Package containing the writers responsible for writing data.\"\"\"\n"
},
{
"path": "lakehouse_engine/io/writers/console_writer.py",
"chars": 4527,
"preview": "\"\"\"Module to define behaviour to write to console.\"\"\"\n\nfrom typing import Callable, OrderedDict\n\nfrom pyspark.sql import"
},
{
"path": "lakehouse_engine/io/writers/dataframe_writer.py",
"chars": 7579,
"preview": "\"\"\"Module to define behaviour to write to dataframe.\"\"\"\n\nimport uuid\nfrom typing import Callable, Optional, OrderedDict\n"
},
{
"path": "lakehouse_engine/io/writers/delta_merge_writer.py",
"chars": 7810,
"preview": "\"\"\"Module to define the behaviour of delta merges.\"\"\"\n\nfrom typing import Callable, Optional, OrderedDict\n\nfrom delta.ta"
},
{
"path": "lakehouse_engine/io/writers/file_writer.py",
"chars": 3997,
"preview": "\"\"\"Module to define behaviour to write to files.\"\"\"\n\nfrom typing import Callable, OrderedDict\n\nfrom pyspark.sql import D"
},
{
"path": "lakehouse_engine/io/writers/jdbc_writer.py",
"chars": 3183,
"preview": "\"\"\"Module that defines the behaviour to write to JDBC targets.\"\"\"\n\nfrom typing import Callable, OrderedDict\n\nfrom pyspar"
},
{
"path": "lakehouse_engine/io/writers/kafka_writer.py",
"chars": 3788,
"preview": "\"\"\"Module that defines the behaviour to write to Kafka.\"\"\"\n\nfrom typing import Callable, OrderedDict\n\nfrom pyspark.sql i"
},
{
"path": "lakehouse_engine/io/writers/rest_api_writer.py",
"chars": 8585,
"preview": "\"\"\"Module to define behaviour to write to REST APIs.\"\"\"\n\nimport json\nfrom typing import Any, Callable, OrderedDict\n\nfrom"
},
{
"path": "lakehouse_engine/io/writers/sharepoint_writer.py",
"chars": 4615,
"preview": "\"\"\"Module to define the behaviour to write to Sharepoint.\"\"\"\n\nimport os\nfrom typing import OrderedDict\n\nfrom pyspark.sql"
},
{
"path": "lakehouse_engine/io/writers/table_writer.py",
"chars": 5878,
"preview": "\"\"\"Module that defines the behaviour to write to tables.\"\"\"\n\nfrom typing import Any, Callable, OrderedDict\n\nfrom pyspark"
},
{
"path": "lakehouse_engine/terminators/__init__.py",
"chars": 87,
"preview": "\"\"\"Package to define algorithm terminators (e.g., vacuum, optimize, compute stats).\"\"\"\n"
},
{
"path": "lakehouse_engine/terminators/cdf_processor.py",
"chars": 4558,
"preview": "\"\"\"Defines change data feed processor behaviour.\"\"\"\n\nfrom datetime import datetime, timedelta\nfrom typing import Ordered"
},
{
"path": "lakehouse_engine/terminators/dataset_optimizer.py",
"chars": 5333,
"preview": "\"\"\"Module with dataset optimizer terminator.\"\"\"\n\nfrom typing import List, Optional\n\nfrom pyspark.sql.utils import Analys"
},
{
"path": "lakehouse_engine/terminators/notifier.py",
"chars": 3263,
"preview": "\"\"\"Module with notification terminator.\"\"\"\n\nfrom abc import ABC, abstractmethod\n\nfrom jinja2 import Template\n\nfrom lakeh"
},
{
"path": "lakehouse_engine/terminators/notifier_factory.py",
"chars": 2604,
"preview": "\"\"\"Module for notifier factory.\"\"\"\n\nfrom lakehouse_engine.core.definitions import NotifierType, TerminatorSpec\nfrom lake"
},
{
"path": "lakehouse_engine/terminators/notifiers/__init__.py",
"chars": 28,
"preview": "\"\"\"Notifications module.\"\"\"\n"
},
{
"path": "lakehouse_engine/terminators/notifiers/email_notifier.py",
"chars": 11714,
"preview": "\"\"\"Module with email notifier.\"\"\"\n\nimport asyncio\nimport smtplib\nfrom email.mime.application import MIMEApplication\nfrom"
},
{
"path": "lakehouse_engine/terminators/notifiers/exceptions.py",
"chars": 538,
"preview": "\"\"\"Package defining all the Notifier custom exceptions.\"\"\"\n\n\nclass NotifierNotFoundException(Exception):\n \"\"\"Exceptio"
},
{
"path": "lakehouse_engine/terminators/notifiers/notification_templates.py",
"chars": 497,
"preview": "\"\"\"Email notification templates.\"\"\"\n\n\nclass NotificationsTemplates(object):\n \"\"\"Templates for notifications.\"\"\"\n\n "
},
{
"path": "lakehouse_engine/terminators/sensor_terminator.py",
"chars": 2127,
"preview": "\"\"\"Module with sensor terminator.\"\"\"\n\nfrom typing import List\n\nfrom lakehouse_engine.core.definitions import SensorSpec,"
},
{
"path": "lakehouse_engine/terminators/spark_terminator.py",
"chars": 457,
"preview": "\"\"\"Module with spark terminator.\"\"\"\n\nfrom lakehouse_engine.core.exec_env import ExecEnv\nfrom lakehouse_engine.utils.logg"
},
{
"path": "lakehouse_engine/terminators/terminator_factory.py",
"chars": 1991,
"preview": "\"\"\"Module with the factory pattern to return terminators.\"\"\"\n\nfrom typing import Optional\n\nfrom pyspark.sql import DataF"
},
{
"path": "lakehouse_engine/transformers/__init__.py",
"chars": 72,
"preview": "\"\"\"Package to define transformers available in the lakehouse engine.\"\"\"\n"
},
{
"path": "lakehouse_engine/transformers/aggregators.py",
"chars": 964,
"preview": "\"\"\"Aggregators module.\"\"\"\n\nfrom typing import Callable\n\nfrom pyspark.sql import DataFrame\nfrom pyspark.sql.functions imp"
},
{
"path": "lakehouse_engine/transformers/column_creators.py",
"chars": 3495,
"preview": "\"\"\"Column creators transformers module.\"\"\"\n\nfrom typing import Any, Callable, Dict\n\nfrom pyspark.sql import DataFrame, W"
},
{
"path": "lakehouse_engine/transformers/column_reshapers.py",
"chars": 16343,
"preview": "\"\"\"Module with column reshaping transformers.\"\"\"\n\nfrom collections import OrderedDict\nfrom typing import Any, Callable, "
},
{
"path": "lakehouse_engine/transformers/condensers.py",
"chars": 4987,
"preview": "\"\"\"Condensers module.\"\"\"\n\nfrom typing import Callable, List, Optional\n\nfrom pyspark.sql import DataFrame, Window\nfrom py"
},
{
"path": "lakehouse_engine/transformers/custom_transformers.py",
"chars": 2154,
"preview": "\"\"\"Custom transformers module.\"\"\"\n\nfrom typing import Callable\n\nfrom pyspark.sql import DataFrame\n\n\nclass CustomTransfor"
},
{
"path": "lakehouse_engine/transformers/data_maskers.py",
"chars": 2290,
"preview": "\"\"\"Module with data masking transformers.\"\"\"\n\nfrom typing import Callable, List\n\nfrom pyspark.sql import DataFrame\nfrom "
},
{
"path": "lakehouse_engine/transformers/date_transformers.py",
"chars": 5617,
"preview": "\"\"\"Module containing date transformers.\"\"\"\n\nfrom datetime import datetime\nfrom typing import Callable, List, Optional\n\nf"
},
{
"path": "lakehouse_engine/transformers/exceptions.py",
"chars": 341,
"preview": "\"\"\"Module for all the transformers exceptions.\"\"\"\n\n\nclass WrongArgumentsException(Exception):\n \"\"\"Exception for when "
},
{
"path": "lakehouse_engine/transformers/filters.py",
"chars": 6587,
"preview": "\"\"\"Module containing the filters transformers.\"\"\"\n\nfrom typing import Any, Callable, List, Optional\n\nfrom pyspark.sql im"
},
{
"path": "lakehouse_engine/transformers/joiners.py",
"chars": 3661,
"preview": "\"\"\"Module with join transformers.\"\"\"\n\nimport uuid\nfrom typing import Callable, List, Optional\n\nfrom pyspark.sql import D"
},
{
"path": "lakehouse_engine/transformers/null_handlers.py",
"chars": 1610,
"preview": "\"\"\"Module with null handlers transformers.\"\"\"\n\nfrom typing import Callable, List\n\nfrom pyspark.sql import DataFrame\n\nfro"
},
{
"path": "lakehouse_engine/transformers/optimizers.py",
"chars": 2071,
"preview": "\"\"\"Optimizers module.\"\"\"\n\nfrom typing import Callable\n\nfrom pyspark.sql import DataFrame\nfrom pyspark.storagelevel impor"
},
{
"path": "lakehouse_engine/transformers/regex_transformers.py",
"chars": 1297,
"preview": "\"\"\"Regex transformers module.\"\"\"\n\nfrom typing import Callable\n\nfrom pyspark.sql import DataFrame\nfrom pyspark.sql.functi"
},
{
"path": "lakehouse_engine/transformers/repartitioners.py",
"chars": 1919,
"preview": "\"\"\"Module with repartitioners transformers.\"\"\"\n\nfrom typing import Callable, List, Optional\n\nfrom pyspark.sql import Dat"
},
{
"path": "lakehouse_engine/transformers/transformer_factory.py",
"chars": 7097,
"preview": "\"\"\"Module with the factory pattern to return transformers.\"\"\"\n\nfrom typing import Callable, OrderedDict\n\nfrom lakehouse_"
},
{
"path": "lakehouse_engine/transformers/unions.py",
"chars": 2065,
"preview": "\"\"\"Module with union transformers.\"\"\"\n\nfrom functools import reduce\nfrom typing import Callable, List\n\nfrom pyspark.sql "
},
{
"path": "lakehouse_engine/transformers/watermarker.py",
"chars": 974,
"preview": "\"\"\"Watermarker module.\"\"\"\n\nfrom typing import Callable\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.utils.l"
},
{
"path": "lakehouse_engine/utils/__init__.py",
"chars": 25,
"preview": "\"\"\"Utilities package.\"\"\"\n"
},
{
"path": "lakehouse_engine/utils/acon_utils.py",
"chars": 7450,
"preview": "\"\"\"Module to perform validations and resolve the acon.\"\"\"\n\nfrom lakehouse_engine.core.definitions import (\n FILE_MANA"
},
{
"path": "lakehouse_engine/utils/configs/__init__.py",
"chars": 32,
"preview": "\"\"\"Config utilities package.\"\"\"\n"
},
{
"path": "lakehouse_engine/utils/configs/config_utils.py",
"chars": 4414,
"preview": "\"\"\"Module to read configurations.\"\"\"\n\nfrom importlib.metadata import PackageNotFoundError, version\nfrom typing import An"
},
{
"path": "lakehouse_engine/utils/databricks_utils.py",
"chars": 5498,
"preview": "\"\"\"Utilities for databricks operations.\"\"\"\n\nimport ast\nimport json\nimport os\nimport re\nfrom typing import Any, Tuple\n\nfr"
},
{
"path": "lakehouse_engine/utils/dq_utils.py",
"chars": 10431,
"preview": "\"\"\"Module containing utils for DQ processing.\"\"\"\n\nfrom json import loads\n\nfrom pyspark.sql.functions import col, from_js"
},
{
"path": "lakehouse_engine/utils/engine_usage_stats.py",
"chars": 4447,
"preview": "\"\"\"Utilities for recording the engine activity.\"\"\"\n\nimport json\nfrom datetime import datetime\nfrom urllib.parse import u"
},
{
"path": "lakehouse_engine/utils/expectations_utils.py",
"chars": 3270,
"preview": "\"\"\"Utilities to be used by custom expectations.\"\"\"\n\nfrom typing import Any, Dict\n\n\ndef validate_result(\n expectation_"
},
{
"path": "lakehouse_engine/utils/extraction/__init__.py",
"chars": 36,
"preview": "\"\"\"Extraction utilities package.\"\"\"\n"
},
{
"path": "lakehouse_engine/utils/extraction/jdbc_extraction_utils.py",
"chars": 15404,
"preview": "\"\"\"Utilities module for JDBC extraction processes.\"\"\"\n\nfrom abc import abstractmethod\nfrom dataclasses import dataclass\n"
},
{
"path": "lakehouse_engine/utils/extraction/sap_b4_extraction_utils.py",
"chars": 12164,
"preview": "\"\"\"Utilities module for SAP B4 extraction processes.\"\"\"\n\nimport re\nfrom dataclasses import dataclass\nfrom enum import En"
},
{
"path": "lakehouse_engine/utils/extraction/sap_bw_extraction_utils.py",
"chars": 16082,
"preview": "\"\"\"Utilities module for SAP BW extraction processes.\"\"\"\n\nfrom dataclasses import dataclass\nfrom logging import Logger\nfr"
},
{
"path": "lakehouse_engine/utils/extraction/sftp_extraction_utils.py",
"chars": 18311,
"preview": "\"\"\"Utilities module for SFTP extraction processes.\"\"\"\n\nimport stat\nfrom base64 import decodebytes\nfrom datetime import d"
},
{
"path": "lakehouse_engine/utils/file_utils.py",
"chars": 1115,
"preview": "\"\"\"Utilities for file name based operations.\"\"\"\n\nimport re\nfrom os import listdir\nfrom typing import List\n\n\ndef get_file"
},
{
"path": "lakehouse_engine/utils/gab_utils.py",
"chars": 22918,
"preview": "\"\"\"Module to define GAB Utility classes.\"\"\"\n\nimport ast\nimport calendar\nimport json\nfrom datetime import datetime\nfrom t"
},
{
"path": "lakehouse_engine/utils/logging_handler.py",
"chars": 2872,
"preview": "\"\"\"Module to configure project logging.\"\"\"\n\nimport logging\nimport re\n\nFORMATTER = logging.Formatter(\"%(asctime)s — %(nam"
},
{
"path": "lakehouse_engine/utils/rest_api.py",
"chars": 3781,
"preview": "\"\"\"Module to handle REST API operations.\"\"\"\n\nimport time\nfrom enum import Enum\n\nimport requests\nfrom requests.adapters i"
},
{
"path": "lakehouse_engine/utils/schema_utils.py",
"chars": 6936,
"preview": "\"\"\"Utilities to facilitate dataframe schema management.\"\"\"\n\nfrom logging import Logger\nfrom typing import Any, List, Opt"
},
{
"path": "lakehouse_engine/utils/sharepoint_utils.py",
"chars": 34425,
"preview": "\"\"\"Utilities for sharepoint API operations.\"\"\"\n\nfrom __future__ import annotations\n\nimport os\nimport shutil\nfrom context"
},
{
"path": "lakehouse_engine/utils/spark_utils.py",
"chars": 1565,
"preview": "\"\"\"Utilities to facilitate spark dataframe management.\"\"\"\n\nfrom pyspark.sql import DataFrame\n\nfrom lakehouse_engine.core"
},
{
"path": "lakehouse_engine/utils/sql_parser_utils.py",
"chars": 7296,
"preview": "\"\"\"Module to parse sql files.\"\"\"\n\nfrom lakehouse_engine.core.definitions import SQLParser\n\n\nclass SQLParserUtils(object)"
},
{
"path": "lakehouse_engine/utils/storage/__init__.py",
"chars": 50,
"preview": "\"\"\"Utilities to interact with storage systems.\"\"\"\n"
},
{
"path": "lakehouse_engine/utils/storage/dbfs_storage.py",
"chars": 1526,
"preview": "\"\"\"Module to represent a DBFS file storage system.\"\"\"\n\nfrom typing import Any\nfrom urllib.parse import ParseResult, urlu"
},
{
"path": "lakehouse_engine/utils/storage/file_storage.py",
"chars": 775,
"preview": "\"\"\"Module for abstract representation of a storage system holding files.\"\"\"\n\nfrom abc import ABC, abstractmethod\nfrom ty"
},
{
"path": "lakehouse_engine/utils/storage/file_storage_functions.py",
"chars": 4213,
"preview": "\"\"\"Module for common file storage functions.\"\"\"\n\nimport json\nfrom abc import ABC\nfrom typing import Any\nfrom urllib.pars"
},
{
"path": "lakehouse_engine/utils/storage/local_fs_storage.py",
"chars": 1281,
"preview": "\"\"\"Module to represent a local file storage system.\"\"\"\n\nimport os\nfrom typing import TextIO\nfrom urllib.parse import Par"
},
{
"path": "lakehouse_engine/utils/storage/s3_storage.py",
"chars": 1423,
"preview": "\"\"\"Module to represent a s3 file storage system.\"\"\"\n\nfrom typing import Any\nfrom urllib.parse import ParseResult\n\nimport"
},
{
"path": "lakehouse_engine_usage/__init__.py",
"chars": 427,
"preview": "\"\"\"\n# How to use the Lakehouse Engine?\nLakehouse engine usage examples for all the algorithms and other core functionali"
},
{
"path": "lakehouse_engine_usage/data_loader/__init__.py",
"chars": 35,
"preview": "\"\"\"\n.. include::data_loader.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/append_load_from_jdbc_with_permissive_mode/__init__.py",
"chars": 65,
"preview": "\"\"\"\n.. include::append_load_from_jdbc_with_permissive_mode.md\n\"\"\""
},
{
"path": "lakehouse_engine_usage/data_loader/append_load_from_jdbc_with_permissive_mode/append_load_from_jdbc_with_permissive_mode.md",
"chars": 2825,
"preview": "# Append Load from JDBC with PERMISSIVE mode (default)\n\nThis scenario is an append load from a JDBC source (e.g., SAP BW"
},
{
"path": "lakehouse_engine_usage/data_loader/append_load_with_failfast/__init__.py",
"chars": 48,
"preview": "\"\"\"\n.. include::append_load_with_failfast.md\n\"\"\""
},
{
"path": "lakehouse_engine_usage/data_loader/append_load_with_failfast/append_load_with_failfast.md",
"chars": 2842,
"preview": "# Append Load with FAILFAST\n\nThis scenario is an append load enforcing the schema (using the schema of the target table "
},
{
"path": "lakehouse_engine_usage/data_loader/batch_delta_load_init_delta_backfill_with_merge/__init__.py",
"chars": 70,
"preview": "\"\"\"\n.. include::batch_delta_load_init_delta_backfill_with_merge.md\n\"\"\""
},
{
"path": "lakehouse_engine_usage/data_loader/batch_delta_load_init_delta_backfill_with_merge/batch_delta_load_init_delta_backfill_with_merge.md",
"chars": 11536,
"preview": "# Batch Delta Load Init, Delta and Backfill with Merge\n\nThis scenario illustrates the process of implementing a delta lo"
},
{
"path": "lakehouse_engine_usage/data_loader/custom_transformer/__init__.py",
"chars": 42,
"preview": "\"\"\"\n.. include::custom_transformer.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/custom_transformer/custom_transformer.md",
"chars": 8817,
"preview": "# Custom Transformer\n\nThere may appear a scenario where the data product dev team faces the need to perform complex data"
},
{
"path": "lakehouse_engine_usage/data_loader/custom_transformer/sql_custom_transformer.md",
"chars": 1860,
"preview": "# SQL Custom Transformer\nThe SQL Custom Transformer executes a SQL transformation provided by the user.This transformer "
},
{
"path": "lakehouse_engine_usage/data_loader/custom_transformer_sql/__init__.py",
"chars": 46,
"preview": "\"\"\"\n.. include::custom_transformer_sql.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/custom_transformer_sql/custom_transformer_sql.md",
"chars": 1860,
"preview": "# SQL Custom Transformer\nThe SQL Custom Transformer executes a SQL transformation provided by the user.This transformer "
},
{
"path": "lakehouse_engine_usage/data_loader/data_loader.md",
"chars": 16692,
"preview": "# Data Loader\n\n## How to configure a DataLoader algorithm in the lakehouse-engine by using an ACON file?\n\nAn algorithm ("
},
{
"path": "lakehouse_engine_usage/data_loader/extract_from_sap_b4_adso/__init__.py",
"chars": 48,
"preview": "\"\"\"\n.. include::extract_from_sap_b4_adso.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/extract_from_sap_b4_adso/extract_from_sap_b4_adso.md",
"chars": 25253,
"preview": "# Extract from SAP B4 ADSOs\n\nA custom sap_b4 reader and a few utils are offered in the lakehouse-engine framework so tha"
},
{
"path": "lakehouse_engine_usage/data_loader/extract_from_sap_bw_dso/__init__.py",
"chars": 46,
"preview": "\"\"\"\n.. include::extract_from_sap_bw_dso.md\n\"\"\""
},
{
"path": "lakehouse_engine_usage/data_loader/extract_from_sap_bw_dso/extract_from_sap_bw_dso.md",
"chars": 31803,
"preview": "# Extract from SAP BW DSOs\n\n!!! danger \"**Parallelization Limitations**\"\n Parallel extractions **can bring a jdbc sou"
},
{
"path": "lakehouse_engine_usage/data_loader/extract_from_sftp/__init__.py",
"chars": 41,
"preview": "\"\"\"\n.. include::extract_from_sftp.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/extract_from_sftp/extract_from_sftp.md",
"chars": 11124,
"preview": "# Extract from SFTP\n\nSecure File Transfer Protocol (SFTP) is a file protocol for transferring files over the web.\n\nThis "
},
{
"path": "lakehouse_engine_usage/data_loader/extract_using_jdbc_connection/__init__.py",
"chars": 53,
"preview": "\"\"\"\n.. include::extract_using_jdbc_connection.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/extract_using_jdbc_connection/extract_using_jdbc_connection.md",
"chars": 26621,
"preview": "# Extract using JDBC connection\n\n!!! danger \"**SAP Extraction**\"\n\n SAP is only used as an example to demonstrate how "
},
{
"path": "lakehouse_engine_usage/data_loader/filtered_full_load/__init__.py",
"chars": 42,
"preview": "\"\"\"\n.. include::filtered_full_load.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/filtered_full_load/filtered_full_load.md",
"chars": 662,
"preview": "# Filtered Full Load\n\nThis scenario is very similar to the [full load](../full_load/full_load.md), but it filters the da"
},
{
"path": "lakehouse_engine_usage/data_loader/filtered_full_load_with_selective_replace/__init__.py",
"chars": 64,
"preview": "\"\"\"\n.. include::filtered_full_load_with_selective_replace.md\n\"\"\""
},
{
"path": "lakehouse_engine_usage/data_loader/filtered_full_load_with_selective_replace/filtered_full_load_with_selective_replace.md",
"chars": 937,
"preview": "# Filtered Full Load with Selective Replace\n\nThis scenario is very similar to the [Filtered Full Load](../filtered_full_"
},
{
"path": "lakehouse_engine_usage/data_loader/flatten_schema_and_explode_columns/__init__.py",
"chars": 58,
"preview": "\"\"\"\n.. include::flatten_schema_and_explode_columns.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/flatten_schema_and_explode_columns/flatten_schema_and_explode_columns.md",
"chars": 3756,
"preview": "# Flatten Schema and Explode Columns\n\nRelated with schema, we can make two kind of operations:\n\n* **Flatten Schema**: tr"
},
{
"path": "lakehouse_engine_usage/data_loader/full_load/__init__.py",
"chars": 33,
"preview": "\"\"\"\n.. include::full_load.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/full_load/full_load.md",
"chars": 1007,
"preview": "# Full Load\n\nThis scenario reads CSV data from a path and writes in full to another path with delta lake files.\n\n##### R"
},
{
"path": "lakehouse_engine_usage/data_loader/read_from_dataframe/__init__.py",
"chars": 43,
"preview": "\"\"\"\n.. include::read_from_dataframe.md\n\"\"\"\n"
},
{
"path": "lakehouse_engine_usage/data_loader/read_from_dataframe/read_from_dataframe.md",
"chars": 853,
"preview": "# Read from Dataframe\n\n!!! danger\n Don't use this feature if the Lakehouse Engine already has a supported data format"
},
{
"path": "lakehouse_engine_usage/data_loader/read_from_sharepoint/__init__.py",
"chars": 44,
"preview": "\"\"\"\n.. include::read_from_sharepoint.md\n\"\"\"\n"
}
]
// ... and 983 more files (download for full content)
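The index above is a plain JSON array of objects with "path", "chars", and "preview" fields. As a minimal sketch (not part of the lakehouse-engine repository), the following Python snippet shows how such an index could be aggregated by top-level folder; it assumes the entries have been saved as a valid JSON array (without the trailing "// ..." truncation comment) in a hypothetical file named "lakehouse_engine_files.json".

# Minimal sketch: aggregate the extracted file index by top-level folder.
# The index file name below is an assumption, not something shipped with the repo.
import json
from collections import Counter
from pathlib import Path

def chars_per_top_level(index_path: str) -> Counter:
    """Sum the 'chars' field of every entry, grouped by its top-level folder."""
    entries = json.loads(Path(index_path).read_text(encoding="utf-8"))
    totals: Counter = Counter()
    for entry in entries:
        top_level = entry["path"].split("/", 1)[0]
        totals[top_level] += entry["chars"]
    return totals

if __name__ == "__main__":
    for folder, total in chars_per_top_level("lakehouse_engine_files.json").most_common():
        print(f"{folder}: {total} chars")

Run against the full index, this would show which packages (for example lakehouse_engine/ versus lakehouse_engine_usage/) account for most of the extracted text.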
About this extraction
This page contains the full source code of the adidas/lakehouse-engine GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 1183 files (3.2 MB), approximately 916.5k tokens, and a symbol index with 1206 extracted functions, classes, methods, constants, and types.