class EngineConfig(object):
    """Definitions that can come from the Engine Config file.

    - dq_bucket: S3 prod bucket used to store data quality related artifacts.
    - dq_dev_bucket: S3 dev bucket used to store data quality related artifacts.
    - notif_disallowed_email_servers: email servers not allowed to be used
        for sending notifications.
    - engine_usage_path: path where the engine prod usage stats are stored.
    - engine_dev_usage_path: path where the engine dev usage stats are stored.
    - collect_engine_usage: whether to enable the collection of lakehouse
        engine usage stats or not.
    - dq_functions_column_list: list of columns to be added to the meta argument
        of GX when using PRISMA.
    """

    dq_bucket: Optional[str] = None
    dq_dev_bucket: Optional[str] = None
    notif_disallowed_email_servers: Optional[list] = None
    engine_usage_path: Optional[str] = None
    engine_dev_usage_path: Optional[str] = None
    collect_engine_usage: str = CollectEngineUsage.ENABLED.value
    dq_functions_column_list: Optional[list] = None


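# Illustrative sketch (not part of the original module): EngineConfig values are plain
# class attributes, so a deployment-specific configuration could be pictured as simply
# overriding them. The bucket names below are made-up placeholders.
#
#     EngineConfig.dq_bucket = "s3://my-prod-dq-bucket"
#     EngineConfig.dq_dev_bucket = "s3://my-dev-dq-bucket"
#     EngineConfig.collect_engine_usage = CollectEngineUsage.ENABLED.value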
class EngineStats(Enum):
    """Definitions for collection of Lakehouse Engine Stats.

    .. note::
        Note: whenever the value comes from a key inside a Spark Config
        that returns an array, it can be specified with a '#' so that it
        is adequately processed.
    """

    CLUSTER_USAGE_TAGS = "spark.databricks.clusterUsageTags"
    DEF_SPARK_CONFS = {
        "dp_name": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#accountName",
        "environment": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#environment",
        "workspace_id": f"{CLUSTER_USAGE_TAGS}.orgId",
        "job_id": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#JobId",
        "job_name": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#RunName",
        "run_id": f"{CLUSTER_USAGE_TAGS}.clusterAllTags#ClusterName",
    }


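# Hedged sketch (an assumption about intent, not the engine's actual implementation):
# the '#' convention from the note above separates the Spark conf key from the entry
# to pick out of the array-like value that key returns, e.g.:
#
#     conf_key, _, tag_name = EngineStats.DEF_SPARK_CONFS.value["dp_name"].partition("#")
#     # conf_key -> "spark.databricks.clusterUsageTags.clusterAllTags"
#     # tag_name -> "accountName"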
class InputFormat(Enum):
    """Formats of algorithm input."""

    JDBC = "jdbc"
    AVRO = "avro"
    JSON = "json"
    CSV = "csv"
    PARQUET = "parquet"
    DELTAFILES = "delta"
    CLOUDFILES = "cloudfiles"
    KAFKA = "kafka"
    SQL = "sql"
    SAP_BW = "sap_bw"
    SAP_B4 = "sap_b4"
    DATAFRAME = "dataframe"
    SFTP = "sftp"

    @classmethod
    def values(cls):  # type: ignore
        """Generates a list containing all enum values.

        Return:
            A list with all enum values.
        """
        return (c.value for c in cls)

    @classmethod
    def exists(cls, input_format: str) -> bool:
        """Checks if the input format exists in the enum values.

        Args:
            input_format: format to check if exists.

        Return:
            If the input format exists in our enum.
        """
        return input_format in cls.values()


# Formats of input that are considered files.
FILE_INPUT_FORMATS = [
    InputFormat.AVRO.value,
    InputFormat.JSON.value,
    InputFormat.PARQUET.value,
    InputFormat.CSV.value,
    InputFormat.DELTAFILES.value,
    InputFormat.CLOUDFILES.value,
]


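# Usage sketch (illustrative only): the helpers above can be used to validate a
# user-provided format string and to decide whether it is a file-based source.
#
#     InputFormat.exists("csv")                      # True
#     InputFormat.exists("xml")                      # False
#     InputFormat.CSV.value in FILE_INPUT_FORMATS    # True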
class OutputFormat(Enum):
    """Formats of algorithm output."""

    JDBC = "jdbc"
    AVRO = "avro"
    JSON = "json"
    CSV = "csv"
    PARQUET = "parquet"
    DELTAFILES = "delta"
    KAFKA = "kafka"
    CONSOLE = "console"
    NOOP = "noop"
    DATAFRAME = "dataframe"
    REST_API = "rest_api"
    FILE = "file"  # Internal use only
    TABLE = "table"  # Internal use only

    @classmethod
    def values(cls):  # type: ignore
        """Generates a list containing all enum values.

        Return:
            A list with all enum values.
        """
        return (c.value for c in cls)

    @classmethod
    def exists(cls, output_format: str) -> bool:
        """Checks if the output format exists in the enum values.

        Args:
            output_format: format to check if exists.

        Return:
            If the output format exists in our enum.
        """
        return output_format in cls.values()


# Formats of output that are considered files.
FILE_OUTPUT_FORMATS = [
    OutputFormat.AVRO.value,
    OutputFormat.JSON.value,
    OutputFormat.PARQUET.value,
    OutputFormat.CSV.value,
    OutputFormat.DELTAFILES.value,
]


class NotifierType(Enum):
    """Type of notifier available."""

    EMAIL = "email"


class NotificationRuntimeParameters(Enum):
    """Parameters to be replaced in runtime."""

    DATABRICKS_JOB_NAME = "databricks_job_name"
    DATABRICKS_WORKSPACE_ID = "databricks_workspace_id"


NOTIFICATION_RUNTIME_PARAMETERS = [
    NotificationRuntimeParameters.DATABRICKS_JOB_NAME.value,
    NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID.value,
]


class ReadType(Enum):
    """Define the types of read operations.

    - BATCH - read the data in batch mode (e.g., Spark batch).
    - STREAMING - read the data in streaming mode (e.g., Spark streaming).
    """

    BATCH = "batch"
    STREAMING = "streaming"


class ReadMode(Enum):
    """Different modes that control how we handle compliance to the provided schema.

    These read modes map to Spark's read modes at the moment.
    """

    PERMISSIVE = "PERMISSIVE"
    FAILFAST = "FAILFAST"
    DROPMALFORMED = "DROPMALFORMED"


class DQDefaults(Enum):
    """Defaults used on the data quality process."""

    FILE_SYSTEM_STORE = "file_system"
    FILE_SYSTEM_S3_STORE = "s3"
    DQ_BATCH_IDENTIFIERS = ["spec_id", "input_id", "timestamp"]
    DATASOURCE_CLASS_NAME = "Datasource"
    DATASOURCE_EXECUTION_ENGINE = "SparkDFExecutionEngine"
    DATA_CONNECTORS_CLASS_NAME = "RuntimeDataConnector"
    DATA_CONNECTORS_MODULE_NAME = "great_expectations.datasource.data_connector"
    DATA_CHECKPOINTS_CLASS_NAME = "SimpleCheckpoint"
    DATA_CHECKPOINTS_CONFIG_VERSION = 1.0
    STORE_BACKEND = "s3"
    EXPECTATIONS_STORE_PREFIX = "dq/expectations/"
    VALIDATIONS_STORE_PREFIX = "dq/validations/"
    DATA_DOCS_PREFIX = "dq/data_docs/site/"
    CHECKPOINT_STORE_PREFIX = "dq/checkpoints/"
    VALIDATION_COLUMN_IDENTIFIER = "validationresultidentifier"
    CUSTOM_EXPECTATION_LIST = [
        "expect_column_values_to_be_date_not_older_than",
        "expect_column_pair_a_to_be_smaller_or_equal_than_b",
        "expect_multicolumn_column_a_must_equal_b_or_c",
        "expect_queried_column_agg_value_to_be",
        "expect_column_pair_date_a_to_be_greater_than_or_equal_to_date_b",
        "expect_column_pair_a_to_be_not_equal_to_b",
    ]
    DQ_VALIDATIONS_SCHEMA = StructType(
        [
            StructField(
                "dq_validations",
                StructType(
                    [
                        StructField("run_name", StringType()),
                        StructField("run_success", BooleanType()),
                        StructField("raised_exceptions", BooleanType()),
                        StructField("run_row_success", BooleanType()),
                        StructField(
                            "dq_failure_details",
                            ArrayType(
                                StructType(
                                    [
                                        StructField("expectation_type", StringType()),
                                        StructField("kwargs", StringType()),
                                    ]
                                ),
                            ),
                        ),
                    ]
                ),
            )
        ]
    )


class WriteType(Enum):
    """Types of write operations."""

    OVERWRITE = "overwrite"
    COMPLETE = "complete"
    APPEND = "append"
    UPDATE = "update"
    MERGE = "merge"
    ERROR_IF_EXISTS = "error"
    IGNORE_IF_EXISTS = "ignore"


@dataclass
class InputSpec(object):
    """Specification of an algorithm input.

    This is very aligned with the way the execution environment connects to the sources
    (e.g., spark sources).

    - spec_id: spec_id of the input specification.
    - read_type: ReadType type of read operation.
    - data_format: format of the input.
    - sftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp
        directory.
    - df_name: dataframe name.
    - db_table: table name in the form of `<db>.<table>`.
    - location: uri that identifies from where to read data in the specified format.
    - enforce_schema_from_table: if we want to enforce the table schema or not, by
        providing a table name in the form of `<db>.<table>`.
    - query: sql query to execute and return the dataframe. Use it if you do not want to
        read from a file system nor from a table, but rather from a sql query instead.
    - schema: dict representation of a schema of the input (e.g., Spark struct type
        schema).
    - schema_path: path to a file with a representation of a schema of the input (e.g.,
        Spark struct type schema).
    - disable_dbfs_retry: optional flag to disable file storage dbfs.
    - with_filepath: if we want to include the path of the file that is being read. Only
        works with the file reader (batch and streaming modes are supported).
    - options: dict with other relevant options according to the execution
        environment (e.g., spark) possible sources.
    - calculate_upper_bound: when to calculate upper bound to extract from SAP BW
        or not.
    - calc_upper_bound_schema: specific schema for the calculated upper_bound.
    - generate_predicates: when to generate predicates to extract from SAP BW or not.
    - predicates_add_null: if we want to include is null on partition by predicates.
    - temp_view: optional name of a view to point to the input dataframe to be used
        to create or replace a temp view on top of the dataframe.
    """

    spec_id: str
    read_type: str
    data_format: Optional[str] = None
    sftp_files_format: Optional[str] = None
    df_name: Optional[DataFrame] = None
    db_table: Optional[str] = None
    location: Optional[str] = None
    query: Optional[str] = None
    enforce_schema_from_table: Optional[str] = None
    schema: Optional[dict] = None
    schema_path: Optional[str] = None
    disable_dbfs_retry: bool = False
    with_filepath: bool = False
    options: Optional[dict] = None
    jdbc_args: Optional[dict] = None
    calculate_upper_bound: bool = False
    calc_upper_bound_schema: Optional[str] = None
    generate_predicates: bool = False
    predicates_add_null: bool = True
    temp_view: Optional[str] = None


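# Illustrative sketch (not part of the original module): a minimal batch input spec
# reading delta files from a made-up location.
#
#     InputSpec(
#         spec_id="sales_bronze",
#         read_type=ReadType.BATCH.value,
#         data_format=InputFormat.DELTAFILES.value,
#         location="s3://my-bucket/bronze/sales/",
#     )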
@dataclass
class TransformerSpec(object):
    """Transformer Specification, i.e., a single transformation amongst many.

    - function: name of the function (or callable function) to be executed.
    - args: (not applicable if using a callable function) dict with the arguments
        to pass to the function `<k,v>` pairs with the name of the parameter of
        the function and the respective value.
    """

    function: str
    args: dict


@dataclass
class TransformSpec(object):
    """Transformation Specification.

    I.e., the specification that defines the many transformations to be done to the data
    that was read.

    - spec_id: id of the transform specification.
    - input_id: id of the corresponding input specification.
    - transformers: list of transformers to execute.
    - force_streaming_foreach_batch_processing: sometimes, when using streaming, we want
        to force the transform to be executed in the foreachBatch function to ensure
        non-supported streaming operations can be properly executed.
    """

    spec_id: str
    input_id: str
    transformers: List[TransformerSpec]
    force_streaming_foreach_batch_processing: bool = False


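# Illustrative sketch (not part of the original module): a transform spec chaining a
# single transformer. "my_transformer" and its args are hypothetical names, purely to
# show the <k,v> argument convention described above.
#
#     TransformSpec(
#         spec_id="transformed_sales",
#         input_id="sales_bronze",
#         transformers=[TransformerSpec(function="my_transformer", args={"cols": ["id"]})],
#     )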
class DQType(Enum):
    """Available data quality tasks."""

    VALIDATOR = "validator"
    PRISMA = "prisma"


class DQExecutionPoint(Enum):
    """Available data quality execution points."""

    IN_MOTION = "in_motion"
    AT_REST = "at_rest"


class DQTableBaseParameters(Enum):
    """Base parameters for importing DQ rules from a table."""

    PRISMA_BASE_PARAMETERS = ["arguments", "dq_tech_function"]


@dataclass
class DQFunctionSpec(object):
    """Defines a data quality function specification.

    - function - name of the data quality function (expectation) to execute.
        It follows the great_expectations api https://greatexpectations.io/expectations/.
    - args - args of the function (expectation). Follow the same api as above.
    """

    function: str
    args: Optional[dict] = None


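# Usage sketch (illustrative only): a DQ function spec mirrors a great_expectations
# expectation name plus its keyword arguments, here the standard
# expect_column_values_to_be_not_null expectation applied to a hypothetical column.
#
#     DQFunctionSpec(
#         function="expect_column_values_to_be_not_null",
#         args={"column": "customer_id"},
#     )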
@dataclass
class DQSpec(object):
    """Data quality overall specification.

    - spec_id - id of the specification.
    - input_id - id of the input specification.
    - dq_type - type of DQ process to execute (e.g. validator).
    - dq_functions - list of function specifications to execute.
    - dq_db_table - name of table to derive the dq functions from.
    - dq_table_table_filter - name of the table which rules are to be applied in the
        validations (Only used when deriving dq functions).
    - dq_table_extra_filters - extra filters to be used when deriving dq functions.
        This is a sql expression to be applied to the dq_db_table.
    - execution_point - execution point of the dq functions. [at_rest, in_motion].
        This is set during the load_data or dq_validator functions.
    - unexpected_rows_pk - the list of columns composing the primary key of the
        source data to identify the rows failing the DQ validations. Note: only one
        of tbl_to_derive_pk or unexpected_rows_pk arguments need to be provided. It
        is mandatory to provide one of these arguments when using tag_source_data
        as True. When tag_source_data is False, this is not mandatory, but still
        recommended.
    - tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.
        Note: only one of tbl_to_derive_pk or unexpected_rows_pk arguments need to
        be provided. It is mandatory to provide one of these arguments when using
        tag_source_data as True. When tag_source_data is False, this is not
        mandatory, but still recommended.
    - gx_result_format - great expectations result format. Default: "COMPLETE".
    - tag_source_data - when set to true, this will ensure that the DQ process ends by
        tagging the source data with an additional column with information about the
        DQ results. This column makes it possible to identify if the DQ run
        succeeded in general and, if not, it unlocks the insights to know what
        specific rows have made the DQ validations fail and why. Default: False.
        Note: it only works if result_sink_explode is True, gx_result_format is
        COMPLETE, fail_on_error is False (which is done automatically when
        you specify tag_source_data as True) and tbl_to_derive_pk or
        unexpected_rows_pk is configured.
    - store_backend - which store_backend to use (e.g. s3 or file_system).
    - local_fs_root_dir - path of the root directory. Note: only applicable for
        store_backend file_system.
    - data_docs_local_fs - the path for data docs only for store_backend
        file_system.
    - bucket - the bucket name to consider for the store_backend (store DQ artefacts).
        Note: only applicable for store_backend s3.
    - data_docs_bucket - the bucket name for data docs only. When defined, it will
        supersede bucket parameter. Note: only applicable for store_backend s3.
    - expectations_store_prefix - prefix where to store expectations' data. Note: only
        applicable for store_backend s3.
    - validations_store_prefix - prefix where to store validations' data. Note: only
        applicable for store_backend s3.
    - data_docs_prefix - prefix where to store data_docs' data.
    - checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only
        applicable for store_backend s3.
    - data_asset_name - name of the data asset to consider when configuring the great
        expectations' data source.
    - expectation_suite_name - name to consider for great expectations' suite.
    - result_sink_db_table - db.table_name indicating the database and table in which
        to save the results of the DQ process.
    - result_sink_location - file system location in which to save the results of the
        DQ process.
    - data_product_name - name of the data product.
    - result_sink_partitions - the list of partitions to consider.
    - result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
    - result_sink_options - extra spark options for configuring the result sink.
        E.g: can be used to configure a Kafka sink if result_sink_format is kafka.
    - result_sink_explode - flag to determine if the output table/location should have
        the columns exploded (as True) or not (as False). Default: True.
    - result_sink_extra_columns - list of extra columns to be exploded (following
        the pattern "<name>.*") or columns to be selected. It is only used when
        result_sink_explode is set to True.
    - source - name of data source, to be easier to identify in analysis. If not
        specified, it is set as default <input_id>. This will be only used
        when result_sink_explode is set to True.
    - fail_on_error - whether to fail the algorithm if the validations of your data in
        the DQ process failed.
    - cache_df - whether to cache the dataframe before running the DQ process or not.
    - critical_functions - functions that should not fail. When this argument is
        defined, fail_on_error is nullified.
    - max_percentage_failure - percentage of failure that should be allowed.
        This argument has priority over both fail_on_error and critical_functions.
    """

    spec_id: str
    input_id: str
    dq_type: str
    dq_functions: Optional[List[DQFunctionSpec]] = None
    dq_db_table: Optional[str] = None
    dq_table_table_filter: Optional[str] = None
    dq_table_extra_filters: Optional[str] = None
    execution_point: Optional[str] = None
    unexpected_rows_pk: Optional[List[str]] = None
    tbl_to_derive_pk: Optional[str] = None
    gx_result_format: Optional[str] = "COMPLETE"
    tag_source_data: Optional[bool] = False
    store_backend: str = DQDefaults.STORE_BACKEND.value
    local_fs_root_dir: Optional[str] = None
    data_docs_local_fs: Optional[str] = None
    bucket: Optional[str] = None
    data_docs_bucket: Optional[str] = None
    expectations_store_prefix: str = DQDefaults.EXPECTATIONS_STORE_PREFIX.value
    validations_store_prefix: str = DQDefaults.VALIDATIONS_STORE_PREFIX.value
    data_docs_prefix: str = DQDefaults.DATA_DOCS_PREFIX.value
    checkpoint_store_prefix: str = DQDefaults.CHECKPOINT_STORE_PREFIX.value
    data_asset_name: Optional[str] = None
    expectation_suite_name: Optional[str] = None
    result_sink_db_table: Optional[str] = None
    result_sink_location: Optional[str] = None
    data_product_name: Optional[str] = None
    result_sink_partitions: Optional[List[str]] = None
    result_sink_format: str = OutputFormat.DELTAFILES.value
    result_sink_options: Optional[dict] = None
    result_sink_explode: bool = True
    result_sink_extra_columns: Optional[List[str]] = None
    source: Optional[str] = None
    fail_on_error: bool = True
    cache_df: bool = False
    critical_functions: Optional[List[DQFunctionSpec]] = None
    max_percentage_failure: Optional[float] = None


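# Illustrative sketch (not part of the original module): a minimal validator DQ spec
# attaching two expectations to an input; ids, columns and the sink table are placeholders.
#
#     DQSpec(
#         spec_id="dq_sales",
#         input_id="sales_bronze",
#         dq_type=DQType.VALIDATOR.value,
#         dq_functions=[
#             DQFunctionSpec("expect_column_values_to_be_not_null", {"column": "id"}),
#             DQFunctionSpec("expect_table_row_count_to_be_between", {"min_value": 1}),
#         ],
#         result_sink_db_table="my_db.dq_results",
#     )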
@dataclass
class MergeOptions(object):
    """Options for a merge operation.

    - merge_predicate: predicate to apply to the merge operation so that we can
        check if a new record corresponds to a record already included in the
        historical data.
    - insert_only: indicates if the merge should only insert data (e.g., deduplicate
        scenarios).
    - delete_predicate: predicate to apply to the delete operation.
    - update_predicate: predicate to apply to the update operation.
    - insert_predicate: predicate to apply to the insert operation.
    - update_column_set: rules to apply to the update operation which allows to
        set the value for each column to be updated.
        (e.g. {"data": "new.data", "count": "current.count + 1"} )
    - insert_column_set: rules to apply to the insert operation which allows to
        set the value for each column to be inserted.
        (e.g. {"date": "updates.date", "count": "1"} )
    """

    merge_predicate: str
    insert_only: bool = False
    delete_predicate: Optional[str] = None
    update_predicate: Optional[str] = None
    insert_predicate: Optional[str] = None
    update_column_set: Optional[dict] = None
    insert_column_set: Optional[dict] = None


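# Illustrative sketch (not part of the original module): merge options for an upsert
# keyed on a hypothetical "id" column, reusing the column-set convention from the
# docstring examples above.
#
#     MergeOptions(
#         merge_predicate="current.id = new.id",
#         update_column_set={"data": "new.data", "count": "current.count + 1"},
#     )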
@dataclass
class OutputSpec(object):
    """Specification of an algorithm output.

    This is very aligned with the way the execution environment connects to the output
    systems (e.g., spark outputs).

    - spec_id: id of the output specification.
    - input_id: id of the corresponding input specification.
    - write_type: type of write operation.
    - data_format: format of the output. Defaults to DELTA.
    - db_table: table name in the form of `<db>.<table>`.
    - location: uri that identifies where to write data in the specified format.
    - partitions: list of partition input_col names.
    - merge_opts: options to apply to the merge operation.
    - streaming_micro_batch_transformers: transformers to invoke for each streaming
        micro batch, before writing (i.e., in Spark's foreachBatch structured
        streaming function). Note: the lakehouse engine manages this for you, so
        we don't advise you to manually specify transformations through this
        parameter. Supply them as regular transformers in the transform_specs
        sections of an ACON.
    - streaming_once: if the streaming query is to be executed just once, or not,
        generating just one micro batch.
    - streaming_processing_time: if streaming query is to be kept alive, this indicates
        the processing time of each micro batch.
    - streaming_available_now: if set to True, set a trigger that processes all
        available data in multiple batches then terminates the query.
        When using streaming, this is the default trigger that the lakehouse-engine will
        use, unless you configure a different one.
    - streaming_continuous: set a trigger that runs a continuous query with a given
        checkpoint interval.
    - streaming_await_termination: whether to wait (True) for the termination of the
        streaming query (e.g. timeout or exception) or not (False). Default: True.
    - streaming_await_termination_timeout: a timeout to set to the
        streaming_await_termination. Default: None.
    - with_batch_id: whether to include the streaming batch id in the final data,
        or not. It only takes effect in streaming mode.
    - options: dict with other relevant options according to the execution environment
        (e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for
        streaming, etc.
    - streaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers
        but for the DQ functions to be executed. Used internally by the lakehouse
        engine, so you don't have to supply DQ functions through this parameter. Use the
        dq_specs of the acon instead.
    """

    spec_id: str
    input_id: str
    write_type: str
    data_format: str = OutputFormat.DELTAFILES.value
    db_table: Optional[str] = None
    location: Optional[str] = None
    merge_opts: Optional[MergeOptions] = None
    partitions: Optional[List[str]] = None
    streaming_micro_batch_transformers: Optional[List[TransformerSpec]] = None
    streaming_once: Optional[bool] = None
    streaming_processing_time: Optional[str] = None
    streaming_available_now: bool = True
    streaming_continuous: Optional[str] = None
    streaming_await_termination: bool = True
    streaming_await_termination_timeout: Optional[int] = None
    with_batch_id: bool = False
    options: Optional[dict] = None
    streaming_micro_batch_dq_processors: Optional[List[DQSpec]] = None


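# Illustrative sketch (not part of the original module): appending the transformed
# dataframe to a delta location; the path is a placeholder.
#
#     OutputSpec(
#         spec_id="sales_silver",
#         input_id="transformed_sales",
#         write_type=WriteType.APPEND.value,
#         data_format=OutputFormat.DELTAFILES.value,
#         location="s3://my-bucket/silver/sales/",
#     )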
@dataclass
class TerminatorSpec(object):
    """Terminator Specification.

    I.e., the specification that defines a terminator operation to be executed. Examples
    are compute statistics, vacuum, optimize, etc.

    - function: terminator function to execute.
    - args: arguments of the terminator function.
    - input_id: id of the corresponding output specification (Optional).
    """

    function: str
    args: Optional[dict] = None
    input_id: Optional[str] = None


@dataclass
class ReconciliatorSpec(object):
    """Reconciliator Specification.

    - metrics: list of metrics in the form of:
        [{
            metric: name of the column present in both truth and current datasets,
            aggregation: sum, avg, max, min, ...,
            type: percentage or absolute,
            yellow: value,
            red: value
        }].
    - recon_type: reconciliation type (percentage or absolute). Percentage calculates
        the difference between truth and current results as a percentage ((x - y) / x),
        and absolute calculates the raw difference (x - y).
    - truth_input_spec: input specification of the truth data.
    - current_input_spec: input specification of the current results data.
    - truth_preprocess_query: additional query on top of the truth input data to
        preprocess the truth data before it gets fueled into the reconciliation process.
        Important note: you need to assume that the data out of
        the truth_input_spec is referencable by a table called 'truth'.
    - truth_preprocess_query_args: optional dict having the functions/transformations to
        apply on top of the truth_preprocess_query and respective arguments. Note: cache
        is being applied on the Dataframe, by default. For turning the default behavior
        off, pass `"truth_preprocess_query_args": []`.
    - current_preprocess_query: additional query on top of the current results input
        data to preprocess the current results data before it gets fueled into the
        reconciliation process. Important note: you need to assume that the data out of
        the current_results_input_spec is referencable by a table called 'current'.
    - current_preprocess_query_args: optional dict having the
        functions/transformations to apply on top of the current_preprocess_query
        and respective arguments. Note: cache is being applied on the Dataframe,
        by default. For turning the default behavior off, pass
        `"current_preprocess_query_args": []`.
    - ignore_empty_df: optional boolean, to ignore the recon process if source & target
        dataframes are empty; recon will exit with success code (passed).
    """

    metrics: List[dict]
    truth_input_spec: InputSpec
    current_input_spec: InputSpec
    truth_preprocess_query: Optional[str] = None
    truth_preprocess_query_args: Optional[List[dict]] = None
    current_preprocess_query: Optional[str] = None
    current_preprocess_query_args: Optional[List[dict]] = None
    ignore_empty_df: Optional[bool] = False


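# Illustrative sketch (not part of the original module): a single metric entry following
# the structure documented above; the column name and thresholds are placeholders.
#
#     metrics = [
#         {
#             "metric": "net_sales",
#             "aggregation": "sum",
#             "type": "percentage",
#             "yellow": 0.05,
#             "red": 0.1,
#         }
#     ]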
@dataclass
class DQValidatorSpec(object):
    """Data Quality Validator Specification.

    - input_spec: input specification of the data to be checked/validated.
    - dq_spec: data quality specification.
    - restore_prev_version: specify if, having
        delta table/files as input, they should be restored to the
        previous version if the data quality process fails. Note: this
        is only considered if fail_on_error is kept as True.
    """

    input_spec: InputSpec
    dq_spec: DQSpec
    restore_prev_version: Optional[bool] = False


class SQLDefinitions(Enum):
    """SQL definitions statements."""

    compute_table_stats = "ANALYZE TABLE {} COMPUTE STATISTICS"
    drop_table_stmt = "DROP TABLE IF EXISTS"
    drop_view_stmt = "DROP VIEW IF EXISTS"
    truncate_stmt = "TRUNCATE TABLE"
    describe_stmt = "DESCRIBE TABLE"
    optimize_stmt = "OPTIMIZE"
    show_tbl_props_stmt = "SHOW TBLPROPERTIES"
    delete_where_stmt = "DELETE FROM {} WHERE {}"


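# Usage sketch (illustrative only): the templated statements above can be filled with
# str.format, e.g. for a hypothetical table name:
#
#     SQLDefinitions.compute_table_stats.value.format("my_db.sales")
#     # -> "ANALYZE TABLE my_db.sales COMPUTE STATISTICS"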
class FileManagerAPIKeys(Enum):
    """File Manager s3 api keys."""

    CONTENTS = "Contents"
    KEY = "Key"
    CONTINUATION = "NextContinuationToken"
    BUCKET = "Bucket"
    OBJECTS = "Objects"


@dataclass
class SensorSpec(object):
    """Sensor Specification.

    - sensor_id: sensor id.
    - assets: a list of assets that are considered as available to
        consume downstream after this sensor has status
        PROCESSED_NEW_DATA.
    - control_db_table_name: db.table to store sensor metadata.
    - input_spec: input specification of the source to be checked for new data.
    - preprocess_query: SQL query to transform/filter the result from the
        upstream. Consider that we should refer to 'new_data' whenever
        we are referring to the input of the sensor. E.g.:
        "SELECT dummy_col FROM new_data WHERE ..."
    - checkpoint_location: optional location to store checkpoints to resume
        from. These checkpoints use the same strategy as Spark checkpoints.
        For Spark readers that do not support checkpoints, use the
        preprocess_query parameter to form a SQL query to filter the result
        from the upstream accordingly.
    - fail_on_empty_result: if the sensor should throw an error if there is no new
        data in the upstream. Default: True.
    """

    sensor_id: str
    assets: List[str]
    control_db_table_name: str
    input_spec: InputSpec
    preprocess_query: Optional[str]
    checkpoint_location: Optional[str]
    fail_on_empty_result: bool = True

    @classmethod
    def create_from_acon(cls, acon: dict):  # type: ignore
        """Create SensorSpec from acon.

        Args:
            acon: sensor ACON.
        """
        checkpoint_location = acon.get("base_checkpoint_location")
        if checkpoint_location:
            checkpoint_location = (
                f"{checkpoint_location.rstrip('/')}/lakehouse_engine/"
                f"sensors/{acon['sensor_id']}"
            )

        return cls(
            sensor_id=acon["sensor_id"],
            assets=acon["assets"],
            control_db_table_name=acon["control_db_table_name"],
            input_spec=InputSpec(**acon["input_spec"]),
            preprocess_query=acon.get("preprocess_query"),
            checkpoint_location=checkpoint_location,
            fail_on_empty_result=acon.get("fail_on_empty_result", True),
        )


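# Illustrative sketch (not part of the original module): a minimal sensor ACON as
# consumed by SensorSpec.create_from_acon; names and paths are placeholders. With
# base_checkpoint_location "s3://my-bucket/checkpoints", the derived checkpoint becomes
# "s3://my-bucket/checkpoints/lakehouse_engine/sensors/sensor_sales".
#
#     SensorSpec.create_from_acon({
#         "sensor_id": "sensor_sales",
#         "assets": ["sales_bronze"],
#         "control_db_table_name": "my_db.sensor_control",
#         "input_spec": {"spec_id": "upstream", "read_type": "batch",
#                        "data_format": "delta", "location": "s3://my-bucket/upstream/"},
#         "base_checkpoint_location": "s3://my-bucket/checkpoints",
#     })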
class SensorStatus(Enum):
    """Status for a sensor."""

    ACQUIRED_NEW_DATA = "ACQUIRED_NEW_DATA"
    PROCESSED_NEW_DATA = "PROCESSED_NEW_DATA"


SENSOR_SCHEMA = StructType(
    [
        StructField("sensor_id", StringType(), False),
        StructField("assets", ArrayType(StringType(), False), True),
        StructField("status", StringType(), False),
        StructField("status_change_timestamp", TimestampType(), False),
        StructField("checkpoint_location", StringType(), True),
        StructField("upstream_key", StringType(), True),
        StructField("upstream_value", StringType(), True),
    ]
)

SENSOR_UPDATE_SET: dict = {
    "sensors.sensor_id": "updates.sensor_id",
    "sensors.status": "updates.status",
    "sensors.status_change_timestamp": "updates.status_change_timestamp",
}

SENSOR_ALLOWED_DATA_FORMATS = {
    ReadType.STREAMING.value: [InputFormat.KAFKA.value, *FILE_INPUT_FORMATS],
    ReadType.BATCH.value: [
        InputFormat.DELTAFILES.value,
        InputFormat.JDBC.value,
    ],
}


class SAPLogchain(Enum):
    """Defaults used on consuming data from SAP Logchain."""

    DBTABLE = "SAPPHA.RSPCLOGCHAIN"
    GREEN_STATUS = "G"
    ENGINE_TABLE = "sensor_new_data"


class RestoreType(Enum):
    """Archive types."""

    BULK = "Bulk"
    STANDARD = "Standard"
    EXPEDITED = "Expedited"

    @classmethod
    def values(cls):  # type: ignore
        """Generates a list containing all enum values.

        Return:
            A list with all enum values.
        """
        return (c.value for c in cls)

    @classmethod
    def exists(cls, restore_type: str) -> bool:
        """Checks if the restore type exists in the enum values.

        Args:
            restore_type: restore type to check if exists.

        Return:
            If the restore type exists in our enum.
        """
        return restore_type in cls.values()


class RestoreStatus(Enum):
    """Restore statuses."""

    NOT_STARTED = "not_started"
    ONGOING = "ongoing"
    RESTORED = "restored"


ARCHIVE_STORAGE_CLASS = [
    "GLACIER",
    "DEEP_ARCHIVE",
    "GLACIER_IR",
]


class SQLParser(Enum):
    """Defaults to use for parsing."""

    DOUBLE_QUOTES = '"'
    SINGLE_QUOTES = "'"
    BACKSLASH = "\\"
    SINGLE_TRACE = "-"
    DOUBLE_TRACES = "--"
    SLASH = "/"
    OPENING_MULTIPLE_LINE_COMMENT = "/*"
    CLOSING_MULTIPLE_LINE_COMMENT = "*/"
    PARAGRAPH = "\n"
    STAR = "*"

    MULTIPLE_LINE_COMMENT = [
        OPENING_MULTIPLE_LINE_COMMENT,
        CLOSING_MULTIPLE_LINE_COMMENT,
    ]


class GABDefaults(Enum):
    """Defaults used on the GAB process."""

    DATE_FORMAT = "%Y-%m-%d"
    DIMENSIONS_DEFAULT_COLUMNS = ["from_date", "to_date"]
    DEFAULT_DIMENSION_CALENDAR_TABLE = "dim_calendar"
    DEFAULT_LOOKUP_QUERY_BUILDER_TABLE = "lkp_query_builder"


class GABStartOfWeek(Enum):
    """Representation of start of week values on GAB."""

    SUNDAY = "S"
    MONDAY = "M"

    @classmethod
    def get_start_of_week(cls) -> dict:
        """Get the start of week enum as a dict.

        Returns:
            dict containing all enum entries as `{name:value}`.
        """
        return {
            start_of_week.name: start_of_week.value
            for start_of_week in list(GABStartOfWeek)
        }

    @classmethod
    def get_values(cls) -> set[str]:
        """Get the start of week enum values as set.

        Returns:
            set containing all possible values `{value}`.
        """
        return {start_of_week.value for start_of_week in list(GABStartOfWeek)}


@dataclass
class GABSpec(object):
    """Gab Specification.

    query_label_filter: query use-case label to execute.
    queue_filter: queue to execute the job.
    cadence_filter: selected cadences to build the asset.
    target_database: target database to write.
    current_date: current date.
    start_date: period start date.
    end_date: period end date.
    rerun_flag: rerun flag.
    target_table: target table to write.
    source_database: source database.
    gab_base_path: base path to read the use cases.
    lookup_table: gab configuration table.
    calendar_table: gab calendar table.
    """

    query_label_filter: list[str]
    queue_filter: list[str]
    cadence_filter: list[str]
    target_database: str
    current_date: datetime
    start_date: datetime
    end_date: datetime
    rerun_flag: str
    target_table: str
    source_database: str
    gab_base_path: str
    lookup_table: str
    calendar_table: str

    @classmethod
    def create_from_acon(cls, acon: dict):  # type: ignore
        """Create GabSpec from acon.

        Args:
            acon: gab ACON.
        """
        lookup_table = f"{acon['source_database']}." + (
            acon.get(
                "lookup_table", GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE.value
            )
        )

        calendar_table = f"{acon['source_database']}." + (
            acon.get(
                "calendar_table", GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE.value
            )
        )

        def format_date(date_to_format: Union[datetime, str]) -> datetime:
            if isinstance(date_to_format, str):
                return datetime.strptime(date_to_format, GABDefaults.DATE_FORMAT.value)
            else:
                return date_to_format

        return cls(
            query_label_filter=acon["query_label_filter"],
            queue_filter=acon["queue_filter"],
            cadence_filter=acon["cadence_filter"],
            target_database=acon["target_database"],
            current_date=datetime.now(),
            start_date=format_date(acon["start_date"]),
            end_date=format_date(acon["end_date"]),
            rerun_flag=acon["rerun_flag"],
            target_table=acon["target_table"],
            source_database=acon["source_database"],
            gab_base_path=acon["gab_base_path"],
            lookup_table=lookup_table,
            calendar_table=calendar_table,
        )


class GABCadence(Enum):
    """Representation of the supported cadences on GAB."""

    DAY = 1
    WEEK = 2
    MONTH = 3
    QUARTER = 4
    YEAR = 5

    @classmethod
    def get_ordered_cadences(cls) -> dict:
        """Get the cadences ordered by the value.

        Returns:
            dict containing ordered cadences as `{name:value}`.
        """
        cadences = list(GABCadence)
        return {
            cadence.name: cadence.value
            for cadence in sorted(cadences, key=lambda gab_cadence: gab_cadence.value)
        }

    @classmethod
    def get_cadences(cls) -> set[str]:
        """Get the cadences values as set.

        Returns:
            set containing all possible cadence values as `{value}`.
        """
        return {cadence.name for cadence in list(GABCadence)}

    @classmethod
    def order_cadences(cls, cadences_to_order: list[str]) -> list[str]:
        """Order a list of cadences by value.

        Returns:
            ordered set containing the received cadences.
        """
        return sorted(
            cadences_to_order,
            key=lambda item: cls.get_ordered_cadences().get(item),  # type: ignore
        )


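# Usage sketch (derived directly from the helpers above): cadence names can be sorted
# into their natural order, e.g.:
#
#     GABCadence.order_cadences(["YEAR", "DAY", "MONTH"])
#     # -> ["DAY", "MONTH", "YEAR"]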
class GABKeys:
    """Constants used to update pre-configured gab dict key."""

    JOIN_SELECT = "join_select"
    PROJECT_START = "project_start"
    PROJECT_END = "project_end"


class GABReplaceableKeys:
    """Constants used to replace pre-configured gab dict values."""

    CADENCE = "${cad}"
    DATE_COLUMN = "${date_column}"
    CONFIG_WEEK_START = "${config_week_start}"
    RECONCILIATION_CADENCE = "${rec_cadence}"


class GABCombinedConfiguration(Enum):
    """GAB combined configuration.

    Based on the use case configuration, return the values to override in the SQL file.
    This enum aims to exhaustively map each combination of `cadence`, `reconciliation`,
    `week_start` and `snap_flag` to the corresponding values `join_select`,
    `project_start` and `project_end`, used to replace these values in the stages
    SQL file.

    Return corresponding configuration (join_select, project_start, project_end) for
    each combination (cadence x recon x week_start x snap_flag).
    """

    _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE = (
        "date(date_trunc('${cad}',${date_column}))"
    )
    _DEFAULT_PROJECT_START = "df_cal.cadence_start_date"
    _DEFAULT_PROJECT_END = "df_cal.cadence_end_date"

    COMBINED_CONFIGURATION = {
        # Combination of:
        # - cadence: `DAY`
        # - reconciliation_window: `DAY`, `WEEK`, `MONTH`, `QUARTER`, `YEAR`
        # - week_start: `S`, `M`
        # - snapshot_flag: `Y`, `N`
        1: {
            "cadence": GABCadence.DAY.name,
            "recon": GABCadence.get_cadences(),
            "week_start": GABStartOfWeek.get_values(),
            "snap_flag": {"Y", "N"},
            "join_select": "",
            "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
            "project_end": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
        },
        # Combination of:
        # - cadence: `WEEK`
        # - reconciliation_window: `DAY`
        # - week_start: `S`, `M`
        # - snapshot_flag: `Y`
        2: {
            "cadence": GABCadence.WEEK.name,
            "recon": GABCadence.DAY.name,
            "week_start": GABStartOfWeek.get_values(),
            "snap_flag": "Y",
            "join_select": """
                select distinct case
                    when '${config_week_start}' = 'Monday' then weekstart_mon
                    when '${config_week_start}' = 'Sunday' then weekstart_sun
                end as cadence_start_date,
                calendar_date as cadence_end_date
            """,
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        # Combination of:
        # - cadence: `WEEK`
        # - reconciliation_window: `DAY`, `MONTH`, `QUARTER`, `YEAR`
        # - week_start: `M`
        # - snapshot_flag: `Y`, `N`
        3: {
            "cadence": GABCadence.WEEK.name,
            "recon": {
                GABCadence.DAY.name,
                GABCadence.MONTH.name,
                GABCadence.QUARTER.name,
                GABCadence.YEAR.name,
            },
            "week_start": "M",
            "snap_flag": {"Y", "N"},
            "join_select": """
                select distinct case
                    when '${config_week_start}' = 'Monday' then weekstart_mon
                    when '${config_week_start}' = 'Sunday' then weekstart_sun
                end as cadence_start_date,
                case
                    when '${config_week_start}' = 'Monday' then weekend_mon
                    when '${config_week_start}' = 'Sunday' then weekend_sun
                end as cadence_end_date""",
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        4: {
            "cadence": GABCadence.MONTH.name,
            "recon": GABCadence.DAY.name,
            "week_start": GABStartOfWeek.get_values(),
            "snap_flag": "Y",
            "join_select": """
                select distinct month_start as cadence_start_date,
                calendar_date as cadence_end_date
            """,
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        5: {
            "cadence": GABCadence.MONTH.name,
            "recon": GABCadence.WEEK.name,
            "week_start": GABStartOfWeek.MONDAY.value,
            "snap_flag": "Y",
            "join_select": """
                select distinct month_start as cadence_start_date,
                case
                    when date(
                        date_trunc('MONTH',add_months(calendar_date, 1))
                    )-1 < weekend_mon
                    then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
                    else weekend_mon
                end as cadence_end_date""",
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        6: {
            "cadence": GABCadence.MONTH.name,
            "recon": GABCadence.WEEK.name,
            "week_start": GABStartOfWeek.SUNDAY.value,
            "snap_flag": "Y",
            "join_select": """
                select distinct month_start as cadence_start_date,
                case
                    when date(
                        date_trunc('MONTH',add_months(calendar_date, 1))
                    )-1 < weekend_sun
                    then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
                    else weekend_sun
                end as cadence_end_date""",
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        7: {
            "cadence": GABCadence.MONTH.name,
            "recon": GABCadence.get_cadences(),
            "week_start": GABStartOfWeek.get_values(),
            "snap_flag": {"Y", "N"},
            "join_select": "",
            "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
            "project_end": "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1",
        },
        8: {
            "cadence": GABCadence.QUARTER.name,
            "recon": GABCadence.DAY.name,
            "week_start": GABStartOfWeek.get_values(),
            "snap_flag": "Y",
            "join_select": """
                select distinct quarter_start as cadence_start_date,
                calendar_date as cadence_end_date
            """,
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        9: {
            "cadence": GABCadence.QUARTER.name,
            "recon": GABCadence.WEEK.name,
            "week_start": GABStartOfWeek.MONDAY.value,
            "snap_flag": "Y",
            "join_select": """
                select distinct quarter_start as cadence_start_date,
                case
                    when weekend_mon > date(
                        date_trunc('QUARTER',add_months(calendar_date, 3))
                    )-1
                    then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
                    else weekend_mon
                end as cadence_end_date""",
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        10: {
            "cadence": GABCadence.QUARTER.name,
            "recon": GABCadence.WEEK.name,
            "week_start": GABStartOfWeek.SUNDAY.value,
            "snap_flag": "Y",
            "join_select": """
                select distinct quarter_start as cadence_start_date,
                case
                    when weekend_sun > date(
                        date_trunc('QUARTER',add_months(calendar_date, 3))
                    )-1
                    then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
                    else weekend_sun
                end as cadence_end_date""",
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        11: {
            "cadence": GABCadence.QUARTER.name,
            "recon": GABCadence.MONTH.name,
            "week_start": GABStartOfWeek.get_values(),
            "snap_flag": "Y",
            "join_select": """
                select distinct quarter_start as cadence_start_date,
                month_end as cadence_end_date
            """,
            "project_start": _DEFAULT_PROJECT_START,
            "project_end": _DEFAULT_PROJECT_END,
        },
        12: {
            "cadence": GABCadence.QUARTER.name,
            "recon": GABCadence.YEAR.name,
            "week_start": GABStartOfWeek.get_values(),
            "snap_flag": "N",
            "join_select": "",
            "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
            "project_end": """
                date(
                    date_trunc(
                        '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)
                    )
                )-1
            """,
        },
        13: {
            "cadence": GABCadence.QUARTER.name,
            "recon": GABCadence.get_cadences(),
            "week_start": GABStartOfWeek.get_values(),
            "snap_flag": "N",
            "join_select": "",
            "project_start": _PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
            "project_end": """
                date(
                    date_trunc(
                        '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)
                    )
                )-1
            """,
        },
        14: {
            "cadence": GABCadence.YEAR.name,
            "recon": GABCadence.WEEK.name,
            "week_start": GABStartOfWeek.MONDAY.value,
            "snap_flag": "Y",
            "join_select": """
                select distinct year_start as cadence_start_date,
                case
                    when weekend_mon > date(
                        date_trunc('YEAR',add_months(calendar_date, 12))
-1299 )-1
-1300 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
-1301 else weekend_mon
-1302 end as cadence_end_date""",
-1303"project_start":_DEFAULT_PROJECT_START,
-1304"project_end":_DEFAULT_PROJECT_END,
-1305},
-130615:{
-1307"cadence":GABCadence.YEAR.name,
-1308"recon":GABCadence.WEEK.name,
-1309"week_start":GABStartOfWeek.SUNDAY.value,
-1310"snap_flag":"Y",
-1311"join_select":"""
-1312 select distinct year_start as cadence_start_date,
-1313 case
-1314 when weekend_sun > date(
-1315 date_trunc('YEAR',add_months(calendar_date, 12))
-1316 )-1
-1317 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
-1318 else weekend_sun
-1319 end as cadence_end_date""",
-1320"project_start":_DEFAULT_PROJECT_START,
-1321"project_end":_DEFAULT_PROJECT_END,
-1322},
-132316:{
-1324"cadence":GABCadence.YEAR.name,
-1325"recon":GABCadence.get_cadences(),
-1326"week_start":GABStartOfWeek.get_values(),
-1327"snap_flag":"N",
-1328"inverse_flag":"Y",
-1329"join_select":"",
-1330"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
-1331"project_end":"""
-1332 date(
-1333 date_trunc(
-1334 '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)
-1335 )
-1336 )-1
-1337 """,
-1338},
-133917:{
-1340"cadence":GABCadence.YEAR.name,
-1341"recon":{
-1342GABCadence.DAY.name,
-1343GABCadence.MONTH.name,
-1344GABCadence.QUARTER.name,
-1345},
-1346"week_start":GABStartOfWeek.get_values(),
-1347"snap_flag":"Y",
-1348"join_select":"""
-1349 select distinct year_start as cadence_start_date,
-1350 case
-1351 when '${rec_cadence}' = 'DAY' then calendar_date
-1352 when '${rec_cadence}' = 'MONTH' then month_end
-1353 when '${rec_cadence}' = 'QUARTER' then quarter_end
-1354 end as cadence_end_date
-1355 """,
-1356"project_start":_DEFAULT_PROJECT_START,
-1357"project_end":_DEFAULT_PROJECT_END,
-1358},
-135918:{
-1360"cadence":GABCadence.get_cadences(),
-1361"recon":GABCadence.get_cadences(),
-1362"week_start":GABStartOfWeek.get_values(),
-1363"snap_flag":{"Y","N"},
-1364"join_select":"""
-1365 select distinct
-1366 case
-1367 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
-1368 then weekstart_mon
-1369 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
-1370 then weekstart_sun
-1371 else
-1372 date(date_trunc('${cad}',calendar_date))
-1373 end as cadence_start_date,
-1374 case
-1375 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
-1376 then weekend_mon
-1377 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
-1378 then weekend_sun
-1379 when '${cad}' = 'DAY'
-1380 then date(date_trunc('${cad}',calendar_date))
-1381 when '${cad}' = 'MONTH'
-1382 then date(
-1383 date_trunc(
-1384 'MONTH',
-1385 add_months(date(date_trunc('${cad}',calendar_date)), 1)
-1386 )
-1387 )-1
-1388 when '${cad}' = 'QUARTER'
-1389 then date(
-1390 date_trunc(
-1391 'QUARTER',
-1392 add_months(date(date_trunc('${cad}',calendar_date)) , 3)
-1393 )
-1394 )-1
-1395 when '${cad}' = 'YEAR'
-1396 then date(
-1397 date_trunc(
-1398 'YEAR',
-1399 add_months(date(date_trunc('${cad}',calendar_date)), 12)
-1400 )
-1401 )-1
-1402 end as cadence_end_date
-1403 """,
-1404"project_start":_DEFAULT_PROJECT_START,
-1405"project_end":_DEFAULT_PROJECT_END,
-1406},
-1407}
+ 461 - validations_store_prefix - prefix where to store validations' data. Note: only
+ 462 applicable for store_backend s3.
+ 463 - data_docs_prefix - prefix where to store data_docs' data.
+ 464 - checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only
+ 465 applicable for store_backend s3.
+ 466 - data_asset_name - name of the data asset to consider when configuring the great
+ 467 expectations' data source.
+ 468 - expectation_suite_name - name to consider for great expectations' suite.
+ 469 - result_sink_db_table - db.table_name indicating the database and table in which
+ 470 to save the results of the DQ process.
+ 471 - result_sink_location - file system location in which to save the results of the
+ 472 DQ process.
+ 473 - data_product_name - name of the data product.
+ 474 - result_sink_partitions - the list of partitions to consider.
+ 475 - result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
+ 476 - result_sink_options - extra spark options for configuring the result sink.
+ 477 E.g: can be used to configure a Kafka sink if result_sink_format is kafka.
+ 478 - result_sink_explode - flag to determine if the output table/location should have
+ 479 the columns exploded (as True) or not (as False). Default: True.
+ 480 - result_sink_extra_columns - list of extra columns to be exploded (following
+ 481 the pattern "<name>.*") or columns to be selected. It is only used when
+ 482 result_sink_explode is set to True.
+ 483 - source - name of the data source, to make it easier to identify in analysis.
+ 484        If not specified, it defaults to <input_id>. This is only used
+ 485        when result_sink_explode is set to True.
+ 486 - fail_on_error - whether to fail the algorithm if the validations of your data in
+ 487 the DQ process failed.
+ 488 - cache_df - whether to cache the dataframe before running the DQ process or not.
+ 489 - critical_functions - functions that should not fail. When this argument is
+ 490 defined, fail_on_error is nullified.
+ 491 - max_percentage_failure - percentage of failure that should be allowed.
+ 492 This argument has priority over both fail_on_error and critical_functions.
+ 493 """
+ 494
+ 495spec_id:str
+ 496input_id:str
+ 497dq_type:str
+ 498dq_functions:Optional[List[DQFunctionSpec]]=None
+ 499dq_db_table:Optional[str]=None
+ 500dq_table_table_filter:Optional[str]=None
+ 501dq_table_extra_filters:Optional[str]=None
+ 502execution_point:Optional[str]=None
+ 503unexpected_rows_pk:Optional[List[str]]=None
+ 504tbl_to_derive_pk:Optional[str]=None
+ 505gx_result_format:Optional[str]="COMPLETE"
+ 506tag_source_data:Optional[bool]=False
+ 507store_backend:str=DQDefaults.STORE_BACKEND.value
+ 508local_fs_root_dir:Optional[str]=None
+ 509data_docs_local_fs:Optional[str]=None
+ 510bucket:Optional[str]=None
+ 511data_docs_bucket:Optional[str]=None
+ 512expectations_store_prefix:str=DQDefaults.EXPECTATIONS_STORE_PREFIX.value
+ 513validations_store_prefix:str=DQDefaults.VALIDATIONS_STORE_PREFIX.value
+ 514data_docs_prefix:str=DQDefaults.DATA_DOCS_PREFIX.value
+ 515checkpoint_store_prefix:str=DQDefaults.CHECKPOINT_STORE_PREFIX.value
+ 516data_asset_name:Optional[str]=None
+ 517expectation_suite_name:Optional[str]=None
+ 518result_sink_db_table:Optional[str]=None
+ 519result_sink_location:Optional[str]=None
+ 520data_product_name:Optional[str]=None
+ 521result_sink_partitions:Optional[List[str]]=None
+ 522result_sink_format:str=OutputFormat.DELTAFILES.value
+ 523result_sink_options:Optional[dict]=None
+ 524result_sink_explode:bool=True
+ 525result_sink_extra_columns:Optional[List[str]]=None
+ 526source:Optional[str]=None
+ 527fail_on_error:bool=True
+ 528cache_df:bool=False
+ 529critical_functions:Optional[List[DQFunctionSpec]]=None
+ 530max_percentage_failure:Optional[float]=None
+ 531
+ 532
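A minimal sketch of how a DQSpec could be assembled for a validator run that writes its results to a Delta result sink. The field names come from the listing above; the import path (lakehouse_engine.core.definitions) and the DQFunctionSpec fields (function, args) are assumptions, and the table and expectation names are illustrative.

from lakehouse_engine.core.definitions import DQFunctionSpec, DQSpec  # assumed import path

dq_spec = DQSpec(
    spec_id="sales_dq",                        # illustrative ids
    input_id="sales_orders",
    dq_type="validator",
    dq_functions=[
        DQFunctionSpec(                        # fields assumed: function + args
            function="expect_column_values_to_not_be_null",
            args={"column": "order_id"},
        )
    ],
    result_sink_db_table="dq_db.sales_dq_results",  # hypothetical result sink table
    fail_on_error=True,
)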
+ 533@dataclass
+ 534classMergeOptions(object):
+ 535"""Options for a merge operation.
+ 536
+ 537 - merge_predicate: predicate to apply to the merge operation so that we can
+ 538 check if a new record corresponds to a record already included in the
+ 539 historical data.
+ 540 - insert_only: indicates if the merge should only insert data (e.g., deduplicate
+ 541 scenarios).
+ 542 - delete_predicate: predicate to apply to the delete operation.
+ 543 - update_predicate: predicate to apply to the update operation.
+ 544 - insert_predicate: predicate to apply to the insert operation.
+ 545 - update_column_set: rules to apply to the update operation, which allow setting
+ 546        the value for each column to be updated.
+ 547        (e.g. {"data": "new.data", "count": "current.count + 1"} )
+ 548 - insert_column_set: rules to apply to the insert operation, which allow setting
+ 549        the value for each column to be inserted.
+ 550        (e.g. {"date": "updates.date", "count": "1"} )
+ 551 """
+ 552
+ 553merge_predicate:str
+ 554insert_only:bool=False
+ 555delete_predicate:Optional[str]=None
+ 556update_predicate:Optional[str]=None
+ 557insert_predicate:Optional[str]=None
+ 558update_column_set:Optional[dict]=None
+ 559insert_column_set:Optional[dict]=None
+ 560
+ 561
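A short sketch of MergeOptions mirroring the docstring examples above; the merge predicate and the dataset aliases it uses are illustrative, and the import path is an assumption.

from lakehouse_engine.core.definitions import MergeOptions  # assumed import path

merge_opts = MergeOptions(
    merge_predicate="current.order_id = new.order_id",  # illustrative natural key match
    update_column_set={"data": "new.data", "count": "current.count + 1"},
    insert_column_set={"date": "updates.date", "count": "1"},
)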
+ 562@dataclass
+ 563classOutputSpec(object):
+ 564"""Specification of an algorithm output.
+ 565
+ 566 This is very aligned with the way the execution environment connects to the output
+ 567 systems (e.g., spark outputs).
+ 568
+ 569 - spec_id: id of the output specification.
+ 570 - input_id: id of the corresponding input specification.
+ 571 - write_type: type of write operation.
+ 572 - data_format: format of the output. Defaults to DELTA.
+ 573 - db_table: table name in the form of `<db>.<table>`.
+ 574 - location: uri that identifies from where to write data in the specified format.
+ 575 - partitions: list of partition input_col names.
+ 576 - merge_opts: options to apply to the merge operation.
+ 577 - streaming_micro_batch_transformers: transformers to invoke for each streaming
+ 578 micro batch, before writing (i.e., in Spark's foreachBatch structured
+ 579        streaming function). Note: the lakehouse engine manages this for you, so
+ 580        we don't advise manually specifying transformations through this parameter.
+ 581        Supply them as regular transformers in the transform_specs section
+ 582        of an ACON.
+ 583 - streaming_once: if the streaming query is to be executed just once, or not,
+ 584 generating just one micro batch.
+ 585 - streaming_processing_time: if streaming query is to be kept alive, this indicates
+ 586 the processing time of each micro batch.
+ 587 - streaming_available_now: if set to True, set a trigger that processes all
+ 588 available data in multiple batches then terminates the query.
+ 589 When using streaming, this is the default trigger that the lakehouse-engine will
+ 590 use, unless you configure a different one.
+ 591 - streaming_continuous: set a trigger that runs a continuous query with a given
+ 592 checkpoint interval.
+ 593 - streaming_await_termination: whether to wait (True) for the termination of the
+ 594 streaming query (e.g. timeout or exception) or not (False). Default: True.
+ 595 - streaming_await_termination_timeout: a timeout to set to the
+ 596 streaming_await_termination. Default: None.
+ 597 - with_batch_id: whether to include the streaming batch id in the final data,
+ 598 or not. It only takes effect in streaming mode.
+ 599 - options: dict with other relevant options according to the execution environment
+ 600 (e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for
+ 601 streaming, etc.
+ 602 - streaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers
+ 603 but for the DQ functions to be executed. Used internally by the lakehouse
+ 604 engine, so you don't have to supply DQ functions through this parameter. Use the
+ 605 dq_specs of the acon instead.
+ 606 """
+ 607
+ 608spec_id:str
+ 609input_id:str
+ 610write_type:str
+ 611data_format:str=OutputFormat.DELTAFILES.value
+ 612db_table:Optional[str]=None
+ 613location:Optional[str]=None
+ 614merge_opts:Optional[MergeOptions]=None
+ 615partitions:Optional[List[str]]=None
+ 616streaming_micro_batch_transformers:Optional[List[TransformerSpec]]=None
+ 617streaming_once:Optional[bool]=None
+ 618streaming_processing_time:Optional[str]=None
+ 619streaming_available_now:bool=True
+ 620streaming_continuous:Optional[str]=None
+ 621streaming_await_termination:bool=True
+ 622streaming_await_termination_timeout:Optional[int]=None
+ 623with_batch_id:bool=False
+ 624options:Optional[dict]=None
+ 625streaming_micro_batch_dq_processors:Optional[List[DQSpec]]=None
+ 626
+ 627
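A sketch of an OutputSpec for an append write to a Delta table; the write_type label, table name and checkpoint path are assumptions for illustration, as is the import path.

from lakehouse_engine.core.definitions import OutputFormat, OutputSpec  # assumed import path

output_spec = OutputSpec(
    spec_id="sales_silver",
    input_id="sales_transformed",
    write_type="append",                           # assumed write type label
    data_format=OutputFormat.DELTAFILES.value,     # "delta"
    db_table="silver.sales",                       # hypothetical target table
    partitions=["order_date"],
    options={"checkpointLocation": "s3://my-bucket/checkpoints/sales"},  # hypothetical path
)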
+ 628@dataclass
+ 629classTerminatorSpec(object):
+ 630"""Terminator Specification.
+ 631
+ 632 I.e., the specification that defines a terminator operation to be executed. Examples
+ 633 are compute statistics, vacuum, optimize, etc.
+ 634
+ 635 - function: terminator function to execute.
+ 636 - args: arguments of the terminator function.
+ 637 - input_id: id of the corresponding output specification (Optional).
+ 638 """
+ 639
+ 640function:str
+ 641args:Optional[dict]=None
+ 642input_id:Optional[str]=None
+ 643
+ 644
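A sketch of a TerminatorSpec; the terminator function name is hypothetical and only illustrates the function/args/input_id shape, and the import path is an assumption.

from lakehouse_engine.core.definitions import TerminatorSpec  # assumed import path

terminator_spec = TerminatorSpec(
    function="optimize_dataset",          # hypothetical terminator function name
    args={"db_table": "silver.sales"},    # illustrative arguments
    input_id="sales_silver",              # ties the terminator to an output spec
)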
+ 645@dataclass
+ 646classReconciliatorSpec(object):
+ 647"""Reconciliator Specification.
+ 648
+ 649 - metrics: list of metrics in the form of:
+ 650 [{
+ 651 metric: name of the column present in both truth and current datasets,
+ 652 aggregation: sum, avg, max, min, ...,
+ 653 type: percentage or absolute,
+ 654 yellow: value,
+ 655 red: value
+ 656 }].
+ 657 - recon_type: reconciliation type (percentage or absolute). Percentage calculates
+ 658        the difference between truth and current results as a percentage ((x - y)/x), and
+ 659 absolute calculates the raw difference (x - y).
+ 660 - truth_input_spec: input specification of the truth data.
+ 661 - current_input_spec: input specification of the current results data.
+ 662 - truth_preprocess_query: additional query on top of the truth input data to
+ 663        preprocess the truth data before it is fed into the reconciliation process.
+ 664        Important note: you need to assume that the data coming out of
+ 665        the truth_input_spec can be referenced by a table called 'truth'.
+ 666 - truth_preprocess_query_args: optional dict with the functions/transformations to
+ 667        apply on top of the truth_preprocess_query and the respective arguments. Note:
+ 668        cache is applied to the DataFrame by default. To turn this default behavior
+ 669        off, pass `"truth_preprocess_query_args": []`.
+ 670 - current_preprocess_query: additional query on top of the current results input
+ 671        data to preprocess the current results data before it is fed into the
+ 672        reconciliation process. Important note: you need to assume that the data
+ 673        coming out of the current_input_spec can be referenced by a table called 'current'.
+ 674 - current_preprocess_query_args: optional dict with the
+ 675        functions/transformations to apply on top of the current_preprocess_query
+ 676        and the respective arguments. Note: cache is applied to the DataFrame
+ 677        by default. To turn this default behavior off, pass
+ 678        `"current_preprocess_query_args": []`.
+ 679 - ignore_empty_df: optional boolean to skip the recon process if both the source
+ 680        and target dataframes are empty; in that case recon exits with a success code (passed).
+ 681 """
+ 682
+ 683metrics:List[dict]
+ 684truth_input_spec:InputSpec
+ 685current_input_spec:InputSpec
+ 686truth_preprocess_query:Optional[str]=None
+ 687truth_preprocess_query_args:Optional[List[dict]]=None
+ 688current_preprocess_query:Optional[str]=None
+ 689current_preprocess_query_args:Optional[List[dict]]=None
+ 690ignore_empty_df:Optional[bool]=False
+ 691
+ 692
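A sketch of a ReconciliatorSpec following the metrics shape described above; the InputSpec fields used here (spec_id, read_type, db_table) appear later in this listing, while the table names, thresholds and import path are assumptions.

from lakehouse_engine.core.definitions import InputSpec, ReconciliatorSpec  # assumed import path

recon_spec = ReconciliatorSpec(
    metrics=[
        {
            "metric": "net_sales",      # column present in both truth and current data
            "aggregation": "sum",
            "type": "percentage",
            "yellow": 0.01,
            "red": 0.05,
        }
    ],
    truth_input_spec=InputSpec(
        spec_id="truth", read_type="batch", db_table="gold.sales_truth"  # hypothetical table
    ),
    current_input_spec=InputSpec(
        spec_id="current", read_type="batch", db_table="gold.sales"      # hypothetical table
    ),
    # the truth data is referenced as a table called 'truth' inside this query
    truth_preprocess_query="SELECT * FROM truth WHERE order_date >= '2024-01-01'",
)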
+ 693@dataclass
+ 694classDQValidatorSpec(object):
+ 695"""Data Quality Validator Specification.
+ 696
+ 697 - input_spec: input specification of the data to be checked/validated.
+ 698 - dq_spec: data quality specification.
+ 699 - restore_prev_version: specify if, having
+ 700 delta table/files as input, they should be restored to the
+ 701 previous version if the data quality process fails. Note: this
+ 702 is only considered if fail_on_error is kept as True.
+ 703 """
+ 704
+ 705input_spec:InputSpec
+ 706dq_spec:DQSpec
+ 707restore_prev_version:Optional[bool]=False
+ 708
+ 709
+ 710classSQLDefinitions(Enum):
+ 711"""SQL definitions statements."""
+ 712
+ 713compute_table_stats="ANALYZE TABLE {} COMPUTE STATISTICS"
+ 714drop_table_stmt="DROP TABLE IF EXISTS"
+ 715drop_view_stmt="DROP VIEW IF EXISTS"
+ 716truncate_stmt="TRUNCATE TABLE"
+ 717describe_stmt="DESCRIBE TABLE"
+ 718optimize_stmt="OPTIMIZE"
+ 719show_tbl_props_stmt="SHOW TBLPROPERTIES"
+ 720delete_where_stmt="DELETE FROM {} WHERE {}"
+ 721
+ 722
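The enum values above are plain SQL templates; a sketch of how a caller could fill their placeholders with str.format (the table and predicate are illustrative, and the import path is an assumption).

from lakehouse_engine.core.definitions import SQLDefinitions  # assumed import path

stats_stmt = SQLDefinitions.compute_table_stats.value.format("silver.sales")
delete_stmt = SQLDefinitions.delete_where_stmt.value.format(
    "silver.sales", "order_date < '2020-01-01'"
)
# stats_stmt  == "ANALYZE TABLE silver.sales COMPUTE STATISTICS"
# delete_stmt == "DELETE FROM silver.sales WHERE order_date < '2020-01-01'"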
+ 723classFileManagerAPIKeys(Enum):
+ 724"""File Manager s3 api keys."""
+ 725
+ 726CONTENTS="Contents"
+ 727KEY="Key"
+ 728CONTINUATION="NextContinuationToken"
+ 729BUCKET="Bucket"
+ 730OBJECTS="Objects"
+ 731
+ 732
+ 733@dataclass
+ 734classSensorSpec(object):
+ 735"""Sensor Specification.
+ 736
+ 737 - sensor_id: sensor id.
+ 738 - assets: a list of assets that are considered available to
+ 739 consume downstream after this sensor has status
+ 740 PROCESSED_NEW_DATA.
+ 741 - control_db_table_name: db.table to store sensor metadata.
+ 742 - input_spec: input specification of the source to be checked for new data.
+ 743 - preprocess_query: SQL query to transform/filter the result from the
+ 744 upstream. Consider that we should refer to 'new_data' whenever
+ 745 we are referring to the input of the sensor. E.g.:
+ 746 "SELECT dummy_col FROM new_data WHERE ..."
+ 747 - checkpoint_location: optional location to store checkpoints to resume
+ 748        from. These checkpoints follow the same strategy as Spark checkpoints.
+ 749 For Spark readers that do not support checkpoints, use the
+ 750 preprocess_query parameter to form a SQL query to filter the result
+ 751 from the upstream accordingly.
+ 752 - fail_on_empty_result: if the sensor should throw an error if there is no new
+ 753 data in the upstream. Default: True.
+ 754 """
+ 755
+ 756sensor_id:str
+ 757assets:List[str]
+ 758control_db_table_name:str
+ 759input_spec:InputSpec
+ 760preprocess_query:Optional[str]
+ 761checkpoint_location:Optional[str]
+ 762fail_on_empty_result:bool=True
+ 763
+ 764@classmethod
+ 765defcreate_from_acon(cls,acon:dict):# type: ignore
+ 766"""Create SensorSpec from acon.
+ 767
+ 768 Args:
+ 769 acon: sensor ACON.
+ 770 """
+ 771checkpoint_location=acon.get("base_checkpoint_location")
+ 772ifcheckpoint_location:
+ 773checkpoint_location=(
+ 774f"{checkpoint_location.rstrip('/')}/lakehouse_engine/"
+ 775f"sensors/{acon['sensor_id']}"
+ 776)
+ 777
+ 778returncls(
+ 779sensor_id=acon["sensor_id"],
+ 780assets=acon["assets"],
+ 781control_db_table_name=acon["control_db_table_name"],
+ 782input_spec=InputSpec(**acon["input_spec"]),
+ 783preprocess_query=acon.get("preprocess_query"),
+ 784checkpoint_location=checkpoint_location,
+ 785fail_on_empty_result=acon.get("fail_on_empty_result",True),
+ 786)
+ 787
+ 788
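A sketch of building a SensorSpec through create_from_acon; the bucket, tables and query are illustrative and the import path is an assumption. Given a base_checkpoint_location, the checkpoint path is derived as shown in the final comment.

from lakehouse_engine.core.definitions import SensorSpec  # assumed import path

sensor_acon = {
    "sensor_id": "upstream_sales_sensor",
    "assets": ["silver.sales"],
    "control_db_table_name": "control_db.sensors",            # hypothetical control table
    "input_spec": {
        "spec_id": "upstream_sales",
        "read_type": "streaming",
        "data_format": "delta",
        "db_table": "bronze.sales",                            # hypothetical upstream table
    },
    "preprocess_query": "SELECT * FROM new_data WHERE order_date >= current_date()",
    "base_checkpoint_location": "s3://my-bucket/checkpoints",  # hypothetical bucket
}

sensor_spec = SensorSpec.create_from_acon(sensor_acon)
# sensor_spec.checkpoint_location ==
#   "s3://my-bucket/checkpoints/lakehouse_engine/sensors/upstream_sales_sensor"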
+ 789classSensorStatus(Enum):
+ 790"""Status for a sensor."""
+ 791
+ 792ACQUIRED_NEW_DATA="ACQUIRED_NEW_DATA"
+ 793PROCESSED_NEW_DATA="PROCESSED_NEW_DATA"
+ 794
+ 795
+ 796SENSOR_SCHEMA=StructType(
+ 797[
+ 798StructField("sensor_id",StringType(),False),
+ 799StructField("assets",ArrayType(StringType(),False),True),
+ 800StructField("status",StringType(),False),
+ 801StructField("status_change_timestamp",TimestampType(),False),
+ 802StructField("checkpoint_location",StringType(),True),
+ 803StructField("upstream_key",StringType(),True),
+ 804StructField("upstream_value",StringType(),True),
+ 805]
+ 806)
+ 807
+ 808SENSOR_UPDATE_SET:dict={
+ 809"sensors.sensor_id":"updates.sensor_id",
+ 810"sensors.status":"updates.status",
+ 811"sensors.status_change_timestamp":"updates.status_change_timestamp",
+ 812}
+ 813
+ 814SENSOR_ALLOWED_DATA_FORMATS={
+ 815ReadType.STREAMING.value:[InputFormat.KAFKA.value,*FILE_INPUT_FORMATS],
+ 816ReadType.BATCH.value:[
+ 817InputFormat.DELTAFILES.value,
+ 818InputFormat.JDBC.value,
+ 819],
+ 820}
+ 821
+ 822
+ 823classSAPLogchain(Enum):
+ 824"""Defaults used on consuming data from SAP Logchain."""
+ 825
+ 826DBTABLE="SAPPHA.RSPCLOGCHAIN"
+ 827GREEN_STATUS="G"
+ 828ENGINE_TABLE="sensor_new_data"
+ 829
+ 830
+ 831classRestoreType(Enum):
+ 832"""Archive types."""
+ 833
+ 834BULK="Bulk"
+ 835STANDARD="Standard"
+ 836EXPEDITED="Expedited"
+ 837
+ 838@classmethod
+ 839defvalues(cls):# type: ignore
+ 840"""Generates a list containing all enum values.
+ 841
+ 842 Return:
+ 843 A list with all enum values.
+ 844 """
+ 845return(c.valueforcincls)
+ 846
+ 847@classmethod
+ 848defexists(cls,restore_type:str)->bool:
+ 849"""Checks if the restore type exists in the enum values.
+ 850
+ 851 Args:
+ 852 restore_type: restore type to check if exists.
+ 853
+ 854 Return:
+ 855 If the restore type exists in our enum.
+ 856 """
+ 857returnrestore_typeincls.values()
+ 858
+ 859
+ 860classRestoreStatus(Enum):
+ 861"""Archive types."""
+ 862
+ 863NOT_STARTED="not_started"
+ 864ONGOING="ongoing"
+ 865RESTORED="restored"
+ 866
+ 867
+ 868ARCHIVE_STORAGE_CLASS=[
+ 869"GLACIER",
+ 870"DEEP_ARCHIVE",
+ 871"GLACIER_IR",
+ 872]
+ 873
+ 874
+ 875classSQLParser(Enum):
+ 876"""Defaults to use for parsing."""
+ 877
+ 878DOUBLE_QUOTES='"'
+ 879SINGLE_QUOTES="'"
+ 880BACKSLASH="\\"
+ 881SINGLE_TRACE="-"
+ 882DOUBLE_TRACES="--"
+ 883SLASH="/"
+ 884OPENING_MULTIPLE_LINE_COMMENT="/*"
+ 885CLOSING_MULTIPLE_LINE_COMMENT="*/"
+ 886PARAGRAPH="\n"
+ 887STAR="*"
+ 888
+ 889MULTIPLE_LINE_COMMENT=[
+ 890OPENING_MULTIPLE_LINE_COMMENT,
+ 891CLOSING_MULTIPLE_LINE_COMMENT,
+ 892]
+ 893
+ 894
+ 895classGABDefaults(Enum):
+ 896"""Defaults used on the GAB process."""
+ 897
+ 898DATE_FORMAT="%Y-%m-%d"
+ 899DIMENSIONS_DEFAULT_COLUMNS=["from_date","to_date"]
+ 900DEFAULT_DIMENSION_CALENDAR_TABLE="dim_calendar"
+ 901DEFAULT_LOOKUP_QUERY_BUILDER_TABLE="lkp_query_builder"
+ 902
+ 903
+ 904classGABStartOfWeek(Enum):
+ 905"""Representation of start of week values on GAB."""
+ 906
+ 907SUNDAY="S"
+ 908MONDAY="M"
+ 909
+ 910@classmethod
+ 911defget_start_of_week(cls)->dict:
+ 912"""Get the start of week enum as a dict.
+ 913
+ 914 Returns:
+ 915 dict containing all enum entries as `{name:value}`.
+ 916 """
+ 917return{
+ 918start_of_week.name:start_of_week.value
+ 919forstart_of_weekinlist(GABStartOfWeek)
+ 920}
+ 921
+ 922@classmethod
+ 923defget_values(cls)->set[str]:
+ 924"""Get the start of week enum values as set.
+ 925
+ 926 Returns:
+ 927 set containing all possible values `{value}`.
+ 928 """
+ 929return{start_of_week.valueforstart_of_weekinlist(GABStartOfWeek)}
+ 930
+ 931
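For reference, the two class methods above are simple accessors over the enum (import path assumed):

from lakehouse_engine.core.definitions import GABStartOfWeek  # assumed import path

GABStartOfWeek.get_start_of_week()  # {"SUNDAY": "S", "MONDAY": "M"}
GABStartOfWeek.get_values()         # {"S", "M"}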
+ 932@dataclass
+ 933classGABSpec(object):
+ 934"""Gab Specification.
+ 935
+ 936 query_label_filter: query use-case label to execute.
+ 937 queue_filter: queue to execute the job.
+ 938 cadence_filter: selected cadences to build the asset.
+ 939 target_database: target database to write.
+ 940 current_date: current date.
+ 941 start_date: period start date.
+ 942 end_date: period end date.
+ 943 rerun_flag: rerun flag.
+ 944 target_table: target table to write.
+ 945 source_database: source database.
+ 946 gab_base_path: base path to read the use cases.
+ 947 lookup_table: gab configuration table.
+ 948 calendar_table: gab calendar table.
+ 949 """
+ 950
+ 951query_label_filter:list[str]
+ 952queue_filter:list[str]
+ 953cadence_filter:list[str]
+ 954target_database:str
+ 955current_date:datetime
+ 956start_date:datetime
+ 957end_date:datetime
+ 958rerun_flag:str
+ 959target_table:str
+ 960source_database:str
+ 961gab_base_path:str
+ 962lookup_table:str
+ 963calendar_table:str
+ 964
+ 965@classmethod
+ 966defcreate_from_acon(cls,acon:dict):# type: ignore
+ 967"""Create GabSpec from acon.
+ 968
+ 969 Args:
+ 970 acon: gab ACON.
+ 971 """
+ 972lookup_table=f"{acon['source_database']}."+(
+ 973acon.get(
+ 974"lookup_table",GABDefaults.DEFAULT_LOOKUP_QUERY_BUILDER_TABLE.value
+ 975)
+ 976)
+ 977
+ 978calendar_table=f"{acon['source_database']}."+(
+ 979acon.get(
+ 980"calendar_table",GABDefaults.DEFAULT_DIMENSION_CALENDAR_TABLE.value
+ 981)
+ 982)
+ 983
+ 984defformat_date(date_to_format:Union[datetime,str])->datetime:
+ 985ifisinstance(date_to_format,str):
+ 986returndatetime.strptime(date_to_format,GABDefaults.DATE_FORMAT.value)
+ 987else:
+ 988returndate_to_format
+ 989
+ 990returncls(
+ 991query_label_filter=acon["query_label_filter"],
+ 992queue_filter=acon["queue_filter"],
+ 993cadence_filter=acon["cadence_filter"],
+ 994target_database=acon["target_database"],
+ 995current_date=datetime.now(),
+ 996start_date=format_date(acon["start_date"]),
+ 997end_date=format_date(acon["end_date"]),
+ 998rerun_flag=acon["rerun_flag"],
+ 999target_table=acon["target_table"],
+1000source_database=acon["source_database"],
+1001gab_base_path=acon["gab_base_path"],
+1002lookup_table=lookup_table,
+1003calendar_table=calendar_table,
+1004)
+1005
+1006
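A sketch of building a GABSpec through create_from_acon; when lookup_table and calendar_table are omitted they fall back to the GABDefaults table names, prefixed with source_database. The filters, databases and path below are illustrative, and the import path is an assumption.

from lakehouse_engine.core.definitions import GABSpec  # assumed import path

gab_acon = {
    "query_label_filter": ["sales_kpi"],     # hypothetical use case label
    "queue_filter": ["default"],
    "cadence_filter": ["DAY", "WEEK"],
    "target_database": "gold",
    "start_date": "2024-01-01",              # parsed with GABDefaults.DATE_FORMAT
    "end_date": "2024-01-31",
    "rerun_flag": "N",
    "target_table": "gab_use_case_results",
    "source_database": "silver",
    "gab_base_path": "s3://my-bucket/gab",   # hypothetical path
}

gab_spec = GABSpec.create_from_acon(gab_acon)
# gab_spec.lookup_table   == "silver.lkp_query_builder"
# gab_spec.calendar_table == "silver.dim_calendar"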
+1007classGABCadence(Enum):
+1008"""Representation of the supported cadences on GAB."""
+1009
+1010DAY=1
+1011WEEK=2
+1012MONTH=3
+1013QUARTER=4
+1014YEAR=5
+1015
+1016@classmethod
+1017defget_ordered_cadences(cls)->dict:
+1018"""Get the cadences ordered by the value.
+1019
+1020 Returns:
+1021 dict containing ordered cadences as `{name:value}`.
+1022 """
+1023cadences=list(GABCadence)
+1024return{
+1025cadence.name:cadence.value
+1026forcadenceinsorted(cadences,key=lambdagab_cadence:gab_cadence.value)
+1027}
+1028
+1029@classmethod
+1030defget_cadences(cls)->set[str]:
+1031"""Get the cadences values as set.
+1032
+1033 Returns:
+1034 set containing all possible cadence values as `{value}`.
+1035 """
+1036return{cadence.nameforcadenceinlist(GABCadence)}
+1037
+1038@classmethod
+1039deforder_cadences(cls,cadences_to_order:list[str])->list[str]:
+1040"""Order a list of cadences by value.
+1041
+1042 Returns:
+1043 ordered set containing the received cadences.
+1044 """
+1045returnsorted(
+1046cadences_to_order,
+1047key=lambdaitem:cls.get_ordered_cadences().get(item),# type: ignore
+1048)
+1049
+1050
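For reference, the cadence helpers above behave as follows (import path assumed):

from lakehouse_engine.core.definitions import GABCadence  # assumed import path

GABCadence.get_ordered_cadences()                    # {"DAY": 1, "WEEK": 2, "MONTH": 3, "QUARTER": 4, "YEAR": 5}
GABCadence.get_cadences()                            # {"DAY", "WEEK", "MONTH", "QUARTER", "YEAR"}
GABCadence.order_cadences(["YEAR", "DAY", "MONTH"])  # ["DAY", "MONTH", "YEAR"]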
+1051classGABKeys:
+1052"""Constants used to update pre-configured gab dict key."""
+1053
+1054JOIN_SELECT="join_select"
+1055PROJECT_START="project_start"
+1056PROJECT_END="project_end"
+1057
+1058
+1059classGABReplaceableKeys:
+1060"""Constants used to replace pre-configured gab dict values."""
+1061
+1062CADENCE="${cad}"
+1063DATE_COLUMN="${date_column}"
+1064CONFIG_WEEK_START="${config_week_start}"
+1065RECONCILIATION_CADENCE="${rec_cadence}"
+1066
+1067
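The pre-configured values below use ${...} placeholders; a sketch of resolving them with plain string replacement (the cadence and column names are illustrative, and the import path is an assumption):

from lakehouse_engine.core.definitions import GABReplaceableKeys  # assumed import path

template = "date(date_trunc('${cad}',${date_column}))"
resolved = template.replace(GABReplaceableKeys.CADENCE, "MONTH").replace(
    GABReplaceableKeys.DATE_COLUMN, "order_date"
)
# resolved == "date(date_trunc('MONTH',order_date))"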
+1068classGABCombinedConfiguration(Enum):
+1069"""GAB combined configuration.
+1070
+1071 Based on the use case configuration, return the values to override in the SQL file.
+1072 This enum aims to exhaustively map each combination of `cadence`, `reconciliation`,
+1073 `week_start` and `snap_flag`, returning the corresponding `join_select`,
+1074 `project_start` and `project_end` values to replace in the stages SQL file.
+1075
+1076 Return corresponding configuration (join_select, project_start, project_end) for
+1077 each combination (cadence x recon x week_start x snap_flag).
+1078 """
+1079
+1080_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE=(
+1081"date(date_trunc('${cad}',${date_column}))"
+1082)
+1083_DEFAULT_PROJECT_START="df_cal.cadence_start_date"
+1084_DEFAULT_PROJECT_END="df_cal.cadence_end_date"
+1085
+1086COMBINED_CONFIGURATION={
+1087# Combination of:
+1088# - cadence: `DAY`
+1089# - reconciliation_window: `DAY`, `WEEK`, `MONTH`, `QUARTER`, `YEAR`
+1090# - week_start: `S`, `M`
+1091# - snapshot_flag: `Y`, `N`
+10921:{
+1093"cadence":GABCadence.DAY.name,
+1094"recon":GABCadence.get_cadences(),
+1095"week_start":GABStartOfWeek.get_values(),
+1096"snap_flag":{"Y","N"},
+1097"join_select":"",
+1098"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1099"project_end":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1100},
+1101# Combination of:
+1102# - cadence: `WEEK`
+1103# - reconciliation_window: `DAY`
+1104# - week_start: `S`, `M`
+1105# - snapshot_flag: `Y`
+11062:{
+1107"cadence":GABCadence.WEEK.name,
+1108"recon":GABCadence.DAY.name,
+1109"week_start":GABStartOfWeek.get_values(),
+1110"snap_flag":"Y",
+1111"join_select":"""
+1112 select distinct case
+1113 when '${config_week_start}' = 'Monday' then weekstart_mon
+1114 when '${config_week_start}' = 'Sunday' then weekstart_sun
+1115 end as cadence_start_date,
+1116 calendar_date as cadence_end_date
+1117 """,
+1118"project_start":_DEFAULT_PROJECT_START,
+1119"project_end":_DEFAULT_PROJECT_END,
+1120},
+1121# Combination of:
+1122# - cadence: `WEEK`
+1123# - reconciliation_window: `DAY`, `MONTH`, `QUARTER`, `YEAR`
+1124# - week_start: `M`
+1125# - snapshot_flag: `Y`, `N`
+11263:{
+1127"cadence":GABCadence.WEEK.name,
+1128"recon":{
+1129GABCadence.DAY.name,
+1130GABCadence.MONTH.name,
+1131GABCadence.QUARTER.name,
+1132GABCadence.YEAR.name,
+1133},
+1134"week_start":"M",
+1135"snap_flag":{"Y","N"},
+1136"join_select":"""
+1137 select distinct case
+1138 when '${config_week_start}' = 'Monday' then weekstart_mon
+1139 when '${config_week_start}' = 'Sunday' then weekstart_sun
+1140 end as cadence_start_date,
+1141 case
+1142 when '${config_week_start}' = 'Monday' then weekend_mon
+1143 when '${config_week_start}' = 'Sunday' then weekend_sun
+1144 end as cadence_end_date""",
+1145"project_start":_DEFAULT_PROJECT_START,
+1146"project_end":_DEFAULT_PROJECT_END,
+1147},
+11484:{
+1149"cadence":GABCadence.MONTH.name,
+1150"recon":GABCadence.DAY.name,
+1151"week_start":GABStartOfWeek.get_values(),
+1152"snap_flag":"Y",
+1153"join_select":"""
+1154 select distinct month_start as cadence_start_date,
+1155 calendar_date as cadence_end_date
+1156 """,
+1157"project_start":_DEFAULT_PROJECT_START,
+1158"project_end":_DEFAULT_PROJECT_END,
+1159},
+11605:{
+1161"cadence":GABCadence.MONTH.name,
+1162"recon":GABCadence.WEEK.name,
+1163"week_start":GABStartOfWeek.MONDAY.value,
+1164"snap_flag":"Y",
+1165"join_select":"""
+1166 select distinct month_start as cadence_start_date,
+1167 case
+1168 when date(
+1169 date_trunc('MONTH',add_months(calendar_date, 1))
+1170 )-1 < weekend_mon
+1171 then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
+1172 else weekend_mon
+1173 end as cadence_end_date""",
+1174"project_start":_DEFAULT_PROJECT_START,
+1175"project_end":_DEFAULT_PROJECT_END,
+1176},
+11776:{
+1178"cadence":GABCadence.MONTH.name,
+1179"recon":GABCadence.WEEK.name,
+1180"week_start":GABStartOfWeek.SUNDAY.value,
+1181"snap_flag":"Y",
+1182"join_select":"""
+1183 select distinct month_start as cadence_start_date,
+1184 case
+1185 when date(
+1186 date_trunc('MONTH',add_months(calendar_date, 1))
+1187 )-1 < weekend_sun
+1188 then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
+1189 else weekend_sun
+1190 end as cadence_end_date""",
+1191"project_start":_DEFAULT_PROJECT_START,
+1192"project_end":_DEFAULT_PROJECT_END,
+1193},
+11947:{
+1195"cadence":GABCadence.MONTH.name,
+1196"recon":GABCadence.get_cadences(),
+1197"week_start":GABStartOfWeek.get_values(),
+1198"snap_flag":{"Y","N"},
+1199"join_select":"",
+1200"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1201"project_end":"date(date_trunc('MONTH',add_months(${date_column}, 1)))-1",
+1202},
+12038:{
+1204"cadence":GABCadence.QUARTER.name,
+1205"recon":GABCadence.DAY.name,
+1206"week_start":GABStartOfWeek.get_values(),
+1207"snap_flag":"Y",
+1208"join_select":"""
+1209 select distinct quarter_start as cadence_start_date,
+1210 calendar_date as cadence_end_date
+1211 """,
+1212"project_start":_DEFAULT_PROJECT_START,
+1213"project_end":_DEFAULT_PROJECT_END,
+1214},
+12159:{
+1216"cadence":GABCadence.QUARTER.name,
+1217"recon":GABCadence.WEEK.name,
+1218"week_start":GABStartOfWeek.MONDAY.value,
+1219"snap_flag":"Y",
+1220"join_select":"""
+1221 select distinct quarter_start as cadence_start_date,
+1222 case
+1223 when weekend_mon > date(
+1224 date_trunc('QUARTER',add_months(calendar_date, 3))
+1225 )-1
+1226 then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
+1227 else weekend_mon
+1228 end as cadence_end_date""",
+1229"project_start":_DEFAULT_PROJECT_START,
+1230"project_end":_DEFAULT_PROJECT_END,
+1231},
+123210:{
+1233"cadence":GABCadence.QUARTER.name,
+1234"recon":GABCadence.WEEK.name,
+1235"week_start":GABStartOfWeek.SUNDAY.value,
+1236"snap_flag":"Y",
+1237"join_select":"""
+1238 select distinct quarter_start as cadence_start_date,
+1239 case
+1240 when weekend_sun > date(
+1241 date_trunc('QUARTER',add_months(calendar_date, 3))
+1242 )-1
+1243 then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
+1244 else weekend_sun
+1245 end as cadence_end_date""",
+1246"project_start":_DEFAULT_PROJECT_START,
+1247"project_end":_DEFAULT_PROJECT_END,
+1248},
+124911:{
+1250"cadence":GABCadence.QUARTER.name,
+1251"recon":GABCadence.MONTH.name,
+1252"week_start":GABStartOfWeek.get_values(),
+1253"snap_flag":"Y",
+1254"join_select":"""
+1255 select distinct quarter_start as cadence_start_date,
+1256 month_end as cadence_end_date
+1257 """,
+1258"project_start":_DEFAULT_PROJECT_START,
+1259"project_end":_DEFAULT_PROJECT_END,
+1260},
+126112:{
+1262"cadence":GABCadence.QUARTER.name,
+1263"recon":GABCadence.YEAR.name,
+1264"week_start":GABStartOfWeek.get_values(),
+1265"snap_flag":"N",
+1266"join_select":"",
+1267"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1268"project_end":"""
+1269 date(
+1270 date_trunc(
+1271 '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)
+1272 )
+1273 )-1
+1274 """,
+1275},
+127613:{
+1277"cadence":GABCadence.QUARTER.name,
+1278"recon":GABCadence.get_cadences(),
+1279"week_start":GABStartOfWeek.get_values(),
+1280"snap_flag":"N",
+1281"join_select":"",
+1282"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1283"project_end":"""
+1284 date(
+1285 date_trunc(
+1286 '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)
+1287 )
+1288 )-1
+1289 """,
+1290},
+129114:{
+1292"cadence":GABCadence.YEAR.name,
+1293"recon":GABCadence.WEEK.name,
+1294"week_start":GABStartOfWeek.MONDAY.value,
+1295"snap_flag":"Y",
+1296"join_select":"""
+1297 select distinct year_start as cadence_start_date,
+1298 case
+1299 when weekend_mon > date(
+1300 date_trunc('YEAR',add_months(calendar_date, 12))
+1301 )-1
+1302 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
+1303 else weekend_mon
+1304 end as cadence_end_date""",
+1305"project_start":_DEFAULT_PROJECT_START,
+1306"project_end":_DEFAULT_PROJECT_END,
+1307},
+130815:{
+1309"cadence":GABCadence.YEAR.name,
+1310"recon":GABCadence.WEEK.name,
+1311"week_start":GABStartOfWeek.SUNDAY.value,
+1312"snap_flag":"Y",
+1313"join_select":"""
+1314 select distinct year_start as cadence_start_date,
+1315 case
+1316 when weekend_sun > date(
+1317 date_trunc('YEAR',add_months(calendar_date, 12))
+1318 )-1
+1319 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
+1320 else weekend_sun
+1321 end as cadence_end_date""",
+1322"project_start":_DEFAULT_PROJECT_START,
+1323"project_end":_DEFAULT_PROJECT_END,
+1324},
+132516:{
+1326"cadence":GABCadence.YEAR.name,
+1327"recon":GABCadence.get_cadences(),
+1328"week_start":GABStartOfWeek.get_values(),
+1329"snap_flag":"N",
+1330"inverse_flag":"Y",
+1331"join_select":"",
+1332"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1333"project_end":"""
+1334 date(
+1335 date_trunc(
+1336 '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)
+1337 )
+1338 )-1
+1339 """,
+1340},
+134117:{
+1342"cadence":GABCadence.YEAR.name,
+1343"recon":{
+1344GABCadence.DAY.name,
+1345GABCadence.MONTH.name,
+1346GABCadence.QUARTER.name,
+1347},
+1348"week_start":GABStartOfWeek.get_values(),
+1349"snap_flag":"Y",
+1350"join_select":"""
+1351 select distinct year_start as cadence_start_date,
+1352 case
+1353 when '${rec_cadence}' = 'DAY' then calendar_date
+1354 when '${rec_cadence}' = 'MONTH' then month_end
+1355 when '${rec_cadence}' = 'QUARTER' then quarter_end
+1356 end as cadence_end_date
+1357 """,
+1358"project_start":_DEFAULT_PROJECT_START,
+1359"project_end":_DEFAULT_PROJECT_END,
+1360},
+136118:{
+1362"cadence":GABCadence.get_cadences(),
+1363"recon":GABCadence.get_cadences(),
+1364"week_start":GABStartOfWeek.get_values(),
+1365"snap_flag":{"Y","N"},
+1366"join_select":"""
+1367 select distinct
+1368 case
+1369 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
+1370 then weekstart_mon
+1371 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
+1372 then weekstart_sun
+1373 else
+1374 date(date_trunc('${cad}',calendar_date))
+1375 end as cadence_start_date,
+1376 case
+1377 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
+1378 then weekend_mon
+1379 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
+1380 then weekend_sun
+1381 when '${cad}' = 'DAY'
+1382 then date(date_trunc('${cad}',calendar_date))
+1383 when '${cad}' = 'MONTH'
+1384 then date(
+1385 date_trunc(
+1386 'MONTH',
+1387 add_months(date(date_trunc('${cad}',calendar_date)), 1)
+1388 )
+1389 )-1
+1390 when '${cad}' = 'QUARTER'
+1391 then date(
+1392 date_trunc(
+1393 'QUARTER',
+1394 add_months(date(date_trunc('${cad}',calendar_date)) , 3)
+1395 )
+1396 )-1
+1397 when '${cad}' = 'YEAR'
+1398 then date(
+1399 date_trunc(
+1400 'YEAR',
+1401 add_months(date(date_trunc('${cad}',calendar_date)), 12)
+1402 )
+1403 )-1
+1404 end as cadence_end_date
+1405 """,
+1406"project_start":_DEFAULT_PROJECT_START,
+1407"project_end":_DEFAULT_PROJECT_END,
+1408},
+1409}
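Each entry above stores either a single value or a set of accepted values per dimension; a sketch of how a combination could be resolved against this mapping (an illustration only, not necessarily how the engine performs the lookup, and the import path is an assumption):

from lakehouse_engine.core.definitions import GABCombinedConfiguration  # assumed import path

def _matches(expected, value):
    """An entry dimension is either a single value or a set of accepted values."""
    return value in expected if isinstance(expected, set) else value == expected

def find_combination(cadence, recon, week_start, snap_flag):
    """Return the first pre-configured entry matching the given combination."""
    for key, combo in GABCombinedConfiguration.COMBINED_CONFIGURATION.value.items():
        if (
            _matches(combo["cadence"], cadence)
            and _matches(combo["recon"], recon)
            and _matches(combo["week_start"], week_start)
            and _matches(combo["snap_flag"], snap_flag)
        ):
            return key, combo
    return None

# find_combination("WEEK", "DAY", "M", "Y") resolves to entry 2 above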
@@ -2669,30 +2674,33 @@
Inherited Members
 37classEngineConfig(object):
 38"""Definitions that can come from the Engine Config file.
 39
-40 - dq_bucket: S3 bucket used to store data quality related artifacts.
-41 - notif_disallowed_email_servers: email servers not allowed to be used
-42 for sending notifications.
-43 - engine_usage_path: path where the engine prod usage stats are stored.
-44 - engine_dev_usage_path: path where the engine dev usage stats are stored.
-45 - collect_engine_usage: whether to enable the collection of lakehouse
-46 engine usage stats or not.
-47 - dq_functions_column_list: list of columns to be added to the meta argument
-48 of GX when using PRISMA.
-49 """
-50
-51dq_bucket:Optional[str]=None
-52notif_disallowed_email_servers:Optional[list]=None
-53engine_usage_path:Optional[str]=None
-54engine_dev_usage_path:Optional[str]=None
-55collect_engine_usage:str=CollectEngineUsage.ENABLED.value
-56dq_functions_column_list:Optional[list]=None
+40 - dq_bucket: S3 prod bucket used to store data quality related artifacts.
+41 - dq_dev_bucket: S3 dev bucket used to store data quality related artifacts.
+42 - notif_disallowed_email_servers: email servers not allowed to be used
+43 for sending notifications.
+44 - engine_usage_path: path where the engine prod usage stats are stored.
+45 - engine_dev_usage_path: path where the engine dev usage stats are stored.
+46 - collect_engine_usage: whether to enable the collection of lakehouse
+47 engine usage stats or not.
+48 - dq_functions_column_list: list of columns to be added to the meta argument
+49 of GX when using PRISMA.
+50 """
+51
+52dq_bucket:Optional[str]=None
+53dq_dev_bucket:Optional[str]=None
+54notif_disallowed_email_servers:Optional[list]=None
+55engine_usage_path:Optional[str]=None
+56engine_dev_usage_path:Optional[str]=None
+57collect_engine_usage:str=CollectEngineUsage.ENABLED.value
+58dq_functions_column_list:Optional[list]=None
Definitions that can come from the Engine Config file.
-dq_bucket: S3 bucket used to store data quality related artifacts.
+dq_bucket: S3 prod bucket used to store data quality related artifacts.
+dq_dev_bucket: S3 dev bucket used to store data quality related artifacts.
notif_disallowed_email_servers: email servers not allowed to be used
for sending notifications.
engine_usage_path: path where the engine prod usage stats are stored.
- 59classEngineStats(Enum):
-60"""Definitions for collection of Lakehouse Engine Stats.
-61
-62 .. note::
-63 Note: whenever the value comes from a key inside a Spark Config
-64 that returns an array, it can be specified with a '#' so that it
-65 is adequately processed.
-66 """
-67
-68CLUSTER_USAGE_TAGS="spark.databricks.clusterUsageTags"
-69DEF_SPARK_CONFS={
-70"dp_name":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#accountName",
-71"environment":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#environment",
-72"workspace_id":f"{CLUSTER_USAGE_TAGS}.orgId",
-73"job_id":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#JobId",
-74"job_name":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#RunName",
-75"run_id":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#ClusterName",
-76}
+ 61classEngineStats(Enum):
+62"""Definitions for collection of Lakehouse Engine Stats.
+63
+64 .. note::
+65 Note: whenever the value comes from a key inside a Spark Config
+66 that returns an array, it can be specified with a '#' so that it
+67 is adequately processed.
+68 """
+69
+70CLUSTER_USAGE_TAGS="spark.databricks.clusterUsageTags"
+71DEF_SPARK_CONFS={
+72"dp_name":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#accountName",
+73"environment":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#environment",
+74"workspace_id":f"{CLUSTER_USAGE_TAGS}.orgId",
+75"job_id":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#JobId",
+76"job_name":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#RunName",
+77"run_id":f"{CLUSTER_USAGE_TAGS}.clusterAllTags#ClusterName",
+78}
@@ -2881,43 +2901,43 @@
Inherited Members
- 79classInputFormat(Enum):
- 80"""Formats of algorithm input."""
- 81
- 82JDBC="jdbc"
- 83AVRO="avro"
- 84JSON="json"
- 85CSV="csv"
- 86PARQUET="parquet"
- 87DELTAFILES="delta"
- 88CLOUDFILES="cloudfiles"
- 89KAFKA="kafka"
- 90SQL="sql"
- 91SAP_BW="sap_bw"
- 92SAP_B4="sap_b4"
- 93DATAFRAME="dataframe"
- 94SFTP="sftp"
- 95
- 96@classmethod
- 97defvalues(cls):# type: ignore
- 98"""Generates a list containing all enum values.
- 99
-100 Return:
-101 A list with all enum values.
-102 """
-103return(c.valueforcincls)
-104
-105@classmethod
-106defexists(cls,input_format:str)->bool:
-107"""Checks if the input format exists in the enum values.
-108
-109 Args:
-110 input_format: format to check if exists.
-111
-112 Return:
-113 If the input format exists in our enum.
-114 """
-115returninput_formatincls.values()
+ 81classInputFormat(Enum):
+ 82"""Formats of algorithm input."""
+ 83
+ 84JDBC="jdbc"
+ 85AVRO="avro"
+ 86JSON="json"
+ 87CSV="csv"
+ 88PARQUET="parquet"
+ 89DELTAFILES="delta"
+ 90CLOUDFILES="cloudfiles"
+ 91KAFKA="kafka"
+ 92SQL="sql"
+ 93SAP_BW="sap_bw"
+ 94SAP_B4="sap_b4"
+ 95DATAFRAME="dataframe"
+ 96SFTP="sftp"
+ 97
+ 98@classmethod
+ 99defvalues(cls):# type: ignore
+100"""Generates a list containing all enum values.
+101
+102 Return:
+103 A list with all enum values.
+104 """
+105return(c.valueforcincls)
+106
+107@classmethod
+108defexists(cls,input_format:str)->bool:
+109"""Checks if the input format exists in the enum values.
+110
+111 Args:
+112 input_format: format to check if exists.
+113
+114 Return:
+115 If the input format exists in our enum.
+116 """
+117returninput_formatincls.values()
@@ -3093,14 +3113,14 @@
Inherited Members
- 96@classmethod
- 97defvalues(cls):# type: ignore
- 98"""Generates a list containing all enum values.
- 99
-100 Return:
-101 A list with all enum values.
-102 """
-103return(c.valueforcincls)
+ 98@classmethod
+ 99defvalues(cls):# type: ignore
+100"""Generates a list containing all enum values.
+101
+102 Return:
+103 A list with all enum values.
+104 """
+105return(c.valueforcincls)
@@ -3127,17 +3147,17 @@
Return:
-105@classmethod
-106defexists(cls,input_format:str)->bool:
-107"""Checks if the input format exists in the enum values.
-108
-109 Args:
-110 input_format: format to check if exists.
-111
-112 Return:
-113 If the input format exists in our enum.
-114 """
-115returninput_formatincls.values()
+107@classmethod
+108defexists(cls,input_format:str)->bool:
+109"""Checks if the input format exists in the enum values.
+110
+111 Args:
+112 input_format: format to check if exists.
+113
+114 Return:
+115 If the input format exists in our enum.
+116 """
+117returninput_formatincls.values()
@@ -3192,43 +3212,43 @@
Inherited Members
-129classOutputFormat(Enum):
-130"""Formats of algorithm output."""
-131
-132JDBC="jdbc"
-133AVRO="avro"
-134JSON="json"
-135CSV="csv"
-136PARQUET="parquet"
-137DELTAFILES="delta"
-138KAFKA="kafka"
-139CONSOLE="console"
-140NOOP="noop"
-141DATAFRAME="dataframe"
-142REST_API="rest_api"
-143FILE="file"# Internal use only
-144TABLE="table"# Internal use only
-145
-146@classmethod
-147defvalues(cls):# type: ignore
-148"""Generates a list containing all enum values.
-149
-150 Return:
-151 A list with all enum values.
-152 """
-153return(c.valueforcincls)
-154
-155@classmethod
-156defexists(cls,output_format:str)->bool:
-157"""Checks if the output format exists in the enum values.
-158
-159 Args:
-160 output_format: format to check if exists.
-161
-162 Return:
-163 If the output format exists in our enum.
-164 """
-165returnoutput_formatincls.values()
+131classOutputFormat(Enum):
+132"""Formats of algorithm output."""
+133
+134JDBC="jdbc"
+135AVRO="avro"
+136JSON="json"
+137CSV="csv"
+138PARQUET="parquet"
+139DELTAFILES="delta"
+140KAFKA="kafka"
+141CONSOLE="console"
+142NOOP="noop"
+143DATAFRAME="dataframe"
+144REST_API="rest_api"
+145FILE="file"# Internal use only
+146TABLE="table"# Internal use only
+147
+148@classmethod
+149defvalues(cls):# type: ignore
+150"""Generates a list containing all enum values.
+151
+152 Return:
+153 A list with all enum values.
+154 """
+155return(c.valueforcincls)
+156
+157@classmethod
+158defexists(cls,output_format:str)->bool:
+159"""Checks if the output format exists in the enum values.
+160
+161 Args:
+162 output_format: format to check if exists.
+163
+164 Return:
+165 If the output format exists in our enum.
+166 """
+167returnoutput_formatincls.values()
@@ -3404,14 +3424,14 @@
Inherited Members
-146@classmethod
-147defvalues(cls):# type: ignore
-148"""Generates a list containing all enum values.
-149
-150 Return:
-151 A list with all enum values.
-152 """
-153return(c.valueforcincls)
+148@classmethod
+149defvalues(cls):# type: ignore
+150"""Generates a list containing all enum values.
+151
+152 Return:
+153 A list with all enum values.
+154 """
+155return(c.valueforcincls)
@@ -3438,17 +3458,17 @@
Return:
-155@classmethod
-156defexists(cls,output_format:str)->bool:
-157"""Checks if the output format exists in the enum values.
-158
-159 Args:
-160 output_format: format to check if exists.
-161
-162 Return:
-163 If the output format exists in our enum.
-164 """
-165returnoutput_formatincls.values()
+157@classmethod
+158defexists(cls,output_format:str)->bool:
+159"""Checks if the output format exists in the enum values.
+160
+161 Args:
+162 output_format: format to check if exists.
+163
+164 Return:
+165 If the output format exists in our enum.
+166 """
+167returnoutput_formatincls.values()
@@ -3503,10 +3523,10 @@
Inherited Members
-178classNotifierType(Enum):
-179"""Type of notifier available."""
-180
-181EMAIL="email"
+180classNotifierType(Enum):
+181"""Type of notifier available."""
+182
+183EMAIL="email"
@@ -3548,11 +3568,11 @@
Inherited Members
-184classNotificationRuntimeParameters(Enum):
-185"""Parameters to be replaced in runtime."""
-186
-187DATABRICKS_JOB_NAME="databricks_job_name"
-188DATABRICKS_WORKSPACE_ID="databricks_workspace_id"
+186classNotificationRuntimeParameters(Enum):
+187"""Parameters to be replaced in runtime."""
+188
+189DATABRICKS_JOB_NAME="databricks_job_name"
+190DATABRICKS_WORKSPACE_ID="databricks_workspace_id"
@@ -3618,15 +3638,15 @@
Inherited Members
-197classReadType(Enum):
-198"""Define the types of read operations.
-199
-200 - BATCH - read the data in batch mode (e.g., Spark batch).
-201 - STREAMING - read the data in streaming mode (e.g., Spark streaming).
-202 """
-203
-204BATCH="batch"
-205STREAMING="streaming"
+199classReadType(Enum):
+200"""Define the types of read operations.
+201
+202 - BATCH - read the data in batch mode (e.g., Spark batch).
+203 - STREAMING - read the data in streaming mode (e.g., Spark streaming).
+204 """
+205
+206BATCH="batch"
+207STREAMING="streaming"
@@ -3685,15 +3705,15 @@
Inherited Members
-208classReadMode(Enum):
-209"""Different modes that control how we handle compliance to the provided schema.
-210
-211 These read modes map to Spark's read modes at the moment.
-212 """
-213
-214PERMISSIVE="PERMISSIVE"
-215FAILFAST="FAILFAST"
-216DROPMALFORMED="DROPMALFORMED"
+210classReadMode(Enum):
+211"""Different modes that control how we handle compliance to the provided schema.
+212
+213 These read modes map to Spark's read modes at the moment.
+214 """
+215
+216PERMISSIVE="PERMISSIVE"
+217FAILFAST="FAILFAST"
+218DROPMALFORMED="DROPMALFORMED"
-285@dataclass
-286classInputSpec(object):
-287"""Specification of an algorithm input.
-288
-289 This is very aligned with the way the execution environment connects to the sources
-290 (e.g., spark sources).
-291
-292 - spec_id: spec_id of the input specification read_type: ReadType type of read
-293 operation.
-294 - data_format: format of the input.
-295 - sftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp
-296 directory.
-297 - df_name: dataframe name.
-298 - db_table: table name in the form of `<db>.<table>`.
-299 - location: uri that identifies from where to read data in the specified format.
-300 - enforce_schema_from_table: if we want to enforce the table schema or not, by
-301 providing a table name in the form of `<db>.<table>`.
-302 - query: sql query to execute and return the dataframe. Use it if you do not want to
-303 read from a file system nor from a table, but rather from a sql query instead.
-304 - schema: dict representation of a schema of the input (e.g., Spark struct type
-305 schema).
-306 - schema_path: path to a file with a representation of a schema of the input (e.g.,
-307 Spark struct type schema).
-308 - disable_dbfs_retry: optional flag to disable file storage dbfs.
-309 - with_filepath: if we want to include the path of the file that is being read. Only
-310 works with the file reader (batch and streaming modes are supported).
-311 - options: dict with other relevant options according to the execution
-312 environment (e.g., spark) possible sources.
-313 - calculate_upper_bound: when to calculate upper bound to extract from SAP BW
-314 or not.
-315 - calc_upper_bound_schema: specific schema for the calculated upper_bound.
-316 - generate_predicates: when to generate predicates to extract from SAP BW or not.
-317 - predicates_add_null: if we want to include is null on partition by predicates.
-318 - temp_view: optional name of a view to point to the input dataframe to be used
-319 to create or replace a temp view on top of the dataframe.
-320 """
-321
-322spec_id:str
-323read_type:str
-324data_format:Optional[str]=None
-325sftp_files_format:Optional[str]=None
-326df_name:Optional[DataFrame]=None
-327db_table:Optional[str]=None
-328location:Optional[str]=None
-329query:Optional[str]=None
-330enforce_schema_from_table:Optional[str]=None
-331schema:Optional[dict]=None
-332schema_path:Optional[str]=None
-333disable_dbfs_retry:bool=False
-334with_filepath:bool=False
-335options:Optional[dict]=None
-336jdbc_args:Optional[dict]=None
-337calculate_upper_bound:bool=False
-338calc_upper_bound_schema:Optional[str]=None
-339generate_predicates:bool=False
-340predicates_add_null:bool=True
-341temp_view:Optional[str]=None
+287@dataclass
+288classInputSpec(object):
+289"""Specification of an algorithm input.
+290
+291 This is very aligned with the way the execution environment connects to the sources
+292 (e.g., spark sources).
+293
+294 - spec_id: spec_id of the input specification read_type: ReadType type of read
+295 operation.
+296 - data_format: format of the input.
+297 - sftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp
+298 directory.
+299 - df_name: dataframe name.
+300 - db_table: table name in the form of `<db>.<table>`.
+301 - location: uri that identifies from where to read data in the specified format.
+302 - enforce_schema_from_table: if we want to enforce the table schema or not, by
+303 providing a table name in the form of `<db>.<table>`.
+304 - query: sql query to execute and return the dataframe. Use it if you do not want to
+305 read from a file system nor from a table, but rather from a sql query instead.
+306 - schema: dict representation of a schema of the input (e.g., Spark struct type
+307 schema).
+308 - schema_path: path to a file with a representation of a schema of the input (e.g.,
+309 Spark struct type schema).
+310 - disable_dbfs_retry: optional flag to disable file storage dbfs.
+311 - with_filepath: if we want to include the path of the file that is being read. Only
+312 works with the file reader (batch and streaming modes are supported).
+313 - options: dict with other relevant options according to the execution
+314 environment (e.g., spark) possible sources.
+315 - calculate_upper_bound: when to calculate upper bound to extract from SAP BW
+316 or not.
+317 - calc_upper_bound_schema: specific schema for the calculated upper_bound.
+318 - generate_predicates: when to generate predicates to extract from SAP BW or not.
+319 - predicates_add_null: if we want to include is null on partition by predicates.
+320 - temp_view: optional name of a view to point to the input dataframe to be used
+321 to create or replace a temp view on top of the dataframe.
+322 """
+323
+324spec_id:str
+325read_type:str
+326data_format:Optional[str]=None
+327sftp_files_format:Optional[str]=None
+328df_name:Optional[DataFrame]=None
+329db_table:Optional[str]=None
+330location:Optional[str]=None
+331query:Optional[str]=None
+332enforce_schema_from_table:Optional[str]=None
+333schema:Optional[dict]=None
+334schema_path:Optional[str]=None
+335disable_dbfs_retry:bool=False
+336with_filepath:bool=False
+337options:Optional[dict]=None
+338jdbc_args:Optional[dict]=None
+339calculate_upper_bound:bool=False
+340calc_upper_bound_schema:Optional[str]=None
+341generate_predicates:bool=False
+342predicates_add_null:bool=True
+343temp_view:Optional[str]=None
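For illustration, a minimal sketch of how such an input specification could be built. The import path (lakehouse_engine.core.definitions) and the "batch"/"csv" values for read_type and data_format are assumptions based on the fields documented above, not confirmed by this page.

    # Hypothetical usage sketch: reading a CSV folder in batch mode.
    from lakehouse_engine.core.definitions import InputSpec  # assumed import path

    orders_input = InputSpec(
        spec_id="orders_bronze",                 # id referenced by downstream specs
        read_type="batch",                       # assumed ReadType value
        data_format="csv",
        location="s3://my-bucket/landing/orders/",
        options={"header": "True", "delimiter": ";"},
    )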
@@ -4533,18 +4553,18 @@
Inherited Members
-
344@dataclass
-345classTransformerSpec(object):
-346"""Transformer Specification, i.e., a single transformation amongst many.
-347
-348 - function: name of the function (or callable function) to be executed.
-349 - args: (not applicable if using a callable function) dict with the arguments
-350 to pass to the function `<k,v>` pairs with the name of the parameter of
-351 the function and the respective value.
-352 """
-353
-354function:str
-355args:dict
+
346@dataclass
+347classTransformerSpec(object):
+348"""Transformer Specification, i.e., a single transformation amongst many.
+349
+350 - function: name of the function (or callable function) to be executed.
+351 - args: (not applicable if using a callable function) dict with the arguments
+352 to pass to the function `<k,v>` pairs with the name of the parameter of
+353 the function and the respective value.
+354 """
+355
+356function:str
+357args:dict
@@ -4606,26 +4626,26 @@
Inherited Members
-
358@dataclass
-359classTransformSpec(object):
-360"""Transformation Specification.
-361
-362 I.e., the specification that defines the many transformations to be done to the data
-363 that was read.
-364
-365 - spec_id: id of the transform specification.
-366 - input_id: id of the corresponding input
-367 specification.
-368 - transformers: list of transformers to execute.
-369 - force_streaming_foreach_batch_processing: sometimes, when using streaming, we want
-370 to force the transform to be executed in the foreachBatch function to ensure
-371 non-supported streaming operations can be properly executed.
-372 """
-373
-374spec_id:str
-375input_id:str
-376transformers:List[TransformerSpec]
-377force_streaming_foreach_batch_processing:bool=False
+
360@dataclass
+361classTransformSpec(object):
+362"""Transformation Specification.
+363
+364 I.e., the specification that defines the many transformations to be done to the data
+365 that was read.
+366
+367 - spec_id: id of the transform specification.
+368 - input_id: id of the corresponding input
+369 specification.
+370 - transformers: list of transformers to execute.
+371 - force_streaming_foreach_batch_processing: sometimes, when using streaming, we want
+372 to force the transform to be executed in the foreachBatch function to ensure
+373 non-supported streaming operations can be properly executed.
+374 """
+375
+376spec_id:str
+377input_id:str
+378transformers:List[TransformerSpec]
+379force_streaming_foreach_batch_processing:bool=False
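As a sketch of how TransformerSpec and TransformSpec fit together, consider the example below. The transformer function names and their args are placeholders only, not necessarily real engine transformers, and the import path is assumed.

    # Hypothetical sketch: one transform step chaining two transformers.
    from lakehouse_engine.core.definitions import TransformerSpec, TransformSpec  # assumed path

    orders_transform = TransformSpec(
        spec_id="orders_transformed",
        input_id="orders_bronze",                # points at the input spec sketched earlier
        transformers=[
            TransformerSpec(function="rename", args={"cols": {"qty": "quantity"}}),
            TransformerSpec(function="add_literal", args={"column": "source", "value": "erp"}),
        ],
    )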
@@ -4715,11 +4735,11 @@
Inherited Members
-
380classDQType(Enum):
-381"""Available data quality tasks."""
-382
-383VALIDATOR="validator"
-384PRISMA="prisma"
+
382classDQType(Enum):
+383"""Available data quality tasks."""
+384
+385VALIDATOR="validator"
+386PRISMA="prisma"
@@ -4773,11 +4793,11 @@
Inherited Members
-
387classDQExecutionPoint(Enum):
-388"""Available data quality execution points."""
-389
-390IN_MOTION="in_motion"
-391AT_REST="at_rest"
+
389classDQExecutionPoint(Enum):
+390"""Available data quality execution points."""
+391
+392IN_MOTION="in_motion"
+393AT_REST="at_rest"
@@ -4831,10 +4851,10 @@
Inherited Members
-
394classDQTableBaseParameters(Enum):
-395"""Base parameters for importing DQ rules from a table."""
-396
-397PRISMA_BASE_PARAMETERS=["arguments","dq_tech_function"]
+
396classDQTableBaseParameters(Enum):
+397"""Base parameters for importing DQ rules from a table."""
+398
+399PRISMA_BASE_PARAMETERS=["arguments","dq_tech_function"]
@@ -4877,17 +4897,17 @@
Inherited Members
-
400@dataclass
-401classDQFunctionSpec(object):
-402"""Defines a data quality function specification.
-403
-404 - function - name of the data quality function (expectation) to execute.
-405 It follows the great_expectations api https://greatexpectations.io/expectations/.
-406 - args - args of the function (expectation). Follow the same api as above.
-407 """
-408
-409function:str
-410args:Optional[dict]=None
+
402@dataclass
+403classDQFunctionSpec(object):
+404"""Defines a data quality function specification.
+405
+406 - function - name of the data quality function (expectation) to execute.
+407 It follows the great_expectations api https://greatexpectations.io/expectations/.
+408 - args - args of the function (expectation). Follow the same api as above.
+409 """
+410
+411function:str
+412args:Optional[dict]=None
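For example, a single expectation could be described as in the sketch below. The expectation name and args follow the great_expectations API referenced above; the column name is illustrative and the import path is assumed.

    # Sketch of one DQ function following the great_expectations expectation API.
    from lakehouse_engine.core.definitions import DQFunctionSpec  # assumed import path

    not_null_check = DQFunctionSpec(
        function="expect_column_values_to_not_be_null",
        args={"column": "order_id"},
    )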
@@ -4949,123 +4969,123 @@
Inherited Members
-
413@dataclass
-414classDQSpec(object):
-415"""Data quality overall specification.
-416
-417 - spec_id - id of the specification.
-418 - input_id - id of the input specification.
-419 - dq_type - type of DQ process to execute (e.g. validator).
-420 - dq_functions - list of function specifications to execute.
-421 - dq_db_table - name of table to derive the dq functions from.
-422 - dq_table_table_filter - name of the table which rules are to be applied in the
-423 validations (Only used when deriving dq functions).
-424 - dq_table_extra_filters - extra filters to be used when deriving dq functions.
-425 This is a sql expression to be applied to the dq_db_table.
-426 - execution_point - execution point of the dq functions. [at_rest, in_motion].
-427 This is set during the load_data or dq_validator functions.
-428 - unexpected_rows_pk - the list of columns composing the primary key of the
-429 source data to identify the rows failing the DQ validations. Note: only one
-430 of tbl_to_derive_pk or unexpected_rows_pk arguments need to be provided. It
-431 is mandatory to provide one of these arguments when using tag_source_data
-432 as True. When tag_source_data is False, this is not mandatory, but still
-433 recommended.
-434 - tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.
-435 Note: only one of tbl_to_derive_pk or unexpected_rows_pk arguments need to
-436 be provided. It is mandatory to provide one of these arguments when using
-437 tag_source_data as True. When tag_source_data is False, this is not
-438 mandatory, but still recommended.
-439 - gx_result_format - great expectations result format. Default: "COMPLETE".
-440 - tag_source_data - when set to true, this will ensure that the DQ process ends by
-441 tagging the source data with an additional column with information about the
-442 DQ results. This column makes it possible to identify if the DQ run was
-443 succeeded in general and, if not, it unlocks the insights to know what
-444 specific rows have made the DQ validations fail and why. Default: False.
-445 Note: it only works if result_sink_explode is True, gx_result_format is
-446 COMPLETE, fail_on_error is False (which is done automatically when
-447 you specify tag_source_data as True) and tbl_to_derive_pk or
-448 unexpected_rows_pk is configured.
-449 - store_backend - which store_backend to use (e.g. s3 or file_system).
-450 - local_fs_root_dir - path of the root directory. Note: only applicable for
-451 store_backend file_system.
-452 - data_docs_local_fs - the path for data docs only for store_backend
-453 file_system.
-454 - bucket - the bucket name to consider for the store_backend (store DQ artefacts).
-455 Note: only applicable for store_backend s3.
-456 - data_docs_bucket - the bucket name for data docs only. When defined, it will
-457 supersede bucket parameter. Note: only applicable for store_backend s3.
-458 - expectations_store_prefix - prefix where to store expectations' data. Note: only
-459 applicable for store_backend s3.
-460 - validations_store_prefix - prefix where to store validations' data. Note: only
+
415@dataclass
+416classDQSpec(object):
+417"""Data quality overall specification.
+418
+419 - spec_id - id of the specification.
+420 - input_id - id of the input specification.
+421 - dq_type - type of DQ process to execute (e.g. validator).
+422 - dq_functions - list of function specifications to execute.
+423 - dq_db_table - name of table to derive the dq functions from.
+424 - dq_table_table_filter - name of the table which rules are to be applied in the
+425 validations (Only used when deriving dq functions).
+426 - dq_table_extra_filters - extra filters to be used when deriving dq functions.
+427 This is a sql expression to be applied to the dq_db_table.
+428 - execution_point - execution point of the dq functions. [at_rest, in_motion].
+429 This is set during the load_data or dq_validator functions.
+430 - unexpected_rows_pk - the list of columns composing the primary key of the
+431 source data to identify the rows failing the DQ validations. Note: only one
+432 of tbl_to_derive_pk or unexpected_rows_pk arguments need to be provided. It
+433 is mandatory to provide one of these arguments when using tag_source_data
+434 as True. When tag_source_data is False, this is not mandatory, but still
+435 recommended.
+436 - tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.
+437 Note: only one of tbl_to_derive_pk or unexpected_rows_pk arguments need to
+438 be provided. It is mandatory to provide one of these arguments when using
+439 tag_source_data as True. When tag_source_data is False, this is not
+440 mandatory, but still recommended.
+441 - gx_result_format - great expectations result format. Default: "COMPLETE".
+442 - tag_source_data - when set to true, this will ensure that the DQ process ends by
+443 tagging the source data with an additional column with information about the
+444 DQ results. This column makes it possible to identify if the DQ run
+445 succeeded in general and, if not, to know which
+446 specific rows made the DQ validations fail and why. Default: False.
+447 Note: it only works if result_sink_explode is True, gx_result_format is
+448 COMPLETE, fail_on_error is False (which is done automatically when
+449 you specify tag_source_data as True) and tbl_to_derive_pk or
+450 unexpected_rows_pk is configured.
+451 - store_backend - which store_backend to use (e.g. s3 or file_system).
+452 - local_fs_root_dir - path of the root directory. Note: only applicable for
+453 store_backend file_system.
+454 - data_docs_local_fs - the path for data docs only for store_backend
+455 file_system.
+456 - bucket - the bucket name to consider for the store_backend (store DQ artefacts).
+457 Note: only applicable for store_backend s3.
+458 - data_docs_bucket - the bucket name for data docs only. When defined, it will
+459 supersede bucket parameter. Note: only applicable for store_backend s3.
+460 - expectations_store_prefix - prefix where to store expectations' data. Note: only
+461 applicable for store_backend s3.
-462 - data_docs_prefix - prefix where to store data_docs' data.
-463 - checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only
-464 applicable for store_backend s3.
-465 - data_asset_name - name of the data asset to consider when configuring the great
-466 expectations' data source.
-467 - expectation_suite_name - name to consider for great expectations' suite.
-468 - result_sink_db_table - db.table_name indicating the database and table in which
-469 to save the results of the DQ process.
-470 - result_sink_location - file system location in which to save the results of the
-471 DQ process.
-472 - data_product_name - name of the data product.
-473 - result_sink_partitions - the list of partitions to consider.
-474 - result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
-475 - result_sink_options - extra spark options for configuring the result sink.
-476 E.g: can be used to configure a Kafka sink if result_sink_format is kafka.
-477 - result_sink_explode - flag to determine if the output table/location should have
-478 the columns exploded (as True) or not (as False). Default: True.
-479 - result_sink_extra_columns - list of extra columns to be exploded (following
-480 the pattern "<name>.*") or columns to be selected. It is only used when
-481 result_sink_explode is set to True.
-482 - source - name of data source, to be easier to identify in analysis. If not
-483 specified, it is set as default <input_id>. This will be only used
-484 when result_sink_explode is set to True.
-485 - fail_on_error - whether to fail the algorithm if the validations of your data in
-486 the DQ process failed.
-487 - cache_df - whether to cache the dataframe before running the DQ process or not.
-488 - critical_functions - functions that should not fail. When this argument is
-489 defined, fail_on_error is nullified.
-490 - max_percentage_failure - percentage of failure that should be allowed.
-491 This argument has priority over both fail_on_error and critical_functions.
-492 """
-493
-494spec_id:str
-495input_id:str
-496dq_type:str
-497dq_functions:Optional[List[DQFunctionSpec]]=None
-498dq_db_table:Optional[str]=None
-499dq_table_table_filter:Optional[str]=None
-500dq_table_extra_filters:Optional[str]=None
-501execution_point:Optional[str]=None
-502unexpected_rows_pk:Optional[List[str]]=None
-503tbl_to_derive_pk:Optional[str]=None
-504gx_result_format:Optional[str]="COMPLETE"
-505tag_source_data:Optional[bool]=False
-506store_backend:str=DQDefaults.STORE_BACKEND.value
-507local_fs_root_dir:Optional[str]=None
-508data_docs_local_fs:Optional[str]=None
-509bucket:Optional[str]=None
-510data_docs_bucket:Optional[str]=None
-511expectations_store_prefix:str=DQDefaults.EXPECTATIONS_STORE_PREFIX.value
-512validations_store_prefix:str=DQDefaults.VALIDATIONS_STORE_PREFIX.value
-513data_docs_prefix:str=DQDefaults.DATA_DOCS_PREFIX.value
-514checkpoint_store_prefix:str=DQDefaults.CHECKPOINT_STORE_PREFIX.value
-515data_asset_name:Optional[str]=None
-516expectation_suite_name:Optional[str]=None
-517result_sink_db_table:Optional[str]=None
-518result_sink_location:Optional[str]=None
-519data_product_name:Optional[str]=None
-520result_sink_partitions:Optional[List[str]]=None
-521result_sink_format:str=OutputFormat.DELTAFILES.value
-522result_sink_options:Optional[dict]=None
-523result_sink_explode:bool=True
-524result_sink_extra_columns:Optional[List[str]]=None
-525source:Optional[str]=None
-526fail_on_error:bool=True
-527cache_df:bool=False
-528critical_functions:Optional[List[DQFunctionSpec]]=None
-529max_percentage_failure:Optional[float]=None
+462 - validations_store_prefix - prefix where to store validations' data. Note: only
+463 applicable for store_backend s3.
+464 - data_docs_prefix - prefix where to store data_docs' data.
+465 - checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only
+466 applicable for store_backend s3.
+467 - data_asset_name - name of the data asset to consider when configuring the great
+468 expectations' data source.
+469 - expectation_suite_name - name to consider for great expectations' suite.
+470 - result_sink_db_table - db.table_name indicating the database and table in which
+471 to save the results of the DQ process.
+472 - result_sink_location - file system location in which to save the results of the
+473 DQ process.
+474 - data_product_name - name of the data product.
+475 - result_sink_partitions - the list of partitions to consider.
+476 - result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
+477 - result_sink_options - extra spark options for configuring the result sink.
+478 E.g: can be used to configure a Kafka sink if result_sink_format is kafka.
+479 - result_sink_explode - flag to determine if the output table/location should have
+480 the columns exploded (as True) or not (as False). Default: True.
+481 - result_sink_extra_columns - list of extra columns to be exploded (following
+482 the pattern "<name>.*") or columns to be selected. It is only used when
+483 result_sink_explode is set to True.
+484 - source - name of data source, to be easier to identify in analysis. If not
+485 specified, it is set as default <input_id>. This will be only used
+486 when result_sink_explode is set to True.
+487 - fail_on_error - whether to fail the algorithm if the validations of your data in
+488 the DQ process failed.
+489 - cache_df - whether to cache the dataframe before running the DQ process or not.
+490 - critical_functions - functions that should not fail. When this argument is
+491 defined, fail_on_error is nullified.
+492 - max_percentage_failure - percentage of failure that should be allowed.
+493 This argument has priority over both fail_on_error and critical_functions.
+494 """
+495
+496spec_id:str
+497input_id:str
+498dq_type:str
+499dq_functions:Optional[List[DQFunctionSpec]]=None
+500dq_db_table:Optional[str]=None
+501dq_table_table_filter:Optional[str]=None
+502dq_table_extra_filters:Optional[str]=None
+503execution_point:Optional[str]=None
+504unexpected_rows_pk:Optional[List[str]]=None
+505tbl_to_derive_pk:Optional[str]=None
+506gx_result_format:Optional[str]="COMPLETE"
+507tag_source_data:Optional[bool]=False
+508store_backend:str=DQDefaults.STORE_BACKEND.value
+509local_fs_root_dir:Optional[str]=None
+510data_docs_local_fs:Optional[str]=None
+511bucket:Optional[str]=None
+512data_docs_bucket:Optional[str]=None
+513expectations_store_prefix:str=DQDefaults.EXPECTATIONS_STORE_PREFIX.value
+514validations_store_prefix:str=DQDefaults.VALIDATIONS_STORE_PREFIX.value
+515data_docs_prefix:str=DQDefaults.DATA_DOCS_PREFIX.value
+516checkpoint_store_prefix:str=DQDefaults.CHECKPOINT_STORE_PREFIX.value
+517data_asset_name:Optional[str]=None
+518expectation_suite_name:Optional[str]=None
+519result_sink_db_table:Optional[str]=None
+520result_sink_location:Optional[str]=None
+521data_product_name:Optional[str]=None
+522result_sink_partitions:Optional[List[str]]=None
+523result_sink_format:str=OutputFormat.DELTAFILES.value
+524result_sink_options:Optional[dict]=None
+525result_sink_explode:bool=True
+526result_sink_extra_columns:Optional[List[str]]=None
+527source:Optional[str]=None
+528fail_on_error:bool=True
+529cache_df:bool=False
+530critical_functions:Optional[List[DQFunctionSpec]]=None
+531max_percentage_failure:Optional[float]=None
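Putting the parameters above together, a minimal validator DQ specification might look like the sketch below. Table and column names are illustrative and the import path is an assumption.

    # Hypothetical sketch of a DQSpec that tags the source data with DQ results.
    from lakehouse_engine.core.definitions import DQFunctionSpec, DQSpec, DQType  # assumed path

    orders_dq = DQSpec(
        spec_id="orders_dq",
        input_id="orders_transformed",
        dq_type=DQType.VALIDATOR.value,
        dq_functions=[
            DQFunctionSpec("expect_column_values_to_not_be_null", {"column": "order_id"}),
        ],
        unexpected_rows_pk=["order_id"],          # needed because tag_source_data is True
        tag_source_data=True,
        result_sink_db_table="dq_db.orders_dq_results",
    )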
@@ -5605,33 +5625,33 @@
Inherited Members
-
532@dataclass
-533classMergeOptions(object):
-534"""Options for a merge operation.
-535
-536 - merge_predicate: predicate to apply to the merge operation so that we can
-537 check if a new record corresponds to a record already included in the
-538 historical data.
-539 - insert_only: indicates if the merge should only insert data (e.g., deduplicate
-540 scenarios).
-541 - delete_predicate: predicate to apply to the delete operation.
-542 - update_predicate: predicate to apply to the update operation.
-543 - insert_predicate: predicate to apply to the insert operation.
-544 - update_column_set: rules to apply to the update operation which allows to
-545 set the value for each column to be updated.
-546 (e.g. {"data": "new.data", "count": "current.count + 1"} )
-547 - insert_column_set: rules to apply to the insert operation which allows to
-548 set the value for each column to be inserted.
-549 (e.g. {"date": "updates.date", "count": "1"} )
-550 """
-551
-552merge_predicate:str
-553insert_only:bool=False
-554delete_predicate:Optional[str]=None
-555update_predicate:Optional[str]=None
-556insert_predicate:Optional[str]=None
-557update_column_set:Optional[dict]=None
-558insert_column_set:Optional[dict]=None
+
534@dataclass
+535classMergeOptions(object):
+536"""Options for a merge operation.
+537
+538 - merge_predicate: predicate to apply to the merge operation so that we can
+539 check if a new record corresponds to a record already included in the
+540 historical data.
+541 - insert_only: indicates if the merge should only insert data (e.g., deduplicate
+542 scenarios).
+543 - delete_predicate: predicate to apply to the delete operation.
+544 - update_predicate: predicate to apply to the update operation.
+545 - insert_predicate: predicate to apply to the insert operation.
+546 - update_column_set: rules to apply to the update operation which allows to
+547 set the value for each column to be updated.
+548 (e.g. {"data": "new.data", "count": "current.count + 1"} )
+549 - insert_column_set: rules to apply to the insert operation which allows to
+550 set the value for each column to be inserted.
+551 (e.g. {"date": "updates.date", "count": "1"} )
+552 """
+553
+554merge_predicate:str
+555insert_only:bool=False
+556delete_predicate:Optional[str]=None
+557update_predicate:Optional[str]=None
+558insert_predicate:Optional[str]=None
+559update_column_set:Optional[dict]=None
+560insert_column_set:Optional[dict]=None
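A sketch of merge options for an upsert, reusing the column-set examples from the docstring above. The 'current'/'new' aliases are assumptions about how the target and incoming datasets are referenced, and the import path is assumed.

    # Hypothetical sketch of an upsert merge configuration.
    from lakehouse_engine.core.definitions import MergeOptions  # assumed import path

    merge_opts = MergeOptions(
        merge_predicate="current.order_id = new.order_id",
        update_column_set={"data": "new.data", "count": "current.count + 1"},
        insert_column_set={"date": "new.date", "count": "1"},
    )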
@@ -5764,70 +5784,70 @@
Inherited Members
-
561@dataclass
-562classOutputSpec(object):
-563"""Specification of an algorithm output.
-564
-565 This is very aligned with the way the execution environment connects to the output
-566 systems (e.g., spark outputs).
-567
-568 - spec_id: id of the output specification.
-569 - input_id: id of the corresponding input specification.
-570 - write_type: type of write operation.
-571 - data_format: format of the output. Defaults to DELTA.
-572 - db_table: table name in the form of `<db>.<table>`.
-573 - location: uri that identifies where to write data in the specified format.
-574 - partitions: list of partition input_col names.
-575 - merge_opts: options to apply to the merge operation.
-576 - streaming_micro_batch_transformers: transformers to invoke for each streaming
-577 micro batch, before writing (i.e., in Spark's foreachBatch structured
-578 streaming function). Note: the lakehouse engine manages this for you, so
-579 you don't have to manually specify streaming transformations here, so we don't
-580 advise you to manually specify transformations through this parameter. Supply
-581 them as regular transformers in the transform_specs sections of an ACON.
-582 - streaming_once: if the streaming query is to be executed just once, or not,
-583 generating just one micro batch.
-584 - streaming_processing_time: if streaming query is to be kept alive, this indicates
-585 the processing time of each micro batch.
-586 - streaming_available_now: if set to True, set a trigger that processes all
-587 available data in multiple batches then terminates the query.
-588 When using streaming, this is the default trigger that the lakehouse-engine will
-589 use, unless you configure a different one.
-590 - streaming_continuous: set a trigger that runs a continuous query with a given
-591 checkpoint interval.
-592 - streaming_await_termination: whether to wait (True) for the termination of the
-593 streaming query (e.g. timeout or exception) or not (False). Default: True.
-594 - streaming_await_termination_timeout: a timeout to set to the
-595 streaming_await_termination. Default: None.
-596 - with_batch_id: whether to include the streaming batch id in the final data,
-597 or not. It only takes effect in streaming mode.
-598 - options: dict with other relevant options according to the execution environment
-599 (e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for
-600 streaming, etc.
-601 - streaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers
-602 but for the DQ functions to be executed. Used internally by the lakehouse
-603 engine, so you don't have to supply DQ functions through this parameter. Use the
-604 dq_specs of the acon instead.
-605 """
-606
-607spec_id:str
-608input_id:str
-609write_type:str
-610data_format:str=OutputFormat.DELTAFILES.value
-611db_table:Optional[str]=None
-612location:Optional[str]=None
-613merge_opts:Optional[MergeOptions]=None
-614partitions:Optional[List[str]]=None
-615streaming_micro_batch_transformers:Optional[List[TransformerSpec]]=None
-616streaming_once:Optional[bool]=None
-617streaming_processing_time:Optional[str]=None
-618streaming_available_now:bool=True
-619streaming_continuous:Optional[str]=None
-620streaming_await_termination:bool=True
-621streaming_await_termination_timeout:Optional[int]=None
-622with_batch_id:bool=False
-623options:Optional[dict]=None
-624streaming_micro_batch_dq_processors:Optional[List[DQSpec]]=None
+
563@dataclass
+564classOutputSpec(object):
+565"""Specification of an algorithm output.
+566
+567 This is very aligned with the way the execution environment connects to the output
+568 systems (e.g., spark outputs).
+569
+570 - spec_id: id of the output specification.
+571 - input_id: id of the corresponding input specification.
+572 - write_type: type of write operation.
+573 - data_format: format of the output. Defaults to DELTA.
+574 - db_table: table name in the form of `<db>.<table>`.
+575 - location: uri that identifies where to write data in the specified format.
+576 - partitions: list of partition input_col names.
+577 - merge_opts: options to apply to the merge operation.
+578 - streaming_micro_batch_transformers: transformers to invoke for each streaming
+579 micro batch, before writing (i.e., in Spark's foreachBatch structured
+580 streaming function). Note: the lakehouse engine manages this for you, so
+581 you don't have to manually specify streaming transformations here, and we
+582 advise against passing transformations through this parameter. Supply
+583 them as regular transformers in the transform_specs sections of an ACON.
+584 - streaming_once: if the streaming query is to be executed just once, or not,
+585 generating just one micro batch.
+586 - streaming_processing_time: if streaming query is to be kept alive, this indicates
+587 the processing time of each micro batch.
+588 - streaming_available_now: if set to True, set a trigger that processes all
+589 available data in multiple batches then terminates the query.
+590 When using streaming, this is the default trigger that the lakehouse-engine will
+591 use, unless you configure a different one.
+592 - streaming_continuous: set a trigger that runs a continuous query with a given
+593 checkpoint interval.
+594 - streaming_await_termination: whether to wait (True) for the termination of the
+595 streaming query (e.g. timeout or exception) or not (False). Default: True.
+596 - streaming_await_termination_timeout: a timeout to set to the
+597 streaming_await_termination. Default: None.
+598 - with_batch_id: whether to include the streaming batch id in the final data,
+599 or not. It only takes effect in streaming mode.
+600 - options: dict with other relevant options according to the execution environment
+601 (e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for
+602 streaming, etc.
+603 - streaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers
+604 but for the DQ functions to be executed. Used internally by the lakehouse
+605 engine, so you don't have to supply DQ functions through this parameter. Use the
+606 dq_specs of the acon instead.
+607 """
+608
+609spec_id:str
+610input_id:str
+611write_type:str
+612data_format:str=OutputFormat.DELTAFILES.value
+613db_table:Optional[str]=None
+614location:Optional[str]=None
+615merge_opts:Optional[MergeOptions]=None
+616partitions:Optional[List[str]]=None
+617streaming_micro_batch_transformers:Optional[List[TransformerSpec]]=None
+618streaming_once:Optional[bool]=None
+619streaming_processing_time:Optional[str]=None
+620streaming_available_now:bool=True
+621streaming_continuous:Optional[str]=None
+622streaming_await_termination:bool=True
+623streaming_await_termination_timeout:Optional[int]=None
+624with_batch_id:bool=False
+625options:Optional[dict]=None
+626streaming_micro_batch_dq_processors:Optional[List[DQSpec]]=None
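For instance, appending the transformed data to a Delta location could be specified roughly as in the sketch below. The "append" write_type value is an assumption, as is the import path; the field names are the ones documented above.

    # Hypothetical sketch of a batch append output to delta files.
    from lakehouse_engine.core.definitions import OutputSpec  # assumed import path

    orders_output = OutputSpec(
        spec_id="orders_silver",
        input_id="orders_transformed",
        write_type="append",                     # assumed write type value
        data_format="delta",
        location="s3://my-bucket/silver/orders/",
        partitions=["order_date"],
    )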
@@ -6116,21 +6136,21 @@
Inherited Members
-
627@dataclass
-628classTerminatorSpec(object):
-629"""Terminator Specification.
-630
-631 I.e., the specification that defines a terminator operation to be executed. Examples
-632 are compute statistics, vacuum, optimize, etc.
-633
-634 - function: terminator function to execute.
-635 - args: arguments of the terminator function.
-636 - input_id: id of the corresponding output specification (Optional).
-637 """
-638
-639function:str
-640args:Optional[dict]=None
-641input_id:Optional[str]=None
+
629@dataclass
+630classTerminatorSpec(object):
+631"""Terminator Specification.
+632
+633 I.e., the specification that defines a terminator operation to be executed. Examples
+634 are compute statistics, vacuum, optimize, etc.
+635
+636 - function: terminator function to execute.
+637 - args: arguments of the terminator function.
+638 - input_id: id of the corresponding output specification (Optional).
+639 """
+640
+641function:str
+642args:Optional[dict]=None
+643input_id:Optional[str]=None
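A sketch of a terminator bound to a given output. The function name is a placeholder standing in for one of the terminator operations mentioned above (compute statistics, vacuum, optimize, ...); the import path is assumed.

    # Hypothetical sketch of a terminator attached to an output specification.
    from lakehouse_engine.core.definitions import TerminatorSpec  # assumed import path

    optimize_step = TerminatorSpec(
        function="optimize_dataset",             # placeholder terminator function name
        args={"db_table": "silver.orders"},
        input_id="orders_silver",
    )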
@@ -6207,52 +6227,52 @@
Inherited Members
-
644@dataclass
-645classReconciliatorSpec(object):
-646"""Reconciliator Specification.
-647
-648 - metrics: list of metrics in the form of:
-649 [{
-650 metric: name of the column present in both truth and current datasets,
-651 aggregation: sum, avg, max, min, ...,
-652 type: percentage or absolute,
-653 yellow: value,
-654 red: value
-655 }].
-656 - recon_type: reconciliation type (percentage or absolute). Percentage calculates
-657 the difference between truth and current results as a percentage ((x - y)/x), and
-658 absolute calculates the raw difference (x - y).
-659 - truth_input_spec: input specification of the truth data.
-660 - current_input_spec: input specification of the current results data.
-661 - truth_preprocess_query: additional query on top of the truth input data to
-662 preprocess the truth data before it gets fueled into the reconciliation process.
-663 Important note: you need to assume that the data out of
-664 the truth_input_spec is referenceable by a table called 'truth'.
-665 - truth_preprocess_query_args: optional dict having the functions/transformations to
-666 apply on top of the truth_preprocess_query and respective arguments. Note: cache
-667 is being applied on the Dataframe, by default. For turning the default behavior
-668 off, pass `"truth_preprocess_query_args": []`.
-669 - current_preprocess_query: additional query on top of the current results input
-670 data to preprocess the current results data before it gets fueled into the
-671 reconciliation process. Important note: you need to assume that the data out of
-672 the current_input_spec is referenceable by a table called 'current'.
-673 - current_preprocess_query_args: optional dict having the
-674 functions/transformations to apply on top of the current_preprocess_query
-675 and respective arguments. Note: cache is being applied on the Dataframe,
-676 by default. For turning the default behavior off, pass
-677 `"current_preprocess_query_args": []`.
-678 - ignore_empty_df: optional boolean to skip the recon process if both source and
-679 target dataframes are empty; in that case recon exits with success (passed).
-680 """
-681
-682metrics:List[dict]
-683truth_input_spec:InputSpec
-684current_input_spec:InputSpec
-685truth_preprocess_query:Optional[str]=None
-686truth_preprocess_query_args:Optional[List[dict]]=None
-687current_preprocess_query:Optional[str]=None
-688current_preprocess_query_args:Optional[List[dict]]=None
-689ignore_empty_df:Optional[bool]=False
+
646@dataclass
+647classReconciliatorSpec(object):
+648"""Reconciliator Specification.
+649
+650 - metrics: list of metrics in the form of:
+651 [{
+652 metric: name of the column present in both truth and current datasets,
+653 aggregation: sum, avg, max, min, ...,
+654 type: percentage or absolute,
+655 yellow: value,
+656 red: value
+657 }].
+658 - recon_type: reconciliation type (percentage or absolute). Percentage calculates
+659 the difference between truth and current results as a percentage ((x - y)/x), and
+660 absolute calculates the raw difference (x - y).
+661 - truth_input_spec: input specification of the truth data.
+662 - current_input_spec: input specification of the current results data.
+663 - truth_preprocess_query: additional query on top of the truth input data to
+664 preprocess the truth data before it gets fueled into the reconciliation process.
+665 Important note: you need to assume that the data out of
+666 the truth_input_spec is referenceable by a table called 'truth'.
+667 - truth_preprocess_query_args: optional dict having the functions/transformations to
+668 apply on top of the truth_preprocess_query and respective arguments. Note: cache
+669 is being applied on the Dataframe, by default. For turning the default behavior
+670 off, pass `"truth_preprocess_query_args": []`.
+671 - current_preprocess_query: additional query on top of the current results input
+672 data to preprocess the current results data before it gets fueled into the
+673 reconciliation process. Important note: you need to assume that the data out of
+674 the current_input_spec is referenceable by a table called 'current'.
+675 - current_preprocess_query_args: optional dict having the
+676 functions/transformations to apply on top of the current_preprocess_query
+677 and respective arguments. Note: cache is being applied on the Dataframe,
+678 by default. For turning the default behavior off, pass
+679 `"current_preprocess_query_args": []`.
+680 - ignore_empty_df: optional boolean to skip the recon process if both source and
+681 target dataframes are empty; in that case recon exits with success (passed).
+682 """
+683
+684metrics:List[dict]
+685truth_input_spec:InputSpec
+686current_input_spec:InputSpec
+687truth_preprocess_query:Optional[str]=None
+688truth_preprocess_query_args:Optional[List[dict]]=None
+689current_preprocess_query:Optional[str]=None
+690current_preprocess_query_args:Optional[List[dict]]=None
+691ignore_empty_df:Optional[bool]=False
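A sketch of a reconciliation between a truth dataset and the current results, using the metric dict shape documented above (metric/aggregation/type/yellow/red). Thresholds, table names and the import path are illustrative assumptions.

    # Hypothetical sketch of a reconciliator comparing truth vs. current totals.
    from lakehouse_engine.core.definitions import InputSpec, ReconciliatorSpec  # assumed path

    recon = ReconciliatorSpec(
        metrics=[{
            "metric": "amount",
            "aggregation": "sum",
            "type": "percentage",
            "yellow": 0.01,
            "red": 0.05,
        }],
        truth_input_spec=InputSpec(spec_id="truth", read_type="batch",
                                   db_table="gold.orders_truth"),
        current_input_spec=InputSpec(spec_id="current", read_type="batch",
                                     db_table="silver.orders"),
        truth_preprocess_query="SELECT order_id, amount FROM truth",
    )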
@@ -6413,21 +6433,21 @@
Inherited Members
-
692@dataclass
-693classDQValidatorSpec(object):
-694"""Data Quality Validator Specification.
-695
-696 - input_spec: input specification of the data to be checked/validated.
-697 - dq_spec: data quality specification.
-698 - restore_prev_version: specify if, having
-699 delta table/files as input, they should be restored to the
-700 previous version if the data quality process fails. Note: this
-701 is only considered if fail_on_error is kept as True.
-702 """
-703
-704input_spec:InputSpec
-705dq_spec:DQSpec
-706restore_prev_version:Optional[bool]=False
+
694@dataclass
+695classDQValidatorSpec(object):
+696"""Data Quality Validator Specification.
+697
+698 - input_spec: input specification of the data to be checked/validated.
+699 - dq_spec: data quality specification.
+700 - restore_prev_version: specify if, having
+701 delta table/files as input, they should be restored to the
+702 previous version if the data quality process fails. Note: this
+703 is only considered if fail_on_error is kept as True.
+704 """
+705
+706input_spec:InputSpec
+707dq_spec:DQSpec
+708restore_prev_version:Optional[bool]=False
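A sketch of a standalone DQ validation over an existing delta table, combining the specs documented above. All names are illustrative and the import path is assumed.

    # Hypothetical sketch of a DQ validator over an existing table.
    from lakehouse_engine.core.definitions import (  # assumed import path
        DQFunctionSpec, DQSpec, DQType, DQValidatorSpec, InputSpec,
    )

    validator = DQValidatorSpec(
        input_spec=InputSpec(spec_id="orders_silver", read_type="batch",
                             db_table="silver.orders"),
        dq_spec=DQSpec(
            spec_id="orders_dq", input_id="orders_silver",
            dq_type=DQType.VALIDATOR.value,
            dq_functions=[DQFunctionSpec(
                "expect_column_values_to_not_be_null", {"column": "order_id"})],
        ),
        restore_prev_version=True,               # only considered while fail_on_error stays True
    )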
@@ -6502,17 +6522,17 @@
Inherited Members
-
709classSQLDefinitions(Enum):
-710"""SQL definitions statements."""
-711
-712compute_table_stats="ANALYZE TABLE {} COMPUTE STATISTICS"
-713drop_table_stmt="DROP TABLE IF EXISTS"
-714drop_view_stmt="DROP VIEW IF EXISTS"
-715truncate_stmt="TRUNCATE TABLE"
-716describe_stmt="DESCRIBE TABLE"
-717optimize_stmt="OPTIMIZE"
-718show_tbl_props_stmt="SHOW TBLPROPERTIES"
-719delete_where_stmt="DELETE FROM {} WHERE {}"
+
711classSQLDefinitions(Enum):
+712"""SQL definitions statements."""
+713
+714compute_table_stats="ANALYZE TABLE {} COMPUTE STATISTICS"
+715drop_table_stmt="DROP TABLE IF EXISTS"
+716drop_view_stmt="DROP VIEW IF EXISTS"
+717truncate_stmt="TRUNCATE TABLE"
+718describe_stmt="DESCRIBE TABLE"
+719optimize_stmt="OPTIMIZE"
+720show_tbl_props_stmt="SHOW TBLPROPERTIES"
+721delete_where_stmt="DELETE FROM {} WHERE {}"
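Since these entries are plain statement templates, they are typically filled in with str.format; a small example, assuming the enum is imported from this module:

    # Filling in one of the statement templates shown above.
    from lakehouse_engine.core.definitions import SQLDefinitions  # assumed import path

    stmt = SQLDefinitions.delete_where_stmt.value.format(
        "silver.orders", "order_date < '2020-01-01'"
    )
    # stmt == "DELETE FROM silver.orders WHERE order_date < '2020-01-01'"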
732@dataclass
-733classSensorSpec(object):
-734"""Sensor Specification.
-735
-736 - sensor_id: sensor id.
-737 - assets: a list of assets that are considered as available to
-738 consume downstream after this sensor has status
-739 PROCESSED_NEW_DATA.
-740 - control_db_table_name: db.table to store sensor metadata.
-741 - input_spec: input specification of the source to be checked for new data.
-742 - preprocess_query: SQL query to transform/filter the result from the
-743 upstream. Consider that we should refer to 'new_data' whenever
-744 we are referring to the input of the sensor. E.g.:
-745 "SELECT dummy_col FROM new_data WHERE ..."
-746 - checkpoint_location: optional location to store checkpoints to resume
-747 from. These checkpoints use the same strategy as Spark checkpoints.
-748 For Spark readers that do not support checkpoints, use the
-749 preprocess_query parameter to form a SQL query to filter the result
-750 from the upstream accordingly.
-751 - fail_on_empty_result: if the sensor should throw an error if there is no new
-752 data in the upstream. Default: True.
-753 """
-754
-755sensor_id:str
-756assets:List[str]
-757control_db_table_name:str
-758input_spec:InputSpec
-759preprocess_query:Optional[str]
-760checkpoint_location:Optional[str]
-761fail_on_empty_result:bool=True
-762
-763@classmethod
-764defcreate_from_acon(cls,acon:dict):# type: ignore
-765"""Create SensorSpec from acon.
-766
-767 Args:
-768 acon: sensor ACON.
-769 """
-770checkpoint_location=acon.get("base_checkpoint_location")
-771ifcheckpoint_location:
-772checkpoint_location=(
-773f"{checkpoint_location.rstrip('/')}/lakehouse_engine/"
-774f"sensors/{acon['sensor_id']}"
-775)
-776
-777returncls(
-778sensor_id=acon["sensor_id"],
-779assets=acon["assets"],
-780control_db_table_name=acon["control_db_table_name"],
-781input_spec=InputSpec(**acon["input_spec"]),
-782preprocess_query=acon.get("preprocess_query"),
-783checkpoint_location=checkpoint_location,
-784fail_on_empty_result=acon.get("fail_on_empty_result",True),
-785)
+
734@dataclass
+735classSensorSpec(object):
+736"""Sensor Specification.
+737
+738 - sensor_id: sensor id.
+739 - assets: a list of assets that are considered as available to
+740 consume downstream after this sensor has status
+741 PROCESSED_NEW_DATA.
+742 - control_db_table_name: db.table to store sensor metadata.
+743 - input_spec: input specification of the source to be checked for new data.
+744 - preprocess_query: SQL query to transform/filter the result from the
+745 upstream. Consider that we should refer to 'new_data' whenever
+746 we are referring to the input of the sensor. E.g.:
+747 "SELECT dummy_col FROM new_data WHERE ..."
+748 - checkpoint_location: optional location to store checkpoints to resume
+749 from. These checkpoints use the same strategy as Spark checkpoints.
+750 For Spark readers that do not support checkpoints, use the
+751 preprocess_query parameter to form a SQL query to filter the result
+752 from the upstream accordingly.
+753 - fail_on_empty_result: if the sensor should throw an error if there is no new
+754 data in the upstream. Default: True.
+755 """
+756
+757sensor_id:str
+758assets:List[str]
+759control_db_table_name:str
+760input_spec:InputSpec
+761preprocess_query:Optional[str]
+762checkpoint_location:Optional[str]
+763fail_on_empty_result:bool=True
+764
+765@classmethod
+766defcreate_from_acon(cls,acon:dict):# type: ignore
+767"""Create SensorSpec from acon.
+768
+769 Args:
+770 acon: sensor ACON.
+771 """
+772checkpoint_location=acon.get("base_checkpoint_location")
+773ifcheckpoint_location:
+774checkpoint_location=(
+775f"{checkpoint_location.rstrip('/')}/lakehouse_engine/"
+776f"sensors/{acon['sensor_id']}"
+777)
+778
+779returncls(
+780sensor_id=acon["sensor_id"],
+781assets=acon["assets"],
+782control_db_table_name=acon["control_db_table_name"],
+783input_spec=InputSpec(**acon["input_spec"]),
+784preprocess_query=acon.get("preprocess_query"),
+785checkpoint_location=checkpoint_location,
+786fail_on_empty_result=acon.get("fail_on_empty_result",True),
+787)
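Based on the create_from_acon code above, a sensor ACON like the sketch below yields a SensorSpec whose checkpoint_location is derived from base_checkpoint_location. The values (and the "streaming" read_type) are illustrative; only the key names mirror the code shown above.

    # Hypothetical sensor ACON; keys mirror the ones read in create_from_acon.
    from lakehouse_engine.core.definitions import SensorSpec  # assumed import path

    acon = {
        "sensor_id": "sensor_orders",
        "assets": ["orders"],
        "control_db_table_name": "control_db.sensors",
        "input_spec": {"spec_id": "upstream", "read_type": "streaming",
                       "data_format": "delta", "db_table": "bronze.orders"},
        "preprocess_query": "SELECT dummy_col FROM new_data WHERE dummy_col IS NOT NULL",
        "base_checkpoint_location": "s3://my-bucket/checkpoints",
    }
    sensor_spec = SensorSpec.create_from_acon(acon)
    # sensor_spec.checkpoint_location ==
    #   "s3://my-bucket/checkpoints/lakehouse_engine/sensors/sensor_orders"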
788classSensorStatus(Enum):
-789"""Status for a sensor."""
-790
-791ACQUIRED_NEW_DATA="ACQUIRED_NEW_DATA"
-792PROCESSED_NEW_DATA="PROCESSED_NEW_DATA"
+
790classSensorStatus(Enum):
+791"""Status for a sensor."""
+792
+793ACQUIRED_NEW_DATA="ACQUIRED_NEW_DATA"
+794PROCESSED_NEW_DATA="PROCESSED_NEW_DATA"
@@ -7065,12 +7085,12 @@
Inherited Members
-
822classSAPLogchain(Enum):
-823"""Defaults used on consuming data from SAP Logchain."""
-824
-825DBTABLE="SAPPHA.RSPCLOGCHAIN"
-826GREEN_STATUS="G"
-827ENGINE_TABLE="sensor_new_data"
+
824classSAPLogchain(Enum):
+825"""Defaults used on consuming data from SAP Logchain."""
+826
+827DBTABLE="SAPPHA.RSPCLOGCHAIN"
+828GREEN_STATUS="G"
+829ENGINE_TABLE="sensor_new_data"
@@ -7136,33 +7156,33 @@
Inherited Members
-
830classRestoreType(Enum):
-831"""Archive types."""
-832
-833BULK="Bulk"
-834STANDARD="Standard"
-835EXPEDITED="Expedited"
-836
-837@classmethod
-838defvalues(cls):# type: ignore
-839"""Generates a list containing all enum values.
-840
-841 Return:
-842 A list with all enum values.
-843 """
-844return(c.valueforcincls)
-845
-846@classmethod
-847defexists(cls,restore_type:str)->bool:
-848"""Checks if the restore type exists in the enum values.
-849
-850 Args:
-851 restore_type: restore type to check if exists.
-852
-853 Return:
-854 If the restore type exists in our enum.
-855 """
-856returnrestore_typeincls.values()
+
832classRestoreType(Enum):
+833"""Archive types."""
+834
+835BULK="Bulk"
+836STANDARD="Standard"
+837EXPEDITED="Expedited"
+838
+839@classmethod
+840defvalues(cls):# type: ignore
+841"""Generates a list containing all enum values.
+842
+843 Return:
+844 A list with all enum values.
+845 """
+846return(c.valueforcincls)
+847
+848@classmethod
+849defexists(cls,restore_type:str)->bool:
+850"""Checks if the restore type exists in the enum values.
+851
+852 Args:
+853 restore_type: restore type to check if exists.
+854
+855 Return:
+856 If the restore type exists in our enum.
+857 """
+858returnrestore_typeincls.values()
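The helper methods shown above behave as follows (derived directly from the values()/exists() implementations, assuming the enum is imported from this module):

    from lakehouse_engine.core.definitions import RestoreType  # assumed import path

    RestoreType.exists("Bulk")       # True  ("Bulk" is one of the enum values)
    RestoreType.exists("bulk")       # False (the check is case-sensitive)
    list(RestoreType.values())       # ["Bulk", "Standard", "Expedited"]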
@@ -7218,14 +7238,14 @@
Inherited Members
-
837@classmethod
-838defvalues(cls):# type: ignore
-839"""Generates a list containing all enum values.
-840
-841 Return:
-842 A list with all enum values.
-843 """
-844return(c.valueforcincls)
+
839@classmethod
+840defvalues(cls):# type: ignore
+841"""Generates a list containing all enum values.
+842
+843 Return:
+844 A list with all enum values.
+845 """
+846return(c.valueforcincls)
@@ -7252,17 +7272,17 @@
Return:
-
846@classmethod
-847defexists(cls,restore_type:str)->bool:
-848"""Checks if the restore type exists in the enum values.
-849
-850 Args:
-851 restore_type: restore type to check if exists.
-852
-853 Return:
-854 If the restore type exists in our enum.
-855 """
-856returnrestore_typeincls.values()
+
848@classmethod
+849defexists(cls,restore_type:str)->bool:
+850"""Checks if the restore type exists in the enum values.
+851
+852 Args:
+853 restore_type: restore type to check if exists.
+854
+855 Return:
+856 If the restore type exists in our enum.
+857 """
+858returnrestore_typeincls.values()
874classSQLParser(Enum):
-875"""Defaults to use for parsing."""
-876
-877DOUBLE_QUOTES='"'
-878SINGLE_QUOTES="'"
-879BACKSLASH="\\"
-880SINGLE_TRACE="-"
-881DOUBLE_TRACES="--"
-882SLASH="/"
-883OPENING_MULTIPLE_LINE_COMMENT="/*"
-884CLOSING_MULTIPLE_LINE_COMMENT="*/"
-885PARAGRAPH="\n"
-886STAR="*"
-887
-888MULTIPLE_LINE_COMMENT=[
-889OPENING_MULTIPLE_LINE_COMMENT,
-890CLOSING_MULTIPLE_LINE_COMMENT,
-891]
+
876classSQLParser(Enum):
+877"""Defaults to use for parsing."""
+878
+879DOUBLE_QUOTES='"'
+880SINGLE_QUOTES="'"
+881BACKSLASH="\\"
+882SINGLE_TRACE="-"
+883DOUBLE_TRACES="--"
+884SLASH="/"
+885OPENING_MULTIPLE_LINE_COMMENT="/*"
+886CLOSING_MULTIPLE_LINE_COMMENT="*/"
+887PARAGRAPH="\n"
+888STAR="*"
+889
+890MULTIPLE_LINE_COMMENT=[
+891OPENING_MULTIPLE_LINE_COMMENT,
+892CLOSING_MULTIPLE_LINE_COMMENT,
+893]
@@ -7567,13 +7587,13 @@
Inherited Members
-
894classGABDefaults(Enum):
-895"""Defaults used on the GAB process."""
-896
-897DATE_FORMAT="%Y-%m-%d"
-898DIMENSIONS_DEFAULT_COLUMNS=["from_date","to_date"]
-899DEFAULT_DIMENSION_CALENDAR_TABLE="dim_calendar"
-900DEFAULT_LOOKUP_QUERY_BUILDER_TABLE="lkp_query_builder"
+
896classGABDefaults(Enum):
+897"""Defaults used on the GAB process."""
+898
+899DATE_FORMAT="%Y-%m-%d"
+900DIMENSIONS_DEFAULT_COLUMNS=["from_date","to_date"]
+901DEFAULT_DIMENSION_CALENDAR_TABLE="dim_calendar"
+902DEFAULT_LOOKUP_QUERY_BUILDER_TABLE="lkp_query_builder"
@@ -7651,32 +7671,32 @@
Inherited Members
-
903classGABStartOfWeek(Enum):
-904"""Representation of start of week values on GAB."""
-905
-906SUNDAY="S"
-907MONDAY="M"
-908
-909@classmethod
-910defget_start_of_week(cls)->dict:
-911"""Get the start of week enum as a dict.
-912
-913 Returns:
-914 dict containing all enum entries as `{name:value}`.
-915 """
-916return{
-917start_of_week.name:start_of_week.value
-918forstart_of_weekinlist(GABStartOfWeek)
-919}
-920
-921@classmethod
-922defget_values(cls)->set[str]:
-923"""Get the start of week enum values as set.
-924
-925 Returns:
-926 set containing all possible values `{value}`.
-927 """
-928return{start_of_week.valueforstart_of_weekinlist(GABStartOfWeek)}
+
905classGABStartOfWeek(Enum):
+906"""Representation of start of week values on GAB."""
+907
+908SUNDAY="S"
+909MONDAY="M"
+910
+911@classmethod
+912defget_start_of_week(cls)->dict:
+913"""Get the start of week enum as a dict.
+914
+915 Returns:
+916 dict containing all enum entries as `{name:value}`.
+917 """
+918return{
+919start_of_week.name:start_of_week.value
+920forstart_of_weekinlist(GABStartOfWeek)
+921}
+922
+923@classmethod
+924defget_values(cls)->set[str]:
+925"""Get the start of week enum values as set.
+926
+927 Returns:
+928 set containing all possible values `{value}`.
+929 """
+930return{start_of_week.valueforstart_of_weekinlist(GABStartOfWeek)}
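From the two classmethods above (derived directly from the implementations shown, assuming the enum is imported from this module):

    from lakehouse_engine.core.definitions import GABStartOfWeek  # assumed import path

    GABStartOfWeek.get_start_of_week()   # {"SUNDAY": "S", "MONDAY": "M"}
    GABStartOfWeek.get_values()          # {"S", "M"}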
@@ -7720,17 +7740,17 @@
Inherited Members
-
909@classmethod
-910defget_start_of_week(cls)->dict:
-911"""Get the start of week enum as a dict.
-912
-913 Returns:
-914 dict containing all enum entries as `{name:value}`.
-915 """
-916return{
-917start_of_week.name:start_of_week.value
-918forstart_of_weekinlist(GABStartOfWeek)
-919}
+
911@classmethod
+912defget_start_of_week(cls)->dict:
+913"""Get the start of week enum as a dict.
+914
+915 Returns:
+916 dict containing all enum entries as `{name:value}`.
+917 """
+918return{
+919start_of_week.name:start_of_week.value
+920forstart_of_weekinlist(GABStartOfWeek)
+921}
@@ -7757,14 +7777,14 @@
Returns:
-
921@classmethod
-922defget_values(cls)->set[str]:
-923"""Get the start of week enum values as set.
-924
-925 Returns:
-926 set containing all possible values `{value}`.
-927 """
-928return{start_of_week.valueforstart_of_weekinlist(GABStartOfWeek)}
+
923@classmethod
+924defget_values(cls)->set[str]:
+925"""Get the start of week enum values as set.
+926
+927 Returns:
+928 set containing all possible values `{value}`.
+929 """
+930return{start_of_week.valueforstart_of_weekinlist(GABStartOfWeek)}
1006classGABCadence(Enum):
-1007"""Representation of the supported cadences on GAB."""
-1008
-1009DAY=1
-1010WEEK=2
-1011MONTH=3
-1012QUARTER=4
-1013YEAR=5
-1014
-1015@classmethod
-1016defget_ordered_cadences(cls)->dict:
-1017"""Get the cadences ordered by the value.
-1018
-1019 Returns:
-1020 dict containing ordered cadences as `{name:value}`.
-1021 """
-1022cadences=list(GABCadence)
-1023return{
-1024cadence.name:cadence.value
-1025forcadenceinsorted(cadences,key=lambdagab_cadence:gab_cadence.value)
-1026}
-1027
-1028@classmethod
-1029defget_cadences(cls)->set[str]:
-1030"""Get the cadences values as set.
-1031
-1032 Returns:
-1033 set containing all possible cadence values as `{value}`.
-1034 """
-1035return{cadence.nameforcadenceinlist(GABCadence)}
-1036
-1037@classmethod
-1038deforder_cadences(cls,cadences_to_order:list[str])->list[str]:
-1039"""Order a list of cadences by value.
-1040
-1041 Returns:
-1042 ordered set containing the received cadences.
-1043 """
-1044returnsorted(
-1045cadences_to_order,
-1046key=lambdaitem:cls.get_ordered_cadences().get(item),# type: ignore
-1047)
+
1008classGABCadence(Enum):
+1009"""Representation of the supported cadences on GAB."""
+1010
+1011DAY=1
+1012WEEK=2
+1013MONTH=3
+1014QUARTER=4
+1015YEAR=5
+1016
+1017@classmethod
+1018defget_ordered_cadences(cls)->dict:
+1019"""Get the cadences ordered by the value.
+1020
+1021 Returns:
+1022 dict containing ordered cadences as `{name:value}`.
+1023 """
+1024cadences=list(GABCadence)
+1025return{
+1026cadence.name:cadence.value
+1027forcadenceinsorted(cadences,key=lambdagab_cadence:gab_cadence.value)
+1028}
+1029
+1030@classmethod
+1031defget_cadences(cls)->set[str]:
+1032"""Get the cadences values as set.
+1033
+1034 Returns:
+1035 set containing all possible cadence values as `{value}`.
+1036 """
+1037return{cadence.nameforcadenceinlist(GABCadence)}
+1038
+1039@classmethod
+1040deforder_cadences(cls,cadences_to_order:list[str])->list[str]:
+1041"""Order a list of cadences by value.
+1042
+1043 Returns:
+1044 ordered set containing the received cadences.
+1045 """
+1046returnsorted(
+1047cadences_to_order,
+1048key=lambdaitem:cls.get_ordered_cadences().get(item),# type: ignore
+1049)
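From the classmethods above (derived directly from the implementations shown, assuming the enum is imported from this module):

    from lakehouse_engine.core.definitions import GABCadence  # assumed import path

    GABCadence.get_ordered_cadences()
    # {"DAY": 1, "WEEK": 2, "MONTH": 3, "QUARTER": 4, "YEAR": 5}

    GABCadence.get_cadences()
    # {"DAY", "WEEK", "MONTH", "QUARTER", "YEAR"}  (cadence names, per the code above)

    GABCadence.order_cadences(["YEAR", "DAY", "MONTH"])
    # ["DAY", "MONTH", "YEAR"]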
@@ -8250,18 +8270,18 @@
Arguments:
-
1015@classmethod
-1016defget_ordered_cadences(cls)->dict:
-1017"""Get the cadences ordered by the value.
-1018
-1019 Returns:
-1020 dict containing ordered cadences as `{name:value}`.
-1021 """
-1022cadences=list(GABCadence)
-1023return{
-1024cadence.name:cadence.value
-1025forcadenceinsorted(cadences,key=lambdagab_cadence:gab_cadence.value)
-1026}
+
1017@classmethod
+1018defget_ordered_cadences(cls)->dict:
+1019"""Get the cadences ordered by the value.
+1020
+1021 Returns:
+1022 dict containing ordered cadences as `{name:value}`.
+1023 """
+1024cadences=list(GABCadence)
+1025return{
+1026cadence.name:cadence.value
+1027forcadenceinsorted(cadences,key=lambdagab_cadence:gab_cadence.value)
+1028}
@@ -8288,14 +8308,14 @@
Returns:
-
1028@classmethod
-1029defget_cadences(cls)->set[str]:
-1030"""Get the cadences values as set.
-1031
-1032 Returns:
-1033 set containing all possible cadence values as `{value}`.
-1034 """
-1035return{cadence.nameforcadenceinlist(GABCadence)}
+
1030@classmethod
+1031defget_cadences(cls)->set[str]:
+1032"""Get the cadences values as set.
+1033
+1034 Returns:
+1035 set containing all possible cadence values as `{value}`.
+1036 """
+1037return{cadence.nameforcadenceinlist(GABCadence)}
@@ -8322,17 +8342,17 @@
Returns:
-
1037@classmethod
-1038deforder_cadences(cls,cadences_to_order:list[str])->list[str]:
-1039"""Order a list of cadences by value.
-1040
-1041 Returns:
-1042 ordered set containing the received cadences.
-1043 """
-1044returnsorted(
-1045cadences_to_order,
-1046key=lambdaitem:cls.get_ordered_cadences().get(item),# type: ignore
-1047)
+
1039@classmethod
+1040deforder_cadences(cls,cadences_to_order:list[str])->list[str]:
+1041"""Order a list of cadences by value.
+1042
+1043 Returns:
+1044 ordered set containing the received cadences.
+1045 """
+1046returnsorted(
+1047cadences_to_order,
+1048key=lambdaitem:cls.get_ordered_cadences().get(item),# type: ignore
+1049)
@@ -8369,12 +8389,12 @@
Inherited Members
-
1050classGABKeys:
-1051"""Constants used to update pre-configured gab dict key."""
-1052
-1053JOIN_SELECT="join_select"
-1054PROJECT_START="project_start"
-1055PROJECT_END="project_end"
+
1052classGABKeys:
+1053"""Constants used to update pre-configured gab dict key."""
+1054
+1055JOIN_SELECT="join_select"
+1056PROJECT_START="project_start"
+1057PROJECT_END="project_end"
@@ -8430,13 +8450,13 @@
Inherited Members
-
1058classGABReplaceableKeys:
-1059"""Constants used to replace pre-configured gab dict values."""
-1060
-1061CADENCE="${cad}"
-1062DATE_COLUMN="${date_column}"
-1063CONFIG_WEEK_START="${config_week_start}"
-1064RECONCILIATION_CADENCE="${rec_cadence}"
+
1060classGABReplaceableKeys:
+1061"""Constants used to replace pre-configured gab dict values."""
+1062
+1063CADENCE="${cad}"
+1064DATE_COLUMN="${date_column}"
+1065CONFIG_WEEK_START="${config_week_start}"
+1066RECONCILIATION_CADENCE="${rec_cadence}"
@@ -8504,348 +8524,348 @@
Inherited Members
-
1067classGABCombinedConfiguration(Enum):
-1068"""GAB combined configuration.
-1069
-1070 Based on the use case configuration return the values to override in the SQL file.
-1071 This enum aims to exhaustively map each combination of `cadence`, `reconciliation`,
-1072 `week_start` and `snap_flag` to the corresponding values `join_select`,
-1073 `project_start` and `project_end`, replacing these values in the stages SQL file.
-1074
-1075 Return corresponding configuration (join_select, project_start, project_end) for
-1076 each combination (cadence x recon x week_start x snap_flag).
-1077 """
-1078
-1079_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE=(
-1080"date(date_trunc('${cad}',${date_column}))"
-1081)
-1082_DEFAULT_PROJECT_START="df_cal.cadence_start_date"
-1083_DEFAULT_PROJECT_END="df_cal.cadence_end_date"
-1084
-1085COMBINED_CONFIGURATION={
-1086# Combination of:
-1087# - cadence: `DAY`
-1088# - reconciliation_window: `DAY`, `WEEK`, `MONTH`, `QUARTER`, `YEAR`
-1089# - week_start: `S`, `M`
-1090# - snapshot_flag: `Y`, `N`
-10911:{
-1092"cadence":GABCadence.DAY.name,
-1093"recon":GABCadence.get_cadences(),
-1094"week_start":GABStartOfWeek.get_values(),
-1095"snap_flag":{"Y","N"},
-1096"join_select":"",
-1097"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
-1098"project_end":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
-1099},
-1100# Combination of:
-1101# - cadence: `WEEK`
-1102# - reconciliation_window: `DAY`
-1103# - week_start: `S`, `M`
-1104# - snapshot_flag: `Y`
-11052:{
-1106"cadence":GABCadence.WEEK.name,
-1107"recon":GABCadence.DAY.name,
-1108"week_start":GABStartOfWeek.get_values(),
-1109"snap_flag":"Y",
-1110"join_select":"""
-1111 select distinct case
-1112 when '${config_week_start}' = 'Monday' then weekstart_mon
-1113 when '${config_week_start}' = 'Sunday' then weekstart_sun
-1114 end as cadence_start_date,
-1115 calendar_date as cadence_end_date
-1116 """,
-1117"project_start":_DEFAULT_PROJECT_START,
-1118"project_end":_DEFAULT_PROJECT_END,
-1119},
-1120# Combination of:
-1121# - cadence: `WEEK`
-1122# - reconciliation_window: `DAY, `MONTH`, `QUARTER`, `YEAR`
-1123# - week_start: `M`
-1124# - snapshot_flag: `Y`, `N`
-11253:{
-1126"cadence":GABCadence.WEEK.name,
-1127"recon":{
-1128GABCadence.DAY.name,
-1129GABCadence.MONTH.name,
-1130GABCadence.QUARTER.name,
-1131GABCadence.YEAR.name,
-1132},
-1133"week_start":"M",
-1134"snap_flag":{"Y","N"},
-1135"join_select":"""
-1136 select distinct case
-1137 when '${config_week_start}' = 'Monday' then weekstart_mon
-1138 when '${config_week_start}' = 'Sunday' then weekstart_sun
-1139 end as cadence_start_date,
-1140 case
-1141 when '${config_week_start}' = 'Monday' then weekend_mon
-1142 when '${config_week_start}' = 'Sunday' then weekend_sun
-1143 end as cadence_end_date""",
-1144"project_start":_DEFAULT_PROJECT_START,
-1145"project_end":_DEFAULT_PROJECT_END,
-1146},
-11474:{
-1148"cadence":GABCadence.MONTH.name,
-1149"recon":GABCadence.DAY.name,
-1150"week_start":GABStartOfWeek.get_values(),
-1151"snap_flag":"Y",
-1152"join_select":"""
-1153 select distinct month_start as cadence_start_date,
-1154 calendar_date as cadence_end_date
-1155 """,
-1156"project_start":_DEFAULT_PROJECT_START,
-1157"project_end":_DEFAULT_PROJECT_END,
-1158},
-11595:{
-1160"cadence":GABCadence.MONTH.name,
-1161"recon":GABCadence.WEEK.name,
-1162"week_start":GABStartOfWeek.MONDAY.value,
-1163"snap_flag":"Y",
-1164"join_select":"""
-1165 select distinct month_start as cadence_start_date,
-1166 case
-1167 when date(
-1168 date_trunc('MONTH',add_months(calendar_date, 1))
-1169 )-1 < weekend_mon
-1170 then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
-1171 else weekend_mon
-1172 end as cadence_end_date""",
-1173"project_start":_DEFAULT_PROJECT_START,
-1174"project_end":_DEFAULT_PROJECT_END,
-1175},
-11766:{
-1177"cadence":GABCadence.MONTH.name,
-1178"recon":GABCadence.WEEK.name,
-1179"week_start":GABStartOfWeek.SUNDAY.value,
-1180"snap_flag":"Y",
-1181"join_select":"""
-1182 select distinct month_start as cadence_start_date,
-1183 case
-1184 when date(
-1185 date_trunc('MONTH',add_months(calendar_date, 1))
-1186 )-1 < weekend_sun
-1187 then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
-1188 else weekend_sun
-1189 end as cadence_end_date""",
-1190"project_start":_DEFAULT_PROJECT_START,
-1191"project_end":_DEFAULT_PROJECT_END,
-1192},
-11937:{
-1194"cadence":GABCadence.MONTH.name,
-1195"recon":GABCadence.get_cadences(),
-1196"week_start":GABStartOfWeek.get_values(),
-1197"snap_flag":{"Y","N"},
-1198"join_select":"",
-1199"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
-1200"project_end":"date(date_trunc('MONTH',add_months(${date_column}, 1)))-1",
-1201},
-12028:{
-1203"cadence":GABCadence.QUARTER.name,
-1204"recon":GABCadence.DAY.name,
-1205"week_start":GABStartOfWeek.get_values(),
-1206"snap_flag":"Y",
-1207"join_select":"""
-1208 select distinct quarter_start as cadence_start_date,
-1209 calendar_date as cadence_end_date
-1210 """,
-1211"project_start":_DEFAULT_PROJECT_START,
-1212"project_end":_DEFAULT_PROJECT_END,
-1213},
-12149:{
-1215"cadence":GABCadence.QUARTER.name,
-1216"recon":GABCadence.WEEK.name,
-1217"week_start":GABStartOfWeek.MONDAY.value,
-1218"snap_flag":"Y",
-1219"join_select":"""
-1220 select distinct quarter_start as cadence_start_date,
-1221 case
-1222 when weekend_mon > date(
-1223 date_trunc('QUARTER',add_months(calendar_date, 3))
-1224 )-1
-1225 then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
-1226 else weekend_mon
-1227 end as cadence_end_date""",
-1228"project_start":_DEFAULT_PROJECT_START,
-1229"project_end":_DEFAULT_PROJECT_END,
-1230},
-123110:{
-1232"cadence":GABCadence.QUARTER.name,
-1233"recon":GABCadence.WEEK.name,
-1234"week_start":GABStartOfWeek.SUNDAY.value,
-1235"snap_flag":"Y",
-1236"join_select":"""
-1237 select distinct quarter_start as cadence_start_date,
-1238 case
-1239 when weekend_sun > date(
-1240 date_trunc('QUARTER',add_months(calendar_date, 3))
-1241 )-1
-1242 then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
-1243 else weekend_sun
-1244 end as cadence_end_date""",
-1245"project_start":_DEFAULT_PROJECT_START,
-1246"project_end":_DEFAULT_PROJECT_END,
-1247},
-124811:{
-1249"cadence":GABCadence.QUARTER.name,
-1250"recon":GABCadence.MONTH.name,
-1251"week_start":GABStartOfWeek.get_values(),
-1252"snap_flag":"Y",
-1253"join_select":"""
-1254 select distinct quarter_start as cadence_start_date,
-1255 month_end as cadence_end_date
-1256 """,
-1257"project_start":_DEFAULT_PROJECT_START,
-1258"project_end":_DEFAULT_PROJECT_END,
-1259},
-126012:{
-1261"cadence":GABCadence.QUARTER.name,
-1262"recon":GABCadence.YEAR.name,
-1263"week_start":GABStartOfWeek.get_values(),
-1264"snap_flag":"N",
-1265"join_select":"",
-1266"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
-1267"project_end":"""
-1268 date(
-1269 date_trunc(
-1270 '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)
-1271 )
-1272 )-1
-1273 """,
-1274},
-127513:{
-1276"cadence":GABCadence.QUARTER.name,
-1277"recon":GABCadence.get_cadences(),
-1278"week_start":GABStartOfWeek.get_values(),
-1279"snap_flag":"N",
-1280"join_select":"",
-1281"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
-1282"project_end":"""
-1283 date(
-1284 date_trunc(
-1285 '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)
-1286 )
-1287 )-1
-1288 """,
-1289},
-129014:{
-1291"cadence":GABCadence.YEAR.name,
-1292"recon":GABCadence.WEEK.name,
-1293"week_start":GABStartOfWeek.MONDAY.value,
-1294"snap_flag":"Y",
-1295"join_select":"""
-1296 select distinct year_start as cadence_start_date,
-1297 case
-1298 when weekend_mon > date(
-1299 date_trunc('YEAR',add_months(calendar_date, 12))
-1300 )-1
-1301 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
-1302 else weekend_mon
-1303 end as cadence_end_date""",
-1304"project_start":_DEFAULT_PROJECT_START,
-1305"project_end":_DEFAULT_PROJECT_END,
-1306},
-130715:{
-1308"cadence":GABCadence.YEAR.name,
-1309"recon":GABCadence.WEEK.name,
-1310"week_start":GABStartOfWeek.SUNDAY.value,
-1311"snap_flag":"Y",
-1312"join_select":"""
-1313 select distinct year_start as cadence_start_date,
-1314 case
-1315 when weekend_sun > date(
-1316 date_trunc('YEAR',add_months(calendar_date, 12))
-1317 )-1
-1318 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
-1319 else weekend_sun
-1320 end as cadence_end_date""",
-1321"project_start":_DEFAULT_PROJECT_START,
-1322"project_end":_DEFAULT_PROJECT_END,
-1323},
-132416:{
-1325"cadence":GABCadence.YEAR.name,
-1326"recon":GABCadence.get_cadences(),
-1327"week_start":GABStartOfWeek.get_values(),
-1328"snap_flag":"N",
-1329"inverse_flag":"Y",
-1330"join_select":"",
-1331"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
-1332"project_end":"""
-1333 date(
-1334 date_trunc(
-1335 '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)
-1336 )
-1337 )-1
-1338 """,
-1339},
-134017:{
-1341"cadence":GABCadence.YEAR.name,
-1342"recon":{
-1343GABCadence.DAY.name,
-1344GABCadence.MONTH.name,
-1345GABCadence.QUARTER.name,
-1346},
-1347"week_start":GABStartOfWeek.get_values(),
-1348"snap_flag":"Y",
-1349"join_select":"""
-1350 select distinct year_start as cadence_start_date,
-1351 case
-1352 when '${rec_cadence}' = 'DAY' then calendar_date
-1353 when '${rec_cadence}' = 'MONTH' then month_end
-1354 when '${rec_cadence}' = 'QUARTER' then quarter_end
-1355 end as cadence_end_date
-1356 """,
-1357"project_start":_DEFAULT_PROJECT_START,
-1358"project_end":_DEFAULT_PROJECT_END,
-1359},
-136018:{
-1361"cadence":GABCadence.get_cadences(),
-1362"recon":GABCadence.get_cadences(),
-1363"week_start":GABStartOfWeek.get_values(),
-1364"snap_flag":{"Y","N"},
-1365"join_select":"""
-1366 select distinct
-1367 case
-1368 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
-1369 then weekstart_mon
-1370 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
-1371 then weekstart_sun
-1372 else
-1373 date(date_trunc('${cad}',calendar_date))
-1374 end as cadence_start_date,
-1375 case
-1376 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
-1377 then weekend_mon
-1378 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
-1379 then weekend_sun
-1380 when '${cad}' = 'DAY'
-1381 then date(date_trunc('${cad}',calendar_date))
-1382 when '${cad}' = 'MONTH'
-1383 then date(
-1384 date_trunc(
-1385 'MONTH',
-1386 add_months(date(date_trunc('${cad}',calendar_date)), 1)
-1387 )
-1388 )-1
-1389 when '${cad}' = 'QUARTER'
-1390 then date(
-1391 date_trunc(
-1392 'QUARTER',
-1393 add_months(date(date_trunc('${cad}',calendar_date)) , 3)
-1394 )
-1395 )-1
-1396 when '${cad}' = 'YEAR'
-1397 then date(
-1398 date_trunc(
-1399 'YEAR',
-1400 add_months(date(date_trunc('${cad}',calendar_date)), 12)
-1401 )
-1402 )-1
-1403 end as cadence_end_date
-1404 """,
-1405"project_start":_DEFAULT_PROJECT_START,
-1406"project_end":_DEFAULT_PROJECT_END,
-1407},
-1408}
+
1069classGABCombinedConfiguration(Enum):
+1070"""GAB combined configuration.
+1071
+1072 Based on the use case configuration, this enum returns the values to override in
+1073 the SQL file. It exhaustively maps each combination of `cadence`, `reconciliation`,
+1074 `week_start` and `snap_flag` to the corresponding `join_select`, `project_start`
+1075 and `project_end` values, which replace the placeholders in the stages SQL file.
+1076
+1077 In short: it returns the configuration (join_select, project_start, project_end)
+1078 for each combination (cadence x recon x week_start x snap_flag).
+1079 """
+1080
+1081_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE=(
+1082"date(date_trunc('${cad}',${date_column}))"
+1083)
+1084_DEFAULT_PROJECT_START="df_cal.cadence_start_date"
+1085_DEFAULT_PROJECT_END="df_cal.cadence_end_date"
+1086
+1087COMBINED_CONFIGURATION={
+1088# Combination of:
+1089# - cadence: `DAY`
+1090# - reconciliation_window: `DAY`, `WEEK`, `MONTH`, `QUARTER`, `YEAR`
+1091# - week_start: `S`, `M`
+1092# - snapshot_flag: `Y`, `N`
+10931:{
+1094"cadence":GABCadence.DAY.name,
+1095"recon":GABCadence.get_cadences(),
+1096"week_start":GABStartOfWeek.get_values(),
+1097"snap_flag":{"Y","N"},
+1098"join_select":"",
+1099"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1100"project_end":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1101},
+1102# Combination of:
+1103# - cadence: `WEEK`
+1104# - reconciliation_window: `DAY`
+1105# - week_start: `S`, `M`
+1106# - snapshot_flag: `Y`
+11072:{
+1108"cadence":GABCadence.WEEK.name,
+1109"recon":GABCadence.DAY.name,
+1110"week_start":GABStartOfWeek.get_values(),
+1111"snap_flag":"Y",
+1112"join_select":"""
+1113 select distinct case
+1114 when '${config_week_start}' = 'Monday' then weekstart_mon
+1115 when '${config_week_start}' = 'Sunday' then weekstart_sun
+1116 end as cadence_start_date,
+1117 calendar_date as cadence_end_date
+1118 """,
+1119"project_start":_DEFAULT_PROJECT_START,
+1120"project_end":_DEFAULT_PROJECT_END,
+1121},
+1122# Combination of:
+1123# - cadence: `WEEK`
+1124# - reconciliation_window: `DAY`, `MONTH`, `QUARTER`, `YEAR`
+1125# - week_start: `M`
+1126# - snapshot_flag: `Y`, `N`
+11273:{
+1128"cadence":GABCadence.WEEK.name,
+1129"recon":{
+1130GABCadence.DAY.name,
+1131GABCadence.MONTH.name,
+1132GABCadence.QUARTER.name,
+1133GABCadence.YEAR.name,
+1134},
+1135"week_start":"M",
+1136"snap_flag":{"Y","N"},
+1137"join_select":"""
+1138 select distinct case
+1139 when '${config_week_start}' = 'Monday' then weekstart_mon
+1140 when '${config_week_start}' = 'Sunday' then weekstart_sun
+1141 end as cadence_start_date,
+1142 case
+1143 when '${config_week_start}' = 'Monday' then weekend_mon
+1144 when '${config_week_start}' = 'Sunday' then weekend_sun
+1145 end as cadence_end_date""",
+1146"project_start":_DEFAULT_PROJECT_START,
+1147"project_end":_DEFAULT_PROJECT_END,
+1148},
+11494:{
+1150"cadence":GABCadence.MONTH.name,
+1151"recon":GABCadence.DAY.name,
+1152"week_start":GABStartOfWeek.get_values(),
+1153"snap_flag":"Y",
+1154"join_select":"""
+1155 select distinct month_start as cadence_start_date,
+1156 calendar_date as cadence_end_date
+1157 """,
+1158"project_start":_DEFAULT_PROJECT_START,
+1159"project_end":_DEFAULT_PROJECT_END,
+1160},
+11615:{
+1162"cadence":GABCadence.MONTH.name,
+1163"recon":GABCadence.WEEK.name,
+1164"week_start":GABStartOfWeek.MONDAY.value,
+1165"snap_flag":"Y",
+1166"join_select":"""
+1167 select distinct month_start as cadence_start_date,
+1168 case
+1169 when date(
+1170 date_trunc('MONTH',add_months(calendar_date, 1))
+1171 )-1 < weekend_mon
+1172 then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
+1173 else weekend_mon
+1174 end as cadence_end_date""",
+1175"project_start":_DEFAULT_PROJECT_START,
+1176"project_end":_DEFAULT_PROJECT_END,
+1177},
+11786:{
+1179"cadence":GABCadence.MONTH.name,
+1180"recon":GABCadence.WEEK.name,
+1181"week_start":GABStartOfWeek.SUNDAY.value,
+1182"snap_flag":"Y",
+1183"join_select":"""
+1184 select distinct month_start as cadence_start_date,
+1185 case
+1186 when date(
+1187 date_trunc('MONTH',add_months(calendar_date, 1))
+1188 )-1 < weekend_sun
+1189 then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1
+1190 else weekend_sun
+1191 end as cadence_end_date""",
+1192"project_start":_DEFAULT_PROJECT_START,
+1193"project_end":_DEFAULT_PROJECT_END,
+1194},
+11957:{
+1196"cadence":GABCadence.MONTH.name,
+1197"recon":GABCadence.get_cadences(),
+1198"week_start":GABStartOfWeek.get_values(),
+1199"snap_flag":{"Y","N"},
+1200"join_select":"",
+1201"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1202"project_end":"date(date_trunc('MONTH',add_months(${date_column}, 1)))-1",
+1203},
+12048:{
+1205"cadence":GABCadence.QUARTER.name,
+1206"recon":GABCadence.DAY.name,
+1207"week_start":GABStartOfWeek.get_values(),
+1208"snap_flag":"Y",
+1209"join_select":"""
+1210 select distinct quarter_start as cadence_start_date,
+1211 calendar_date as cadence_end_date
+1212 """,
+1213"project_start":_DEFAULT_PROJECT_START,
+1214"project_end":_DEFAULT_PROJECT_END,
+1215},
+12169:{
+1217"cadence":GABCadence.QUARTER.name,
+1218"recon":GABCadence.WEEK.name,
+1219"week_start":GABStartOfWeek.MONDAY.value,
+1220"snap_flag":"Y",
+1221"join_select":"""
+1222 select distinct quarter_start as cadence_start_date,
+1223 case
+1224 when weekend_mon > date(
+1225 date_trunc('QUARTER',add_months(calendar_date, 3))
+1226 )-1
+1227 then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
+1228 else weekend_mon
+1229 end as cadence_end_date""",
+1230"project_start":_DEFAULT_PROJECT_START,
+1231"project_end":_DEFAULT_PROJECT_END,
+1232},
+123310:{
+1234"cadence":GABCadence.QUARTER.name,
+1235"recon":GABCadence.WEEK.name,
+1236"week_start":GABStartOfWeek.SUNDAY.value,
+1237"snap_flag":"Y",
+1238"join_select":"""
+1239 select distinct quarter_start as cadence_start_date,
+1240 case
+1241 when weekend_sun > date(
+1242 date_trunc('QUARTER',add_months(calendar_date, 3))
+1243 )-1
+1244 then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1
+1245 else weekend_sun
+1246 end as cadence_end_date""",
+1247"project_start":_DEFAULT_PROJECT_START,
+1248"project_end":_DEFAULT_PROJECT_END,
+1249},
+125011:{
+1251"cadence":GABCadence.QUARTER.name,
+1252"recon":GABCadence.MONTH.name,
+1253"week_start":GABStartOfWeek.get_values(),
+1254"snap_flag":"Y",
+1255"join_select":"""
+1256 select distinct quarter_start as cadence_start_date,
+1257 month_end as cadence_end_date
+1258 """,
+1259"project_start":_DEFAULT_PROJECT_START,
+1260"project_end":_DEFAULT_PROJECT_END,
+1261},
+126212:{
+1263"cadence":GABCadence.QUARTER.name,
+1264"recon":GABCadence.YEAR.name,
+1265"week_start":GABStartOfWeek.get_values(),
+1266"snap_flag":"N",
+1267"join_select":"",
+1268"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1269"project_end":"""
+1270 date(
+1271 date_trunc(
+1272 '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)
+1273 )
+1274 )-1
+1275 """,
+1276},
+127713:{
+1278"cadence":GABCadence.QUARTER.name,
+1279"recon":GABCadence.get_cadences(),
+1280"week_start":GABStartOfWeek.get_values(),
+1281"snap_flag":"N",
+1282"join_select":"",
+1283"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1284"project_end":"""
+1285 date(
+1286 date_trunc(
+1287 '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)
+1288 )
+1289 )-1
+1290 """,
+1291},
+129214:{
+1293"cadence":GABCadence.YEAR.name,
+1294"recon":GABCadence.WEEK.name,
+1295"week_start":GABStartOfWeek.MONDAY.value,
+1296"snap_flag":"Y",
+1297"join_select":"""
+1298 select distinct year_start as cadence_start_date,
+1299 case
+1300 when weekend_mon > date(
+1301 date_trunc('YEAR',add_months(calendar_date, 12))
+1302 )-1
+1303 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
+1304 else weekend_mon
+1305 end as cadence_end_date""",
+1306"project_start":_DEFAULT_PROJECT_START,
+1307"project_end":_DEFAULT_PROJECT_END,
+1308},
+130915:{
+1310"cadence":GABCadence.YEAR.name,
+1311"recon":GABCadence.WEEK.name,
+1312"week_start":GABStartOfWeek.SUNDAY.value,
+1313"snap_flag":"Y",
+1314"join_select":"""
+1315 select distinct year_start as cadence_start_date,
+1316 case
+1317 when weekend_sun > date(
+1318 date_trunc('YEAR',add_months(calendar_date, 12))
+1319 )-1
+1320 then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1
+1321 else weekend_sun
+1322 end as cadence_end_date""",
+1323"project_start":_DEFAULT_PROJECT_START,
+1324"project_end":_DEFAULT_PROJECT_END,
+1325},
+132616:{
+1327"cadence":GABCadence.YEAR.name,
+1328"recon":GABCadence.get_cadences(),
+1329"week_start":GABStartOfWeek.get_values(),
+1330"snap_flag":"N",
+1331"inverse_flag":"Y",
+1332"join_select":"",
+1333"project_start":_PROJECT_DATE_COLUMN_TRUNCATED_BY_CADENCE,
+1334"project_end":"""
+1335 date(
+1336 date_trunc(
+1337 '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)
+1338 )
+1339 )-1
+1340 """,
+1341},
+134217:{
+1343"cadence":GABCadence.YEAR.name,
+1344"recon":{
+1345GABCadence.DAY.name,
+1346GABCadence.MONTH.name,
+1347GABCadence.QUARTER.name,
+1348},
+1349"week_start":GABStartOfWeek.get_values(),
+1350"snap_flag":"Y",
+1351"join_select":"""
+1352 select distinct year_start as cadence_start_date,
+1353 case
+1354 when '${rec_cadence}' = 'DAY' then calendar_date
+1355 when '${rec_cadence}' = 'MONTH' then month_end
+1356 when '${rec_cadence}' = 'QUARTER' then quarter_end
+1357 end as cadence_end_date
+1358 """,
+1359"project_start":_DEFAULT_PROJECT_START,
+1360"project_end":_DEFAULT_PROJECT_END,
+1361},
+136218:{
+1363"cadence":GABCadence.get_cadences(),
+1364"recon":GABCadence.get_cadences(),
+1365"week_start":GABStartOfWeek.get_values(),
+1366"snap_flag":{"Y","N"},
+1367"join_select":"""
+1368 select distinct
+1369 case
+1370 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
+1371 then weekstart_mon
+1372 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
+1373 then weekstart_sun
+1374 else
+1375 date(date_trunc('${cad}',calendar_date))
+1376 end as cadence_start_date,
+1377 case
+1378 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'
+1379 then weekend_mon
+1380 when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'
+1381 then weekend_sun
+1382 when '${cad}' = 'DAY'
+1383 then date(date_trunc('${cad}',calendar_date))
+1384 when '${cad}' = 'MONTH'
+1385 then date(
+1386 date_trunc(
+1387 'MONTH',
+1388 add_months(date(date_trunc('${cad}',calendar_date)), 1)
+1389 )
+1390 )-1
+1391 when '${cad}' = 'QUARTER'
+1392 then date(
+1393 date_trunc(
+1394 'QUARTER',
+1395 add_months(date(date_trunc('${cad}',calendar_date)) , 3)
+1396 )
+1397 )-1
+1398 when '${cad}' = 'YEAR'
+1399 then date(
+1400 date_trunc(
+1401 'YEAR',
+1402 add_months(date(date_trunc('${cad}',calendar_date)), 12)
+1403 )
+1404 )-1
+1405 end as cadence_end_date
+1406 """,
+1407"project_start":_DEFAULT_PROJECT_START,
+1408"project_end":_DEFAULT_PROJECT_END,
+1409},
+1410}
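To make the mapping above concrete, the sketch below (illustrative only) looks up the first entry that accepts a given combination and fills its ${...} placeholders. It assumes GABCombinedConfiguration lives in lakehouse_engine.core.definitions alongside the other definitions shown here, that each field is either a single accepted value or a set of them, and that string.Template-style substitution approximates how the engine fills the stages SQL file; the engine's real selection and substitution logic may differ.

    from string import Template

    # Assumed import path: same definitions module as the enums shown above.
    from lakehouse_engine.core.definitions import GABCombinedConfiguration

    def _matches(expected, actual):
        """Each field is either a single accepted value or a set of accepted values."""
        return actual == expected or (isinstance(expected, set) and actual in expected)

    def find_combined_config(cadence: str, recon: str, week_start: str, snap_flag: str) -> dict:
        """Return the first combined configuration entry accepting this combination."""
        for entry in GABCombinedConfiguration.COMBINED_CONFIGURATION.value.values():
            if (
                _matches(entry["cadence"], cadence)
                and _matches(entry["recon"], recon)
                and _matches(entry["week_start"], week_start)
                and _matches(entry["snap_flag"], snap_flag)
            ):
                return entry
        raise ValueError("No combined configuration found for this combination.")

    cfg = find_combined_config("DAY", "MONTH", "M", "Y")  # daily cadence reconciled monthly
    # Fill the ${...} placeholders (string.Template substitution is an assumption here).
    print(Template(cfg["project_start"]).safe_substitute(cad="DAY", date_column="order_date"))
    # -> date(date_trunc('DAY',order_date))

The point is only that every combination of cadence, recon, week_start and snap_flag resolves to one join_select / project_start / project_end triple.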
@@ -8865,7 +8885,7 @@
Inherited Members
COMBINED_CONFIGURATION =
- <GABCombinedConfiguration.COMBINED_CONFIGURATION: {1: {'cadence': 'DAY', 'recon': {'YEAR', 'WEEK', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('${cad}',${date_column}))"}, 2: {'cadence': 'WEEK', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\n select distinct case\n when '${config_week_start}' = 'Monday' then weekstart_mon\n when '${config_week_start}' = 'Sunday' then weekstart_sun\n end as cadence_start_date,\n calendar_date as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 3: {'cadence': 'WEEK', 'recon': {'DAY', 'MONTH', 'QUARTER', 'YEAR'}, 'week_start': 'M', 'snap_flag': {'Y', 'N'}, 'join_select': "\n select distinct case\n when '${config_week_start}' = 'Monday' then weekstart_mon\n when '${config_week_start}' = 'Sunday' then weekstart_sun\n end as cadence_start_date,\n case\n when '${config_week_start}' = 'Monday' then weekend_mon\n when '${config_week_start}' = 'Sunday' then weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 4: {'cadence': 'MONTH', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct month_start as cadence_start_date,\n calendar_date as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 5: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct month_start as cadence_start_date,\n case\n when date(\n date_trunc('MONTH',add_months(calendar_date, 1))\n )-1 < weekend_mon\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 6: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct month_start as cadence_start_date,\n case\n when date(\n date_trunc('MONTH',add_months(calendar_date, 1))\n )-1 < weekend_sun\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 7: {'cadence': 'MONTH', 'recon': {'YEAR', 'WEEK', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1"}, 8: {'cadence': 'QUARTER', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct quarter_start as cadence_start_date,\n calendar_date as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 9: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct quarter_start as cadence_start_date,\n case\n when weekend_mon > date(\n date_trunc('QUARTER',add_months(calendar_date, 3))\n )-1\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 10: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n 
select distinct quarter_start as cadence_start_date,\n case\n when weekend_sun > date(\n date_trunc('QUARTER',add_months(calendar_date, 3))\n )-1\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 11: {'cadence': 'QUARTER', 'recon': 'MONTH', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct quarter_start as cadence_start_date,\n month_end as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 12: {'cadence': 'QUARTER', 'recon': 'YEAR', 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)\n )\n )-1\n "}, 13: {'cadence': 'QUARTER', 'recon': {'YEAR', 'WEEK', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)\n )\n )-1\n "}, 14: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when weekend_mon > date(\n date_trunc('YEAR',add_months(calendar_date, 12))\n )-1\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 15: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when weekend_sun > date(\n date_trunc('YEAR',add_months(calendar_date, 12))\n )-1\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 16: {'cadence': 'YEAR', 'recon': {'YEAR', 'WEEK', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'inverse_flag': 'Y', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)\n )\n )-1\n "}, 17: {'cadence': 'YEAR', 'recon': {'DAY', 'MONTH', 'QUARTER'}, 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when '${rec_cadence}' = 'DAY' then calendar_date\n when '${rec_cadence}' = 'MONTH' then month_end\n when '${rec_cadence}' = 'QUARTER' then quarter_end\n end as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 18: {'cadence': {'YEAR', 'WEEK', 'DAY', 'QUARTER', 'MONTH'}, 'recon': {'YEAR', 'WEEK', 'DAY', 'QUARTER', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': "\n select distinct\n case\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\n then weekstart_mon\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\n then weekstart_sun\n else\n date(date_trunc('${cad}',calendar_date))\n end as cadence_start_date,\n case\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\n then weekend_mon\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\n then 
weekend_sun\n when '${cad}' = 'DAY'\n then date(date_trunc('${cad}',calendar_date))\n when '${cad}' = 'MONTH'\n then date(\n date_trunc(\n 'MONTH',\n add_months(date(date_trunc('${cad}',calendar_date)), 1)\n )\n )-1\n when '${cad}' = 'QUARTER'\n then date(\n date_trunc(\n 'QUARTER',\n add_months(date(date_trunc('${cad}',calendar_date)) , 3)\n )\n )-1\n when '${cad}' = 'YEAR'\n then date(\n date_trunc(\n 'YEAR',\n add_months(date(date_trunc('${cad}',calendar_date)), 12)\n )\n )-1\n end as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}}>
+ <GABCombinedConfiguration.COMBINED_CONFIGURATION: {1: {'cadence': 'DAY', 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('${cad}',${date_column}))"}, 2: {'cadence': 'WEEK', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\n select distinct case\n when '${config_week_start}' = 'Monday' then weekstart_mon\n when '${config_week_start}' = 'Sunday' then weekstart_sun\n end as cadence_start_date,\n calendar_date as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 3: {'cadence': 'WEEK', 'recon': {'YEAR', 'QUARTER', 'MONTH', 'DAY'}, 'week_start': 'M', 'snap_flag': {'Y', 'N'}, 'join_select': "\n select distinct case\n when '${config_week_start}' = 'Monday' then weekstart_mon\n when '${config_week_start}' = 'Sunday' then weekstart_sun\n end as cadence_start_date,\n case\n when '${config_week_start}' = 'Monday' then weekend_mon\n when '${config_week_start}' = 'Sunday' then weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 4: {'cadence': 'MONTH', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct month_start as cadence_start_date,\n calendar_date as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 5: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct month_start as cadence_start_date,\n case\n when date(\n date_trunc('MONTH',add_months(calendar_date, 1))\n )-1 < weekend_mon\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 6: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct month_start as cadence_start_date,\n case\n when date(\n date_trunc('MONTH',add_months(calendar_date, 1))\n )-1 < weekend_sun\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 7: {'cadence': 'MONTH', 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1"}, 8: {'cadence': 'QUARTER', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct quarter_start as cadence_start_date,\n calendar_date as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 9: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct quarter_start as cadence_start_date,\n case\n when weekend_mon > date(\n date_trunc('QUARTER',add_months(calendar_date, 3))\n )-1\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 10: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n 
select distinct quarter_start as cadence_start_date,\n case\n when weekend_sun > date(\n date_trunc('QUARTER',add_months(calendar_date, 3))\n )-1\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 11: {'cadence': 'QUARTER', 'recon': 'MONTH', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\n select distinct quarter_start as cadence_start_date,\n month_end as cadence_end_date\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 12: {'cadence': 'QUARTER', 'recon': 'YEAR', 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)\n )\n )-1\n "}, 13: {'cadence': 'QUARTER', 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)\n )\n )-1\n "}, 14: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when weekend_mon > date(\n date_trunc('YEAR',add_months(calendar_date, 12))\n )-1\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\n else weekend_mon\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 15: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when weekend_sun > date(\n date_trunc('YEAR',add_months(calendar_date, 12))\n )-1\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\n else weekend_sun\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 16: {'cadence': 'YEAR', 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'inverse_flag': 'Y', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\n date(\n date_trunc(\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)\n )\n )-1\n "}, 17: {'cadence': 'YEAR', 'recon': {'QUARTER', 'MONTH', 'DAY'}, 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\n select distinct year_start as cadence_start_date,\n case\n when '${rec_cadence}' = 'DAY' then calendar_date\n when '${rec_cadence}' = 'MONTH' then month_end\n when '${rec_cadence}' = 'QUARTER' then quarter_end\n end as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 18: {'cadence': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': "\n select distinct\n case\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\n then weekstart_mon\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\n then weekstart_sun\n else\n date(date_trunc('${cad}',calendar_date))\n end as cadence_start_date,\n case\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\n then weekend_mon\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\n then 
weekend_sun\n when '${cad}' = 'DAY'\n then date(date_trunc('${cad}',calendar_date))\n when '${cad}' = 'MONTH'\n then date(\n date_trunc(\n 'MONTH',\n add_months(date(date_trunc('${cad}',calendar_date)), 1)\n )\n )-1\n when '${cad}' = 'QUARTER'\n then date(\n date_trunc(\n 'QUARTER',\n add_months(date(date_trunc('${cad}',calendar_date)) , 3)\n )\n )-1\n when '${cad}' = 'YEAR'\n then date(\n date_trunc(\n 'YEAR',\n add_months(date(date_trunc('${cad}',calendar_date)), 12)\n )\n )-1\n end as cadence_end_date\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}}>
1"""Module to take care of creating a singleton of the execution environment class.""" 2
- 3importos
- 4
- 5frompyspark.sqlimportSparkSession
- 6
- 7fromlakehouse_engine.core.definitionsimportEngineConfig
- 8fromlakehouse_engine.utils.configs.config_utilsimportConfigUtils
- 9fromlakehouse_engine.utils.logging_handlerimportLoggingHandler
- 10
+ 3importast
+ 4importos
+ 5
+ 6frompyspark.sqlimportSparkSession
+ 7
+ 8fromlakehouse_engine.core.definitionsimportEngineConfig
+ 9fromlakehouse_engine.utils.configs.config_utilsimportConfigUtils
+ 10fromlakehouse_engine.utils.logging_handlerimportLoggingHandler 11
- 12classExecEnv(object):
- 13"""Represents the basic resources regarding the engine execution environment.
- 14
- 15 Currently, it is used to encapsulate both the logic to get the Spark
- 16 session and the engine configurations.
- 17 """
- 18
- 19SESSION:SparkSession
- 20_LOGGER=LoggingHandler(__name__).get_logger()
- 21DEFAULT_AWS_REGION="eu-west-1"
- 22ENGINE_CONFIG:EngineConfig=EngineConfig(**ConfigUtils.get_config())
- 23
- 24@classmethod
- 25defset_default_engine_config(cls,package:str)->None:
- 26"""Set default engine configurations by reading them from a specified package.
- 27
- 28 Args:
- 29 package: package where the engine configurations can be found.
- 30 """
- 31cls.ENGINE_CONFIG=EngineConfig(**ConfigUtils.get_config(package))
- 32
- 33@classmethod
- 34defget_or_create(
- 35cls,
- 36session:SparkSession=None,
- 37enable_hive_support:bool=True,
- 38app_name:str=None,
- 39config:dict=None,
- 40)->None:
- 41"""Get or create an execution environment session (currently Spark).
- 42
- 43 It instantiates a singleton session that can be accessed anywhere from the
- 44 lakehouse engine.
- 45
- 46 Args:
- 47 session: spark session.
- 48 enable_hive_support: whether to enable hive support or not.
- 49 app_name: application name.
- 50 config: extra spark configs to supply to the spark session.
- 51 """
- 52default_config={
- 53"spark.databricks.delta.optimizeWrite.enabled":True,
- 54"spark.sql.adaptive.enabled":True,
- 55"spark.databricks.delta.merge.enableLowShuffle":True,
- 56}
- 57cls._LOGGER.info(
- 58f"Using the following default configs you may want to override them for "
- 59f"your job: {default_config}"
- 60)
- 61final_config:dict={**default_config,**(configifconfigelse{})}
- 62cls._LOGGER.info(f"Final config is: {final_config}")
- 63
- 64ifsession:
- 65cls.SESSION=session
- 66else:
- 67# with active session we do not need app name
- 68ifSparkSession.getActiveSession():
- 69app_name=SparkSession.getActiveSession().conf.get("spark.app.name")
- 70cls._LOGGER.info(f"Detected active session: {app_name}")
- 71elifnotSparkSession.getActiveSession()andnotapp_name:
- 72cls._LOGGER.info("No active session or appname detected")
- 73app_name="lakehouse_engine"
- 74# we will still add this part to set configs
- 75session_builder=SparkSession.builder.appName(app_name)
- 76ifconfig:
- 77fork,vinfinal_config.items():
- 78session_builder.config(k,v)
- 79
- 80ifenable_hive_support:
- 81session_builder=session_builder.enableHiveSupport()
- 82cls.SESSION=session_builder.getOrCreate()
- 83
- 84cls._set_environment_variables(final_config.get("os_env_vars"))
- 85
- 86@classmethod
- 87def_set_environment_variables(cls,os_env_vars:dict=None)->None:
- 88"""Set environment variables at OS level.
- 89
- 90 By default, we are setting the AWS_DEFAULT_REGION as we have identified this is
- 91 beneficial to avoid getBucketLocation permission problems.
- 92
- 93 Args:
- 94 os_env_vars: this parameter can be used to pass the environment variables to
- 95 be defined.
- 96 """
- 97ifos_env_varsisNone:
- 98os_env_vars={}
- 99
-100forenv_varinos_env_vars.items():
-101os.environ[env_var[0]]=env_var[1]
-102
-103if"AWS_DEFAULT_REGION"notinos_env_vars:
-104os.environ["AWS_DEFAULT_REGION"]=cls.SESSION.conf.get(
-105"spark.databricks.clusterUsageTags.region",cls.DEFAULT_AWS_REGION
-106)
+ 12
+ 13classExecEnv(object):
+ 14"""Represents the basic resources regarding the engine execution environment.
+ 15
+ 16 Currently, it is used to encapsulate both the logic to get the Spark
+ 17 session and the engine configurations.
+ 18 """
+ 19
+ 20SESSION:SparkSession
+ 21_LOGGER=LoggingHandler(__name__).get_logger()
+ 22DEFAULT_AWS_REGION="eu-west-1"
+ 23ENGINE_CONFIG:EngineConfig=EngineConfig(**ConfigUtils.get_config())
+ 24
+ 25@classmethod
+ 26defset_default_engine_config(cls,package:str)->None:
+ 27"""Set default engine configurations by reading them from a specified package.
+ 28
+ 29 Args:
+ 30 package: package where the engine configurations can be found.
+ 31 """
+ 32cls.ENGINE_CONFIG=EngineConfig(**ConfigUtils.get_config(package))
+ 33
+ 34@classmethod
+ 35defget_or_create(
+ 36cls,
+ 37session:SparkSession=None,
+ 38enable_hive_support:bool=True,
+ 39app_name:str=None,
+ 40config:dict=None,
+ 41)->None:
+ 42"""Get or create an execution environment session (currently Spark).
+ 43
+ 44 It instantiates a singleton session that can be accessed anywhere from the
+ 45 lakehouse engine.
+ 46
+ 47 Args:
+ 48 session: spark session.
+ 49 enable_hive_support: whether to enable hive support or not.
+ 50 app_name: application name.
+ 51 config: extra spark configs to supply to the spark session.
+ 52 """
+ 53default_config={
+ 54"spark.databricks.delta.optimizeWrite.enabled":True,
+ 55"spark.sql.adaptive.enabled":True,
+ 56"spark.databricks.delta.merge.enableLowShuffle":True,
+ 57}
+ 58cls._LOGGER.info(
+ 59f"Using the following default configs you may want to override them for "
+ 60f"your job: {default_config}"
+ 61)
+ 62final_config:dict={**default_config,**(configifconfigelse{})}
+ 63cls._LOGGER.info(f"Final config is: {final_config}")
+ 64
+ 65ifsession:
+ 66cls.SESSION=session
+ 67else:
+ 68# with active session we do not need app name
+ 69ifSparkSession.getActiveSession():
+ 70app_name=SparkSession.getActiveSession().conf.get("spark.app.name")
+ 71cls._LOGGER.info(f"Detected active session: {app_name}")
+ 72elifnotSparkSession.getActiveSession()andnotapp_name:
+ 73cls._LOGGER.info("No active session or appname detected")
+ 74app_name="lakehouse_engine"
+ 75# we will still add this part to set configs
+ 76session_builder=SparkSession.builder.appName(app_name)
+ 77ifconfig:
+ 78fork,vinfinal_config.items():
+ 79session_builder.config(k,v)
+ 80
+ 81ifenable_hive_support:
+ 82session_builder=session_builder.enableHiveSupport()
+ 83cls.SESSION=session_builder.getOrCreate()
+ 84
+ 85cls._set_environment_variables(final_config.get("os_env_vars"))
+ 86
+ 87@classmethod
+ 88def_set_environment_variables(cls,os_env_vars:dict=None)->None:
+ 89"""Set environment variables at OS level.
+ 90
+ 91 By default, we are setting the AWS_DEFAULT_REGION as we have identified this is
+ 92 beneficial to avoid getBucketLocation permission problems.
+ 93
+ 94 Args:
+ 95 os_env_vars: this parameter can be used to pass the environment variables to
+ 96 be defined.
+ 97 """
+ 98ifos_env_varsisNone:
+ 99os_env_vars={}
+100
+101forenv_varinos_env_vars.items():
+102os.environ[env_var[0]]=env_var[1]
+103
+104if"AWS_DEFAULT_REGION"notinos_env_vars:
+105os.environ["AWS_DEFAULT_REGION"]=cls.SESSION.conf.get(
+106"spark.databricks.clusterUsageTags.region",cls.DEFAULT_AWS_REGION
+107)
+108
+109@classmethod
+110defget_environment(cls)->str:
+111"""Get the environment where the process is running.
+112
+113 Returns:
+114 Name of the environment.
+115 """
+116tag_array=ast.literal_eval(
+117cls.SESSION.conf.get(
+118"spark.databricks.clusterUsageTags.clusterAllTags","[]"
+119)
+120)
+121
+122forkey_valintag_array:
+123ifkey_val["key"]=="environment":
+124returnstr(key_val["value"])
+125return"prod"
@@ -210,101 +232,119 @@
-
13classExecEnv(object):
- 14"""Represents the basic resources regarding the engine execution environment.
- 15
- 16 Currently, it is used to encapsulate both the logic to get the Spark
- 17 session and the engine configurations.
- 18 """
- 19
- 20SESSION:SparkSession
- 21_LOGGER=LoggingHandler(__name__).get_logger()
- 22DEFAULT_AWS_REGION="eu-west-1"
- 23ENGINE_CONFIG:EngineConfig=EngineConfig(**ConfigUtils.get_config())
- 24
- 25@classmethod
- 26defset_default_engine_config(cls,package:str)->None:
- 27"""Set default engine configurations by reading them from a specified package.
- 28
- 29 Args:
- 30 package: package where the engine configurations can be found.
- 31 """
- 32cls.ENGINE_CONFIG=EngineConfig(**ConfigUtils.get_config(package))
- 33
- 34@classmethod
- 35defget_or_create(
- 36cls,
- 37session:SparkSession=None,
- 38enable_hive_support:bool=True,
- 39app_name:str=None,
- 40config:dict=None,
- 41)->None:
- 42"""Get or create an execution environment session (currently Spark).
- 43
- 44 It instantiates a singleton session that can be accessed anywhere from the
- 45 lakehouse engine.
- 46
- 47 Args:
- 48 session: spark session.
- 49 enable_hive_support: whether to enable hive support or not.
- 50 app_name: application name.
- 51 config: extra spark configs to supply to the spark session.
- 52 """
- 53default_config={
- 54"spark.databricks.delta.optimizeWrite.enabled":True,
- 55"spark.sql.adaptive.enabled":True,
- 56"spark.databricks.delta.merge.enableLowShuffle":True,
- 57}
- 58cls._LOGGER.info(
- 59f"Using the following default configs you may want to override them for "
- 60f"your job: {default_config}"
- 61)
- 62final_config:dict={**default_config,**(configifconfigelse{})}
- 63cls._LOGGER.info(f"Final config is: {final_config}")
- 64
- 65ifsession:
- 66cls.SESSION=session
- 67else:
- 68# with active session we do not need app name
- 69ifSparkSession.getActiveSession():
- 70app_name=SparkSession.getActiveSession().conf.get("spark.app.name")
- 71cls._LOGGER.info(f"Detected active session: {app_name}")
- 72elifnotSparkSession.getActiveSession()andnotapp_name:
- 73cls._LOGGER.info("No active session or appname detected")
- 74app_name="lakehouse_engine"
- 75# we will still add this part to set configs
- 76session_builder=SparkSession.builder.appName(app_name)
- 77ifconfig:
- 78fork,vinfinal_config.items():
- 79session_builder.config(k,v)
- 80
- 81ifenable_hive_support:
- 82session_builder=session_builder.enableHiveSupport()
- 83cls.SESSION=session_builder.getOrCreate()
- 84
- 85cls._set_environment_variables(final_config.get("os_env_vars"))
- 86
- 87@classmethod
- 88def_set_environment_variables(cls,os_env_vars:dict=None)->None:
- 89"""Set environment variables at OS level.
- 90
- 91 By default, we are setting the AWS_DEFAULT_REGION as we have identified this is
- 92 beneficial to avoid getBucketLocation permission problems.
- 93
- 94 Args:
- 95 os_env_vars: this parameter can be used to pass the environment variables to
- 96 be defined.
- 97 """
- 98ifos_env_varsisNone:
- 99os_env_vars={}
-100
-101forenv_varinos_env_vars.items():
-102os.environ[env_var[0]]=env_var[1]
-103
-104if"AWS_DEFAULT_REGION"notinos_env_vars:
-105os.environ["AWS_DEFAULT_REGION"]=cls.SESSION.conf.get(
-106"spark.databricks.clusterUsageTags.region",cls.DEFAULT_AWS_REGION
-107)
+
14classExecEnv(object):
+ 15"""Represents the basic resources regarding the engine execution environment.
+ 16
+ 17 Currently, it is used to encapsulate both the logic to get the Spark
+ 18 session and the engine configurations.
+ 19 """
+ 20
+ 21SESSION:SparkSession
+ 22_LOGGER=LoggingHandler(__name__).get_logger()
+ 23DEFAULT_AWS_REGION="eu-west-1"
+ 24ENGINE_CONFIG:EngineConfig=EngineConfig(**ConfigUtils.get_config())
+ 25
+ 26@classmethod
+ 27defset_default_engine_config(cls,package:str)->None:
+ 28"""Set default engine configurations by reading them from a specified package.
+ 29
+ 30 Args:
+ 31 package: package where the engine configurations can be found.
+ 32 """
+ 33cls.ENGINE_CONFIG=EngineConfig(**ConfigUtils.get_config(package))
+ 34
+ 35@classmethod
+ 36defget_or_create(
+ 37cls,
+ 38session:SparkSession=None,
+ 39enable_hive_support:bool=True,
+ 40app_name:str=None,
+ 41config:dict=None,
+ 42)->None:
+ 43"""Get or create an execution environment session (currently Spark).
+ 44
+ 45 It instantiates a singleton session that can be accessed anywhere from the
+ 46 lakehouse engine.
+ 47
+ 48 Args:
+ 49 session: spark session.
+ 50 enable_hive_support: whether to enable hive support or not.
+ 51 app_name: application name.
+ 52 config: extra spark configs to supply to the spark session.
+ 53 """
+ 54default_config={
+ 55"spark.databricks.delta.optimizeWrite.enabled":True,
+ 56"spark.sql.adaptive.enabled":True,
+ 57"spark.databricks.delta.merge.enableLowShuffle":True,
+ 58}
+ 59cls._LOGGER.info(
+ 60f"Using the following default configs you may want to override them for "
+ 61f"your job: {default_config}"
+ 62)
+ 63final_config:dict={**default_config,**(configifconfigelse{})}
+ 64cls._LOGGER.info(f"Final config is: {final_config}")
+ 65
+ 66ifsession:
+ 67cls.SESSION=session
+ 68else:
+ 69# with active session we do not need app name
+ 70ifSparkSession.getActiveSession():
+ 71app_name=SparkSession.getActiveSession().conf.get("spark.app.name")
+ 72cls._LOGGER.info(f"Detected active session: {app_name}")
+ 73elifnotSparkSession.getActiveSession()andnotapp_name:
+ 74cls._LOGGER.info("No active session or appname detected")
+ 75app_name="lakehouse_engine"
+ 76# we will still add this part to set configs
+ 77session_builder=SparkSession.builder.appName(app_name)
+ 78ifconfig:
+ 79fork,vinfinal_config.items():
+ 80session_builder.config(k,v)
+ 81
+ 82ifenable_hive_support:
+ 83session_builder=session_builder.enableHiveSupport()
+ 84cls.SESSION=session_builder.getOrCreate()
+ 85
+ 86cls._set_environment_variables(final_config.get("os_env_vars"))
+ 87
+ 88@classmethod
+ 89def_set_environment_variables(cls,os_env_vars:dict=None)->None:
+ 90"""Set environment variables at OS level.
+ 91
+ 92 By default, we are setting the AWS_DEFAULT_REGION as we have identified this is
+ 93 beneficial to avoid getBucketLocation permission problems.
+ 94
+ 95 Args:
+ 96 os_env_vars: this parameter can be used to pass the environment variables to
+ 97 be defined.
+ 98 """
+ 99ifos_env_varsisNone:
+100os_env_vars={}
+101
+102forenv_varinos_env_vars.items():
+103os.environ[env_var[0]]=env_var[1]
+104
+105if"AWS_DEFAULT_REGION"notinos_env_vars:
+106os.environ["AWS_DEFAULT_REGION"]=cls.SESSION.conf.get(
+107"spark.databricks.clusterUsageTags.region",cls.DEFAULT_AWS_REGION
+108)
+109
+110@classmethod
+111defget_environment(cls)->str:
+112"""Get the environment where the process is running.
+113
+114 Returns:
+115 Name of the environment.
+116 """
+117tag_array=ast.literal_eval(
+118cls.SESSION.conf.get(
+119"spark.databricks.clusterUsageTags.clusterAllTags","[]"
+120)
+121)
+122
+123forkey_valintag_array:
+124ifkey_val["key"]=="environment":
+125returnstr(key_val["value"])
+126return"prod"
25@classmethod
-26defset_default_engine_config(cls,package:str)->None:
-27"""Set default engine configurations by reading them from a specified package.
-28
-29 Args:
-30 package: package where the engine configurations can be found.
-31 """
-32cls.ENGINE_CONFIG=EngineConfig(**ConfigUtils.get_config(package))
+
26@classmethod
+27defset_default_engine_config(cls,package:str)->None:
+28"""Set default engine configurations by reading them from a specified package.
+29
+30 Args:
+31 package: package where the engine configurations can be found.
+32 """
+33cls.ENGINE_CONFIG=EngineConfig(**ConfigUtils.get_config(package))
@@ -397,58 +437,58 @@
Arguments:
-
34@classmethod
-35defget_or_create(
-36cls,
-37session:SparkSession=None,
-38enable_hive_support:bool=True,
-39app_name:str=None,
-40config:dict=None,
-41)->None:
-42"""Get or create an execution environment session (currently Spark).
-43
-44 It instantiates a singleton session that can be accessed anywhere from the
-45 lakehouse engine.
-46
-47 Args:
-48 session: spark session.
-49 enable_hive_support: whether to enable hive support or not.
-50 app_name: application name.
-51 config: extra spark configs to supply to the spark session.
-52 """
-53default_config={
-54"spark.databricks.delta.optimizeWrite.enabled":True,
-55"spark.sql.adaptive.enabled":True,
-56"spark.databricks.delta.merge.enableLowShuffle":True,
-57}
-58cls._LOGGER.info(
-59f"Using the following default configs you may want to override them for "
-60f"your job: {default_config}"
-61)
-62final_config:dict={**default_config,**(configifconfigelse{})}
-63cls._LOGGER.info(f"Final config is: {final_config}")
-64
-65ifsession:
-66cls.SESSION=session
-67else:
-68# with active session we do not need app name
-69ifSparkSession.getActiveSession():
-70app_name=SparkSession.getActiveSession().conf.get("spark.app.name")
-71cls._LOGGER.info(f"Detected active session: {app_name}")
-72elifnotSparkSession.getActiveSession()andnotapp_name:
-73cls._LOGGER.info("No active session or appname detected")
-74app_name="lakehouse_engine"
-75# we will still add this part to set configs
-76session_builder=SparkSession.builder.appName(app_name)
-77ifconfig:
-78fork,vinfinal_config.items():
-79session_builder.config(k,v)
-80
-81ifenable_hive_support:
-82session_builder=session_builder.enableHiveSupport()
-83cls.SESSION=session_builder.getOrCreate()
-84
-85cls._set_environment_variables(final_config.get("os_env_vars"))
+
35@classmethod
+36defget_or_create(
+37cls,
+38session:SparkSession=None,
+39enable_hive_support:bool=True,
+40app_name:str=None,
+41config:dict=None,
+42)->None:
+43"""Get or create an execution environment session (currently Spark).
+44
+45 It instantiates a singleton session that can be accessed anywhere from the
+46 lakehouse engine.
+47
+48 Args:
+49 session: spark session.
+50 enable_hive_support: whether to enable hive support or not.
+51 app_name: application name.
+52 config: extra spark configs to supply to the spark session.
+53 """
+54default_config={
+55"spark.databricks.delta.optimizeWrite.enabled":True,
+56"spark.sql.adaptive.enabled":True,
+57"spark.databricks.delta.merge.enableLowShuffle":True,
+58}
+59cls._LOGGER.info(
+60f"Using the following default configs you may want to override them for "
+61f"your job: {default_config}"
+62)
+63final_config:dict={**default_config,**(configifconfigelse{})}
+64cls._LOGGER.info(f"Final config is: {final_config}")
+65
+66ifsession:
+67cls.SESSION=session
+68else:
+69# with active session we do not need app name
+70ifSparkSession.getActiveSession():
+71app_name=SparkSession.getActiveSession().conf.get("spark.app.name")
+72cls._LOGGER.info(f"Detected active session: {app_name}")
+73elifnotSparkSession.getActiveSession()andnotapp_name:
+74cls._LOGGER.info("No active session or appname detected")
+75app_name="lakehouse_engine"
+76# we will still add this part to set configs
+77session_builder=SparkSession.builder.appName(app_name)
+78ifconfig:
+79fork,vinfinal_config.items():
+80session_builder.config(k,v)
+81
+82ifenable_hive_support:
+83session_builder=session_builder.enableHiveSupport()
+84cls.SESSION=session_builder.getOrCreate()
+85
+86cls._set_environment_variables(final_config.get("os_env_vars"))
@@ -468,6 +508,49 @@
Arguments:
+
+
+
+
+
@classmethod
+
+ def
+ get_environment(cls) -> str:
+
+
+
+
+
+
110@classmethod
+111defget_environment(cls)->str:
+112"""Get the environment where the process is running.
+113
+114 Returns:
+115 Name of the environment.
+116 """
+117tag_array=ast.literal_eval(
+118cls.SESSION.conf.get(
+119"spark.databricks.clusterUsageTags.clusterAllTags","[]"
+120)
+121)
+122
+123forkey_valintag_array:
+124ifkey_val["key"]=="environment":
+125returnstr(key_val["value"])
+126return"prod"
+
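For context on what get_environment parses above: spark.databricks.clusterUsageTags.clusterAllTags holds a string-encoded list of key/value dicts, which ast.literal_eval turns back into Python objects before the lookup. A small illustration, with an invented tag payload:

    import ast

    # Invented example of the string shape the method expects from the Spark conf.
    raw = '[{"key": "environment", "value": "dev"}, {"key": "JobId", "value": "123"}]'

    tags = ast.literal_eval(raw)  # -> list of {"key": ..., "value": ...} dicts
    env = next(
        (str(tag["value"]) for tag in tags if tag["key"] == "environment"),
        "prod",  # same fallback as get_environment
    )
    print(env)  # dev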
The SQL Custom Transformer executes a SQL transformation provided by the user. This transformer can be very useful whenever the user wants to perform SQL-based transformations that are not natively supported by the lakehouse engine transformers.
+
+
The transformer receives the SQL query to be executed. The query can read from any table or view in the catalog, or from any dataframe registered as a temp view.
+
+
+
To register a dataframe as a temp view, you can use the "temp_view" config in the input_specs, as shown in the sketch below.
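A minimal, hypothetical ACON sketch of that idea follows; only the "input_specs" list and the "temp_view" key come from the text above, while every other key, value and the transformer name are illustrative assumptions rather than confirmed engine API:

    acon = {
        "input_specs": [
            {
                "spec_id": "orders_bronze",           # assumed identifier
                "read_type": "batch",                 # assumed read type
                "data_format": "delta",
                "location": "s3://my-bucket/orders",  # placeholder location
                "temp_view": "orders_tmp",            # registers the dataframe as a temp view
            }
        ],
        "transform_specs": [
            {
                "spec_id": "orders_sql",
                "input_id": "orders_bronze",
                "transformers": [
                    {
                        # transformer name/args are assumptions, not confirmed API
                        "function": "sql_transformation",
                        "args": {"sql": "SELECT * FROM orders_tmp WHERE amount > 0"},
                    }
                ],
            }
        ],
    }

The SQL inside "args" reads from the temp view registered by the input spec.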
+
+
+
+
+
\ No newline at end of file
\n\n
As the algorithm base class of the lakehouse-engine framework is based on the\nconcept of ACON, this DataLoader algorithm simply inherits from Algorithm,\nwithout overriding anything. We designed the codebase like this to avoid\ninstantiating the Algorithm class directly, which was always meant to be an\nabstraction for any specific algorithm included in the lakehouse-engine framework.
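To make the ACON concept more tangible, below is a minimal sketch of such a dict-based configuration. The section names (input_specs, transform_specs, output_specs) mirror the specifications documented below; the ids, formats, paths and transformer arguments are illustrative placeholders, not engine defaults.

# Minimal illustrative ACON for a DataLoader run; all ids, paths and values are placeholders.
acon = {
    "input_specs": [
        {
            "spec_id": "sales_source",
            "read_type": "batch",
            "data_format": "csv",
            "location": "s3://my-bucket/landing/sales/",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "sales_transformed",
            "input_id": "sales_source",
            "transformers": [
                # transformer name assumed from the repartition transformer documented below
                {"function": "repartition", "args": {"num_partitions": 10}},
            ],
        }
    ],
    "output_specs": [
        {
            "spec_id": "sales_bronze",
            "input_id": "sales_transformed",
            "write_type": "append",
            "data_format": "delta",
            "location": "s3://my-bucket/bronze/sales/",
        }
    ],
}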
If there isn't a transformation specification, this step will be skipped and the original dataframes that were read will be returned. A transformation can depend on the result of another transformation; however, keep in mind that if we are using a streaming source and, for some reason, need to enable micro batch processing, that result cannot be used as input to another transformation. Micro batch processing in pyspark streaming is only available in .write(), which means a transformation with micro batch needs to be the end of the process.
\n\n
Arguments:
\n\n
\n
data: input dataframes in an ordered dict.
\n
\n\n
Returns:
\n\n
\n
Another ordered dict with the transformed dataframes, according to the\n transformation specification.
Process the data quality tasks for the data that was read and/or transformed.
\n\n
It supports multiple input dataframes, although using just one is advisable.
\n\n
It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.
\n\n
Arguments:
\n\n
\n
data: dataframes from previous steps of the algorithm that we wish to run the DQ process on.
\n
\n\n
Returns:
\n\n
\n
Another ordered dict with the validated dataframes.
Write the data that was read and transformed (if applicable).
\n\n
It supports writing multiple datasets. However, we only recommend writing one dataframe. This recommendation is based on easy debugging and reproducibility: if we start mixing several datasets being fueled by the same algorithm, it would unleash an infinite sea of reproducibility issues, plus tight coupling and dependencies between datasets. Having said that, there may be cases where writing multiple datasets is desirable according to the use case requirements. Use it accordingly.
\n\n
Arguments:
\n\n
\n
data: dataframes that were read and transformed (if applicable).
Validate data using an algorithm configuration (ACON represented as dict).
\n\n
This algorithm focuses on isolating Data Quality Validations from loading, applying a set of data quality functions to a specific input dataset, without the need to define any output specification. You can use any input specification compatible with the lakehouse engine (dataframe, table, files, etc.).
A data quality validator needs the following specifications to work properly:
- input specification (mandatory): specify how and what data to read.
- data quality specification (mandatory): specify how to execute the data quality process.
- restore_prev_version (optional): specify if, having delta table/files as input, they should be restored to the previous version if the data quality process fails. Note: this is only considered if fail_on_error is kept as True.
Process the data quality tasks for the data that was read.
\n\n
It supports a single input dataframe.
\n\n
It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.
\n\n
Arguments:
\n\n
\n
data: input dataframe on which to run the DQ process.
Class to define the behavior of an algorithm that checks if data reconciles.
\n\n
Checking if data reconciles, using this algorithm, is a matter of reading the\n'truth' data and the 'current' data. You can use any input specification compatible\nwith the lakehouse engine to read 'truth' or 'current' data. On top of that, you\ncan pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can\npreprocess the data before it goes into the actual reconciliation process.\nMoreover, you can use the 'truth_preprocess_query_args' and\n'current_preprocess_query_args' to pass additional arguments to be used to apply\nadditional operations on top of the dataframe, resulting from the previous steps.\nWith these arguments you can apply additional operations like caching or persisting\nthe Dataframe. The way to pass the additional arguments for the operations is\nsimilar to the TransformSpec, but only a few operations are allowed. Those are\ndefined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.
\n\n
The reconciliation process is focused on joining 'truth' with 'current' by all\nprovided columns except the ones passed as 'metrics'. After that it calculates the\ndifferences in the metrics attributes (either percentage or absolute difference).\nFinally, it aggregates the differences, using the supplied aggregation function\n(e.g., sum, avg, min, max, etc).
\n\n
All of these configurations are passed via the ACON to instantiate a\nReconciliatorSpec object.
\n\n
\n\n
It is crucial that both the current and truth datasets have exactly the same\nstructure.
\n\n
\n\n
\n\n
You should not use 0 as a yellow or red threshold, as the algorithm will verify if the difference between the truth and current values is bigger than or equal to those thresholds.
\n\n
\n\n
\n\n
The reconciliation does not produce any negative values or percentages, as we\nuse the absolute value of the differences. This means that the recon result\nwill not indicate if it was the current values that were bigger or smaller\nthan the truth values, or vice versa.
Definitions for collection of Lakehouse Engine Stats.
\n\n
\n\n
Note: whenever the value comes from a key inside a Spark Config\nthat returns an array, it can be specified with a '#' so that it\nis adequately processed.
This is very aligned with the way the execution environment connects to the sources\n(e.g., spark sources).
\n\n
\n
spec_id: spec_id of the input specification.
read_type: ReadType type of read operation.
\n
data_format: format of the input.
\n
sftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp\ndirectory.
\n
df_name: dataframe name.
\n
db_table: table name in the form of <db>.<table>.
\n
location: uri that identifies from where to read data in the specified format.
\n
enforce_schema_from_table: if we want to enforce the table schema or not, by\nproviding a table name in the form of <db>.<table>.
\n
query: sql query to execute and return the dataframe. Use it if you do not want to\nread from a file system nor from a table, but rather from a sql query instead.
\n
schema: dict representation of a schema of the input (e.g., Spark struct type\nschema).
\n
schema_path: path to a file with a representation of a schema of the input (e.g.,\nSpark struct type schema).
\n
disable_dbfs_retry: optional flag to disable file storage dbfs.
\n
with_filepath: if we want to include the path of the file that is being read. Only\nworks with the file reader (batch and streaming modes are supported).
\n
options: dict with other relevant options according to the execution\nenvironment (e.g., spark) possible sources.
\n
calculate_upper_bound: when to calculate upper bound to extract from SAP BW\nor not.
\n
calc_upper_bound_schema: specific schema for the calculated upper_bound.
\n
generate_predicates: when to generate predicates to extract from SAP BW or not.
\n
predicates_add_null: if we want to include is null on partition by predicates.
\n
temp_view: optional name of a view to point to the input dataframe to be used\nto create or replace a temp view on top of the dataframe.
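As an illustration of the fields above, a streaming file-based input specification could look like the following sketch; every value (ids, locations, option keys) is a placeholder, not an engine default.

# Illustrative input specification; values are placeholders.
input_spec = {
    "spec_id": "orders_source",
    "read_type": "streaming",
    "data_format": "cloudfiles",
    "location": "s3://my-bucket/landing/orders/",
    "schema_path": "s3://my-bucket/schemas/orders.json",  # hypothetical schema file
    "with_filepath": True,  # add the path of the file being read as a column
    "options": {"cloudFiles.format": "json"},  # reader options passed to the execution environment
}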
Transformer Specification, i.e., a single transformation amongst many.
\n\n
\n
function: name of the function (or callable function) to be executed.
\n
args: (not applicable if using a callable function) dict with the arguments\nto pass to the function <k,v> pairs with the name of the parameter of\nthe function and the respective value.
I.e., the specification that defines the many transformations to be done to the data\nthat was read.
\n\n
\n
spec_id: id of the transform specification.
\n
input_id: id of the corresponding input\nspecification.
\n
transformers: list of transformers to execute.
\n
force_streaming_foreach_batch_processing: sometimes, when using streaming, we want\nto force the transform to be executed in the foreachBatch function to ensure\nnon-supported streaming operations can be properly executed.
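A transform specification built from the fields above could look like the sketch below; the transformer names are assumed from the transformers documented later in this page and the arguments are placeholders.

# Illustrative transform specification; transformer names are assumptions.
transform_spec = {
    "spec_id": "orders_transformed",
    "input_id": "orders_source",
    "transformers": [
        {
            "function": "incremental_filter",  # assumed transformer name
            "args": {"input_col": "order_date", "increment_value": "2024-01-01"},
        },
        {"function": "repartition", "args": {"num_partitions": 10}},  # assumed transformer name
    ],
}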
dq_type - type of DQ process to execute (e.g. validator).
\n
dq_functions - list of function specifications to execute.
\n
dq_db_table - name of table to derive the dq functions from.
\n
dq_table_table_filter - name of the table which rules are to be applied in the\nvalidations (Only used when deriving dq functions).
\n
dq_table_extra_filters - extra filters to be used when deriving dq functions.\nThis is a sql expression to be applied to the dq_db_table.
\n
execution_point - execution point of the dq functions. [at_rest, in_motion].\nThis is set during the load_data or dq_validator functions.
\n
unexpected_rows_pk - the list of columns composing the primary key of the\nsource data to identify the rows failing the DQ validations. Note: only one\nof tbl_to_derive_pk or unexpected_rows_pk arguments need to be provided. It\nis mandatory to provide one of these arguments when using tag_source_data\nas True. When tag_source_data is False, this is not mandatory, but still\nrecommended.
\n
tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from. Note: only one of tbl_to_derive_pk or unexpected_rows_pk arguments needs to be provided. It is mandatory to provide one of these arguments when using tag_source_data as True. When tag_source_data is False, this is not mandatory, but still recommended.
\n
gx_result_format - great expectations result format. Default: \"COMPLETE\".
\n
tag_source_data - when set to true, this will ensure that the DQ process ends by tagging the source data with an additional column with information about the DQ results. This column makes it possible to identify if the DQ run succeeded in general and, if not, it unlocks the insights to know which specific rows made the DQ validations fail and why. Default: False. Note: it only works if result_sink_explode is True, gx_result_format is COMPLETE, fail_on_error is False (which is done automatically when you specify tag_source_data as True) and tbl_to_derive_pk or unexpected_rows_pk is configured.
\n
store_backend - which store_backend to use (e.g. s3 or file_system).
\n
local_fs_root_dir - path of the root directory. Note: only applicable for\nstore_backend file_system.
\n
data_docs_local_fs - the path for data docs only for store_backend\nfile_system.
\n
bucket - the bucket name to consider for the store_backend (store DQ artefacts).\nNote: only applicable for store_backend s3.
\n
data_docs_bucket - the bucket name for data docs only. When defined, it will\nsupersede bucket parameter. Note: only applicable for store_backend s3.
\n
expectations_store_prefix - prefix where to store expectations' data. Note: only\napplicable for store_backend s3.
\n
validations_store_prefix - prefix where to store validations' data. Note: only\napplicable for store_backend s3.
\n
data_docs_prefix - prefix where to store data_docs' data.
\n
checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only\napplicable for store_backend s3.
\n
data_asset_name - name of the data asset to consider when configuring the great\nexpectations' data source.
\n
expectation_suite_name - name to consider for great expectations' suite.
\n
result_sink_db_table - db.table_name indicating the database and table in which\nto save the results of the DQ process.
\n
result_sink_location - file system location in which to save the results of the\nDQ process.
\n
data_product_name - name of the data product.
\n
result_sink_partitions - the list of partitions to consider.
\n
result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
\n
result_sink_options - extra spark options for configuring the result sink.\nE.g: can be used to configure a Kafka sink if result_sink_format is kafka.
\n
result_sink_explode - flag to determine if the output table/location should have\nthe columns exploded (as True) or not (as False). Default: True.
\n
result_sink_extra_columns - list of extra columns to be exploded (following\nthe pattern \".*\") or columns to be selected. It is only used when\nresult_sink_explode is set to True.
\n
source - name of the data source, to make it easier to identify in analysis. If not specified, it is set as default. This will only be used when result_sink_explode is set to True.
\n
fail_on_error - whether to fail the algorithm if the validations of your data in\nthe DQ process failed.
\n
cache_df - whether to cache the dataframe before running the DQ process or not.
\n
critical_functions - functions that should not fail. When this argument is\ndefined, fail_on_error is nullified.
\n
max_percentage_failure - percentage of failure that should be allowed.\nThis argument has priority over both fail_on_error and critical_functions.
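Putting a subset of these parameters together, a validator dq spec could look like the sketch below. The spec_id/input_id fields linking it into an ACON, the expectation names (standard great expectations functions) and the table names are illustrative assumptions.

# Illustrative dq spec; ids, expectations and table names are placeholders.
dq_spec = {
    "spec_id": "orders_dq",  # assumed id field used to link the spec inside an ACON
    "input_id": "orders_transformed",
    "dq_type": "validator",
    "dq_functions": [
        {"function": "expect_column_values_to_not_be_null", "args": {"column": "order_id"}},
        {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 1}},
    ],
    "unexpected_rows_pk": ["order_id"],
    "tag_source_data": True,  # implies fail_on_error False and a COMPLETE gx_result_format
    "result_sink_db_table": "dq_db.orders_dq_results",
    "result_sink_explode": True,
}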
merge_predicate: predicate to apply to the merge operation so that we can\ncheck if a new record corresponds to a record already included in the\nhistorical data.
\n
insert_only: indicates if the merge should only insert data (e.g., deduplicate\nscenarios).
\n
delete_predicate: predicate to apply to the delete operation.
\n
update_predicate: predicate to apply to the update operation.
\n
insert_predicate: predicate to apply to the insert operation.
\n
update_column_set: rules to apply to the update operation which allows to\nset the value for each column to be updated.\n(e.g. {\"data\": \"new.data\", \"count\": \"current.count + 1\"} )
\n
insert_column_set: rules to apply to the insert operation which allows to\nset the value for each column to be inserted.\n(e.g. {\"date\": \"updates.date\", \"count\": \"1\"} )
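Combining the options above, a merge options dict could look like the following sketch; the aliases and column sets follow the examples given above, while the predicates and column names are placeholders.

# Illustrative merge options; predicates and column names are placeholders.
merge_opts = {
    "merge_predicate": "current.order_id = new.order_id",
    "insert_only": False,
    "update_predicate": "new.updated_at > current.updated_at",
    "update_column_set": {"data": "new.data", "count": "current.count + 1"},
    "insert_column_set": {"date": "updates.date", "count": "1"},
}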
This is very aligned with the way the execution environment connects to the output\nsystems (e.g., spark outputs).
\n\n
\n
spec_id: id of the output specification.
\n
input_id: id of the corresponding input specification.
\n
write_type: type of write operation.
\n
data_format: format of the output. Defaults to DELTA.
\n
db_table: table name in the form of <db>.<table>.
\n
location: uri that identifies from where to write data in the specified format.
\n
partitions: list of partition input_col names.
\n
merge_opts: options to apply to the merge operation.
\n
streaming_micro_batch_transformers: transformers to invoke for each streaming micro batch, before writing (i.e., in Spark's foreachBatch structured streaming function). Note: the lakehouse engine manages this for you, so we don't advise manually specifying transformations through this parameter. Supply them as regular transformers in the transform_specs sections of an ACON.
\n
streaming_once: if the streaming query is to be executed just once, or not,\ngenerating just one micro batch.
\n
streaming_processing_time: if streaming query is to be kept alive, this indicates\nthe processing time of each micro batch.
\n
streaming_available_now: if set to True, set a trigger that processes all\navailable data in multiple batches then terminates the query.\nWhen using streaming, this is the default trigger that the lakehouse-engine will\nuse, unless you configure a different one.
\n
streaming_continuous: set a trigger that runs a continuous query with a given\ncheckpoint interval.
\n
streaming_await_termination: whether to wait (True) for the termination of the\nstreaming query (e.g. timeout or exception) or not (False). Default: True.
\n
streaming_await_termination_timeout: a timeout to set to the\nstreaming_await_termination. Default: None.
\n
with_batch_id: whether to include the streaming batch id in the final data,\nor not. It only takes effect in streaming mode.
\n
options: dict with other relevant options according to the execution environment\n(e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for\nstreaming, etc.
\n
streaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers\nbut for the DQ functions to be executed. Used internally by the lakehouse\nengine, so you don't have to supply DQ functions through this parameter. Use the\ndq_specs of the acon instead.
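An output specification exercising a few of the fields above could look like the sketch below; the table name, partitions and checkpoint location are placeholders.

# Illustrative output specification for a streaming delta write; values are placeholders.
output_spec = {
    "spec_id": "orders_bronze",
    "input_id": "orders_transformed",
    "write_type": "append",
    "data_format": "delta",
    "db_table": "bronze.orders",
    "partitions": ["order_date"],
    "streaming_available_now": True,
    "options": {"checkpointLocation": "s3://my-bucket/checkpoints/orders/"},
}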
metrics: list of metrics in the form of:\n[{\n metric: name of the column present in both truth and current datasets,\n aggregation: sum, avg, max, min, ...,\n type: percentage or absolute,\n yellow: value,\n red: value\n}].
\n
recon_type: reconciliation type (percentage or absolute). Percentage calculates the difference between truth and current results as a percentage ((x - y) / x), and absolute calculates the raw difference (x - y).
\n
truth_input_spec: input specification of the truth data.
\n
current_input_spec: input specification of the current results data
\n
truth_preprocess_query: additional query on top of the truth input data to\npreprocess the truth data before it gets fueled into the reconciliation process.\nImportant note: you need to assume that the data out of\nthe truth_input_spec is referencable by a table called 'truth'.
\n
truth_preprocess_query_args: optional dict having the functions/transformations to\napply on top of the truth_preprocess_query and respective arguments. Note: cache\nis being applied on the Dataframe, by default. For turning the default behavior\noff, pass \"truth_preprocess_query_args\": [].
\n
current_preprocess_query: additional query on top of the current results input\ndata to preprocess the current results data before it gets fueled into the\nreconciliation process. Important note: you need to assume that the data out of\nthe current_results_input_spec is referencable by a table called 'current'.
\n
current_preprocess_query_args: optional dict having the\nfunctions/transformations to apply on top of the current_preprocess_query\nand respective arguments. Note: cache is being applied on the Dataframe,\nby default. For turning the default behavior off, pass\n\"current_preprocess_query_args\": [].
\n
ignore_empty_df: optional boolean to ignore the recon process if the source & target dataframes are empty; in that case, the recon will exit with a success code (passed).
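An illustrative reconciliator configuration based on the parameters above is sketched below; table names, the metric column and the thresholds are placeholders (note the thresholds are non-zero, as recommended above).

# Illustrative reconciliator configuration; names and thresholds are placeholders.
reconciliator_spec = {
    "metrics": [
        {"metric": "net_sales", "aggregation": "sum", "type": "percentage", "yellow": 0.01, "red": 0.05}
    ],
    "truth_input_spec": {
        "spec_id": "truth",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "gold.sales_truth",
    },
    "current_input_spec": {
        "spec_id": "current",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "gold.sales_current",
    },
    "truth_preprocess_query": "SELECT order_date, net_sales FROM truth",
    "current_preprocess_query": "SELECT order_date, net_sales FROM current",
}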
input_spec: input specification of the data to be checked/validated.
\n
dq_spec: data quality specification.
\n
restore_prev_version: specify if, having\ndelta table/files as input, they should be restored to the\nprevious version if the data quality process fails. Note: this\nis only considered if fail_on_error is kept as True.
assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
\n
control_db_table_name: db.table to store sensor metadata.
\n
input_spec: input specification of the source to be checked for new data.
\n
preprocess_query: SQL query to transform/filter the result from the\nupstream. Consider that we should refer to 'new_data' whenever\nwe are referring to the input of the sensor. E.g.:\n \"SELECT dummy_col FROM new_data WHERE ...\"
\n
checkpoint_location: optional location to store checkpoints to resume from. These checkpoints use the same strategy as Spark's checkpoints. For Spark readers that do not support checkpoints, use the preprocess_query parameter to form a SQL query to filter the result from the upstream accordingly.
\n
fail_on_empty_result: if the sensor should throw an error if there is no new\ndata in the upstream. Default: True.
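A sensor configuration built from the parameters above could look like the sketch below; the sensor_id field and all names and locations are illustrative assumptions.

# Illustrative sensor configuration; ids, tables and locations are placeholders.
sensor_spec = {
    "sensor_id": "sales_upstream_sensor",  # assumed id field
    "assets": ["sales_bronze"],
    "control_db_table_name": "ops_db.sensor_control",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "delta",
        "db_table": "bronze.sales",
    },
    "preprocess_query": "SELECT COUNT(1) AS new_records FROM new_data",
    "checkpoint_location": "s3://my-bucket/checkpoints/sales_sensor/",
    "fail_on_empty_result": True,
}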
query_label_filter: query use-case label to execute.
queue_filter: queue to execute the job.
cadence_filter: selected cadences to build the asset.
target_database: target database to write.
curr_date: current date.
start_date: period start date.
end_date: period end date.
rerun_flag: rerun flag.
target_table: target table to write.
source_database: source database.
gab_base_path: base path to read the use cases.
lookup_table: gab configuration table.
calendar_table: gab calendar table.
Based on the use case configuration, return the values to override in the SQL file. This enum aims to exhaustively map each combination of cadence, reconciliation, week_start and snap_flag and return the corresponding join_select, project_start and project_end values to replace in the stages SQL file.
\n\n
Return corresponding configuration (join_select, project_start, project_end) for\n each combination (cadence x recon x week_start x snap_flag).
GABCombinedConfiguration.COMBINED_CONFIGURATION: mapping from each (cadence, recon, week_start, snap_flag) combination to the corresponding join_select, project_start and project_end values used in the stages SQL files.
Module to take care of creating a singleton of the execution environment class.
Generate the new set of extended start and end dates based on the cadence.
\n\n
The week cadence is run again to extend to the correct week start and end dates in case a recon window for the Week cadence is present. For example, for end_date 2022-12-31, if a Quarter recon window is present for the Week cadence, the start and end dates are recalculated to 2022-10-01 and 2022-12-31. But these are not the start and end dates of a week. Hence, to correct this, the new dates are passed again to get the correct dates.
\n\n
Arguments:
\n\n
\n
cadence: cadence to process.
\n
derived_cadence: cadence reconciliation to process.
\n
start_date: start date of the period to process.
\n
end_date: end date of the period to process.
\n
query_type: use case query type.
\n
current_date: current date to be used as the end date, in case the end date is greater than the current date (in which case the end date should be the current date).
Read data from delta table containing sensor status info.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id. If this parameter is defined, the search occurs considering only this parameter. Otherwise, it considers the sensor assets and checkpoint location.
\n
control_db_table_name: db.table to control sensor runs.
\n
assets: list of assets that are fueled by the pipeline\nwhere this sensor is.
\n
\n\n
Return:
\n\n
\n
Row containing the data for the provided sensor_id.
Generates a sensor preprocess query based on timestamp logic.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id.
\n
filter_exp: expression to filter incoming new data.\nYou can use the placeholder ?upstream_value so that\nit can be replaced by the upstream_value in the\ncontrol_db_table_name for this specific sensor_id.
\n
control_db_table_name: db.table to retrieve the last status change\ntimestamp. This is only relevant for the jdbc sensor.
\n
upstream_key: the key of custom sensor information\nto control how to identify new data from the\nupstream (e.g., a time column in the upstream).
\n
upstream_value: value for the custom sensor to identify new data from the upstream (e.g., the value of a time present in the upstream). If None, we will set the default value. Note: this parameter is used just to override the default value -2147483647.
\n
upstream_table_name: value for the custom sensor to query new data from the upstream. If None, we will set the default value, our sensor_new_data view.
Expect values in column A to be not equal to column B.
\n\n
Arguments:
\n\n
\n
column_A: The first column name.
\n
column_B: The second column name.
\n
\n\n
Keyword Args:
\n\n
\n
\n
allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
Expect values in column A to be lower or equal than column B.
\n\n
Arguments:
\n\n
\n
column_A: The first column name.
\n
column_B: The second column name.
\n
margin: additional approximation to column B value.
\n
\n\n
Keyword Args:
\n\n
\n
\n
allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
Expect value in column to be date that is not older than a given time.
\n\n
Since timedelta can only define an interval up to weeks, a month is defined\nas 4 weeks and a year is defined as 52 weeks.
\n\n
Arguments:
\n\n
\n
column: Name of column to validate
\n
Note: Column must be of type Date, Timestamp or String (with Timestamp format).\nFormat: yyyy-MM-ddTHH:mm:ss
\n
timeframe: dict with the definition of the timeframe.
\n
kwargs: dict with additional parameters.
\n
\n\n
Keyword Args:
\n\n
\n
\n
allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
This expectation asserts that column 'a' must be equal to column 'b' or column 'c'. In addition, it is possible to validate that column 'b' or 'c' match a regex.
Run the specified data quality process on a dataframe.
\n\n
Based on the dq_specs we apply the defined expectations on top of the dataframe\nin order to apply the necessary validations and then output the result of\nthe data quality process.
\n\n
Arguments:
\n\n
\n
dq_spec: data quality specification.
\n
data: input dataframe to run the dq process on.
\n
\n\n
Returns:
\n\n
\n
The DataFrame containing the results of the DQ process.
This function does a full build of data docs based on all the great expectations checkpoints in the specified location, getting the full history of runs/validations executed and their results.
\n\n
Arguments:
\n\n
\n
store_backend: which store_backend to use (e.g. s3 or file_system).
\n
local_fs_root_dir: path of the root directory. Note: only applicable\nfor store_backend file_system
\n
data_docs_local_fs: path of the root directory. Note: only applicable\nfor store_backend file_system.
\n
data_docs_prefix: prefix where to store data_docs' data.
\n
bucket: the bucket name to consider for the store_backend\n(store DQ artefacts). Note: only applicable for store_backend s3.
\n
data_docs_bucket: the bucket name for data docs only. When defined,\nit will supersede bucket parameter.\nNote: only applicable for store_backend s3.
\n
expectations_store_prefix: prefix where to store expectations' data.\nNote: only applicable for store_backend s3.
\n
validations_store_prefix: prefix where to store validations' data.\nNote: only applicable for store_backend s3.
\n
checkpoint_store_prefix: prefix where to store checkpoints' data.\nNote: only applicable for store_backend s3.
We use getattr to dynamically execute any expectation available.\ngetattr(validator, function) is similar to validator.function(). With this\napproach, we can execute any expectation supported.
\n\n
Arguments:
\n\n
\n
context: the BaseDataContext containing the configurations for the data\nsource and store backend.
\n
batch_request: run time batch request to be able to query underlying data.
\n
expectation_suite_name: name of the expectation suite.
\n
dq_functions: a list of DQFunctionSpec to consider in the expectation suite.
\n
critical_functions: list of critical expectations in the expectation suite.
Update the sensor status in the control table. It should be used to tell the system that the sensor has processed all new data that was previously identified, hence updating the shifted sensor status. It is usually used to move from SensorStatus.ACQUIRED_NEW_DATA to SensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still to identify - where we can update the sensor status from/to different statuses.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id.
\n
control_db_table_name: db.table to store sensor checkpoints.
\n
status: status of the sensor.
\n
assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
Generates a preprocess query to be used in a sensor configuration.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id.
\n
filter_exp: expression to filter incoming new data.\nYou can use the placeholder ?default_upstream_key and\n?default_upstream_value, so that it can be replaced by the\nrespective values in the control_db_table_name for this specific\nsensor_id.
\n
control_db_table_name: db.table to retrieve the last status change timestamp. This is only relevant for the jdbc sensor.
\n
upstream_key: the key of custom sensor information to control how to\nidentify new data from the upstream (e.g., a time column in the\nupstream).
\n
upstream_value: the upstream value\nto identify new data from the upstream (e.g., the value of a time\npresent in the upstream).
\n
upstream_table_name: value for custom sensor\nto query new data from the upstream\nIf none we will set the default value,\nour sensor_new_data view.
This function does a full build of data docs based on all the great expectations checkpoints in the specified location, getting the full history of runs/validations executed and their results.
\n\n
Arguments:
\n\n
\n
store_backend: which store_backend to use (e.g. s3 or file_system).
\n
local_fs_root_dir: path of the root directory. Note: only applicable\nfor store_backend file_system
\n
data_docs_local_fs: path of the root directory. Note: only applicable\nfor store_backend file_system.
\n
data_docs_prefix: prefix where to store data_docs' data.
\n
bucket: the bucket name to consider for the store_backend\n(store DQ artefacts). Note: only applicable for store_backend s3.
\n
data_docs_bucket: the bucket name for data docs only. When defined,\nit will supersede bucket parameter.\nNote: only applicable for store_backend s3.
\n
expectations_store_prefix: prefix where to store expectations' data.\nNote: only applicable for store_backend s3.
\n
validations_store_prefix: prefix where to store validations' data.\nNote: only applicable for store_backend s3.
\n
checkpoint_store_prefix: prefix where to store checkpoints' data.\nNote: only applicable for store_backend s3.
Exception for when the input of an incremental filter is not found.
\n\n
This may occur when tables are being loaded in an incremental way, taking the increment definition out of a specific table, but that table does not exist yet, most likely because it has not been loaded for the first time.
Define how to write a streaming micro batch after transforming it.
\n\n
This function must define an inner function that manipulates a streaming batch,\nand then return that function. Look for concrete implementations of this\nfunction for more clarity.
\n\n
Arguments:
\n\n
\n
kwargs: any keyword arguments.
\n
\n\n
Returns:
\n\n
\n
A function to be executed in the foreachBatch spark write method.
After the write operation we repair the table (e.g., update partitions). However, there's a caveat to this: the repair operation is not reachable if we are running in long-running streaming mode. Therefore, we recommend not using the TableWriter with formats other than delta lake for those scenarios (as delta lake does not need msck repair). So, you can: 1) use the delta lake format for the table; or 2) use the FileWriter and run the repair with a certain frequency in a separate task of your pipeline.
Optimize a dataset based on a set of pre-conceived optimizations.
\n\n
Most of the time the dataset is a table, but it can be a file-based one only.
\n\n
Arguments:
\n\n
\n
db_table: database_name.table_name.
\n
location: dataset/table filesystem location.
\n
compute_table_stats: to compute table statistics or not.
\n
vacuum: (delta lake tables only) whether to vacuum the delta lake\ntable or not.
\n
vacuum_hours: (delta lake tables only) number of hours to consider\nin vacuum operation.
\n
optimize: (delta lake tables only) whether to optimize the table or\nnot. Custom optimize parameters can be supplied through ExecEnv (Spark)\nconfigs
\n
optimize_where: expression to use in the optimize function.
\n
optimize_zorder_col_list: (delta lake tables only) list of\ncolumns to consider in the zorder optimization process. Custom optimize\nparameters can be supplied through ExecEnv (Spark) configs.
\n
debug: flag indicating if we are just debugging this for local\ntests and therefore pass through all the exceptions to perform some\nassertions in local tests.
Update the sensor status in the control table. It should be used to tell the system that the sensor has processed all new data that was previously identified, hence updating the shifted sensor status. It is usually used to move from SensorStatus.ACQUIRED_NEW_DATA to SensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still to identify - where we can update the sensor status from/to different statuses.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id.
\n
control_db_table_name: db.table to store sensor checkpoints.
\n
status: status of the sensor.
\n
assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
Explode columns with types like ArrayType and MapType.
\n\n
Afterwards, the flatten_schema transformation can be applied, if desired, for example to explode the map (as we explode a StructType) or to explode a StructType inside the array. We recommend always specifying the columns you want to explode, instead of exploding all columns.
\n\n
Arguments:
\n\n
\n
explode_arrays: whether you want to explode array columns (True)\nor not (False). Default: False.
\n
array_cols_to_explode: array columns which you want to explode.\nIf you don't specify it will get all array columns and explode them.\nDefault: None.
\n
explode_maps: whether you want to explode map columns (True)\nor not (False). Default: False.
\n
map_cols_to_explode: map columns which you want to explode.\nIf you don't specify it will get all map columns and explode them.\nDefault: None.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
cols: dict with columns and respective target names.
\n
escape_col_names: whether to escape column names (e.g. /BIC/COL1) or not. If True it creates a column with the new name and drops the old one. If False, uses the native withColumnRenamed Spark function. Default: True.
\n
\n\n
Returns:
\n\n
\n
Function to be called in .transform() spark function.
Convert a json string into a json column (struct).
\n\n
The new json column can be added to the existing columns (default) or it can\nreplace all the others, being the only one to output. The new column gets the\nsame name as the original one suffixed with '_json'.
\n\n
Arguments:
\n\n
\n
input_col: name of the input column containing the json string.
\n
schema_path: path to the StructType schema (spark schema).
\n
schema: dict with the StructType schema (spark schema).
\n
json_options: options to parse the json value.
\n
drop_all_cols: whether to drop all the input columns or not.\nDefaults to False.
\n
disable_dbfs_retry: optional flag to disable file storage dbfs.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
in_cols: name(s) of the input column(s). Example values: "*" - all columns; "my_col" - one column named "my_col"; "my_col1, my_col2" - two columns.
\n
out_col: name of the output column.
\n
json_options: options to parse the json value.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
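For reference, the behaviour described above maps closely to plain PySpark's from_json/to_json functions; the sketch below is not the engine's transformer, just an illustration with a placeholder column name and schema.

from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, StringType

# placeholder schema for the json payload
payload_schema = StructType([StructField("customer", StringType()), StructField("status", StringType())])

def parse_payload(df: DataFrame) -> DataFrame:
    # parse the json string into a struct column suffixed with '_json', as described above
    return df.withColumn("payload_json", F.from_json("payload", payload_schema, {"mode": "PERMISSIVE"}))

def serialise_payload(df: DataFrame) -> DataFrame:
    # pack a set of input columns into a single json string output column
    return df.withColumn("payload_out", F.to_json(F.struct("customer", "status")))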
Condense Change Data Capture (CDC) based on record_mode strategy.
\n\n
This CDC data is particularly seen in some CDC enabled systems. Other systems\nmay have different CDC strategies.
\n\n
Arguments:
\n\n
\n
business_key: The business key (logical primary key) of the data.
\n
ranking_key_desc: In this type of CDC condensation the data needs to be\nin descending order in a certain way, using columns specified in this\nparameter.
\n
ranking_key_asc: In this type of CDC condensation the data needs to be\nin ascending order in a certain way, using columns specified in\nthis parameter.
\n
record_mode_col: Name of the record mode input_col.
\n
valid_record_modes: Depending on the context, not all record modes may be\nconsidered for condensation. Use this parameter to skip those.
\n
\n\n
Returns:
\n\n
\n
A function to be executed in the .transform() spark function.
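The condensation described above can be pictured with a plain PySpark window sketch (this is not the engine's implementation): rank the records per business key by the ranking columns and keep only the latest valid one. The column names and record-mode values are assumptions used purely for illustration.

from pyspark.sql import DataFrame, Window
import pyspark.sql.functions as F

def condense_cdc_sketch(df: DataFrame) -> DataFrame:
    valid_record_modes = ["", "N", "R", "D", "X"]  # assumed example set of record modes
    w = Window.partitionBy("order_id").orderBy(  # "order_id" plays the role of the business key
        F.col("actrequest_timestamp").desc(), F.col("datapakid").desc()  # ranking_key_desc columns
    )
    return (
        df.where(F.col("recordmode").isin(valid_record_modes))  # keep only valid record modes
        .withColumn("_rn", F.row_number().over(w))
        .where(F.col("_rn") == 1)  # latest record per business key
        .drop("_rn")
    )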
Execute a custom transformation provided by the user.
\n\n
This transformer can be very useful whenever the user cannot use our provided\ntransformers, or they want to write complex logic in the transform step of the\nalgorithm.
\n\n
\n\n
Attention!
\n\n
Please bear in mind that the custom_transformer function provided\nas argument needs to receive a DataFrame and return a DataFrame,\nbecause it is how Spark's .transform method is able to chain the\ntransformations.
\n\n
\n\n
Example:
\n\n
\n
def my_custom_logic(df: DataFrame) -> DataFrame:
\n
\n\n
Arguments:
\n\n
\n
custom_transformer: custom transformer function. A python function with all\nrequired pyspark logic provided by the user.
\n
\n\n
Returns:
\n\n
\n
Callable: the same function provided as parameter, in order to be called later in the TransformerFactory.
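A minimal sketch of such a custom transformer is shown below: any function that receives a DataFrame and returns a DataFrame can be chained by Spark's .transform, as explained above. The added column is purely illustrative.

from pyspark.sql import DataFrame
import pyspark.sql.functions as F

def my_custom_logic(df: DataFrame) -> DataFrame:
    # illustrative logic only: add an audit timestamp column
    return df.withColumn("loaded_at", F.current_timestamp())

Since an ACON is a plain python dict, a function like this can be passed as the callable of a transformer specification (see the 'function' field of the Transformer Specification above).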
Execute a SQL transformation provided by the user.
\n\n
This transformer can be very useful whenever the user wants to perform\nSQL-based transformations that are not natively supported by the\nlakehouse engine transformers.
\n\n
Arguments:
\n\n
\n
sql: the SQL query to be executed. This can read from any table or\nview from the catalog, or any dataframe registered as a temp\nview.
\n
\n\n
Returns:
\n\n
\n
Callable: A function to be called in .transform() spark function.
Create day/month/week/quarter/year hierarchy for the provided date columns.
\n\n
Uses Spark's extract function.
\n\n
Arguments:
\n\n
\n
cols: list of names of the date columns to create the hierarchy.
\n
formats: dict with the correspondence between the hierarchy and the format\nto apply. Check here.\nExample: {\n \"year\": \"year\",\n \"month\": \"month\",\n \"day\": \"day\",\n \"week\": \"week\",\n \"quarter\": \"quarter\"\n}
\n
\n\n
Returns:
\n\n
\n
A function to be executed in the .transform() spark function.
Incrementally Filter a certain dataframe given an increment logic.
\n\n
This logic can either be an increment value or an increment dataframe from which to get the latest value. By default, the operator for the filtering process is greater or equal, to cover cases where we receive late arriving data not covered in a previous load. You can change greater_or_equal to false to use greater, when you trust the source will never output more data with the increment after you have loaded the data (e.g., you will never load data while the source is still dumping data, which may cause you to get an incomplete picture of the last arrived data).
\n\n
Arguments:
\n\n
\n
input_col: input column name
\n
increment_value: value with which to filter the data, considering the provided input_col.
\n
increment_df: a dataframe to get the increment value from.\nyou either specify this or the increment_value (this takes precedence).\nThis is a good approach to get the latest value from a given dataframe\nthat was read and apply that value as filter here. In this way you can\nperform incremental loads based on the last value of a given dataframe\n(e.g., table or file based). Can be used together with the\nget_max_value transformer to accomplish these incremental based loads.\nSee our append load feature tests to see how to provide an acon for\nincremental loads, taking advantage of the scenario explained here.
\n
increment_col: name of the column from which to get the increment value (when using the increment_df approach). This assumes there's only one row in the increment_df, which is why it is a good idea to use it together with the get_max_value transformer. Defaults to "latest" because that's the default output column name provided by the get_max_value transformer.
\n
greater_or_equal: if filtering should be done by also including the\nincrement value or not (useful for scenarios where you are performing\nincrement loads but still want to include data considering the increment\nvalue, and not only values greater than that increment... examples may\ninclude scenarios where you already loaded data including those values,\nbut the source produced more data containing those values).\nDefaults to false.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
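The filtering logic described above can be pictured with the plain PySpark sketch below (this is not the engine's transformer); it assumes the increment_df, when provided, holds a single row such as the output of a max aggregation.

from pyspark.sql import DataFrame
import pyspark.sql.functions as F

def incremental_filter_sketch(
    df: DataFrame,
    input_col: str,
    increment_value=None,
    increment_df: DataFrame = None,
    increment_col: str = "latest",
    greater_or_equal: bool = False,
) -> DataFrame:
    if increment_df is not None:
        # take the single value out of the increment dataframe (e.g., produced by get_max_value)
        increment_value = increment_df.collect()[0][increment_col]
    condition = (
        F.col(input_col) >= F.lit(increment_value)
        if greater_or_equal
        else F.col(input_col) > F.lit(increment_value)
    )
    return df.where(condition)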
Drop duplicate rows using spark function dropDuplicates().
\n\n
This transformer can be used with or without arguments. The provided argument needs to be a list of columns. For example: ["Name", "VAT"] will drop duplicate records within the "Name" and "VAT" columns. If the transformer is used without providing any columns list, or providing an empty list such as [], the result will be the same as using the distinct() pyspark function. If the watermark dict is present, it will ensure that the drop operation applies to rows within the watermark timeline window.
\n\n
Arguments:
\n\n
\n
cols: column names.
\n
watermarker: properties to apply watermarker to the transformer.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
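In plain PySpark terms, the behaviour described above boils down to the sketch below (not the engine's transformer); the column names and watermark settings are placeholders.

from pyspark.sql import DataFrame

def drop_duplicate_rows_sketch(df: DataFrame) -> DataFrame:
    deduped = df
    if df.isStreaming:
        # bound the deduplication state with a watermark when the input is streaming
        deduped = deduped.withWatermark("event_time", "10 minutes")
    return deduped.dropDuplicates(["Name", "VAT"])  # [] or no argument behaves like distinct()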
Join two dataframes based on specified type and columns.
\n\n
Some stream to stream joins are only possible if you apply Watermark, so this\nmethod also provides a parameter to enable watermarking specification.
\n\n
Arguments:
\n\n
\n
left_df_alias: alias of the first dataframe.
\n
join_with: right dataframe.
\n
right_df_alias: alias of the second dataframe.
\n
join_condition: condition to join dataframes.
\n
join_type: type of join. Defaults to inner.\nAvailable values: inner, cross, outer, full, full outer,\nleft, left outer, right, right outer, semi,\nleft semi, anti, and left anti.
\n
broadcast_join: whether to perform a broadcast join or not.
\n
select_cols: list of columns to select at the end.
\n
watermarker: properties to apply watermarking.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
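As a plain PySpark illustration of the parameters above (not the engine's transformer), the sketch below aliases both dataframes, optionally broadcasts the right side, joins on a condition and selects the final columns; the aliases, join condition and column names are placeholders.

from pyspark.sql import DataFrame
import pyspark.sql.functions as F

def join_sketch(left_df: DataFrame, join_with: DataFrame, broadcast_join: bool = False) -> DataFrame:
    # optionally hint Spark to broadcast the right-hand dataframe
    right_df = F.broadcast(join_with) if broadcast_join else join_with
    return (
        left_df.alias("a")
        .join(right_df.alias("b"), on=F.expr("a.order_id = b.order_id"), how="left")
        .select("a.*", F.col("b.customer_name"))
    )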
If num_partitions is provided, repartitioning happens based on the provided number; otherwise, it happens based on the values of the provided cols (columns).
\n\n
Arguments:
\n\n
\n
num_partitions: num of partitions to repartition.
\n
cols: list of columns to use for repartitioning.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
num_partitions: number of Spark partitions to split the extraction.
\n
lower_bound: lower bound to decide the partition stride.
\n
upper_bound: upper bound to decide the partition stride. If\ncalculate_upper_bound is True, then upperBound will be\nderived by our upper bound optimizer, using the partition column.
\n
default_upper_bound: the value to use as default upper bound in case\nthe result of the upper bound calculation is None. Default: \"1\".
\n
fetch_size: how many rows to fetch per round trip. Default: \"100000\".
custom_schema: specify custom_schema for particular columns of the\nreturned dataframe in the init/delta extraction of the source table.
\n
min_timestamp: min timestamp to consider to filter the changelog data.\nDefault: None and automatically derived from the location provided.\nIn case this one is provided it has precedence and the calculation\nis not done.
\n
max_timestamp: max timestamp to consider to filter the changelog data.\nDefault: None and automatically derived from the table having information\nabout the extraction requests, their timestamps and their status.\nIn case this one is provided it has precedence and the calculation\nis not done.
\n
generate_predicates: whether to generate predicates automatically or not.\nDefault: False.
\n
predicates: list containing all values to partition (if generate_predicates\nis used, the manual values provided are ignored). Default: None.
\n
predicates_add_null: whether to consider null on predicates list.\nDefault: True.
\n
extraction_timestamp: the timestamp of the extraction. Default: current time\nfollowing the format \"%Y%m%d%H%M%S\".
\n
max_timestamp_custom_schema: custom schema used on the max_timestamp derivation\nfrom the table holding the extraction requests information.
Helper to get additional Spark Options initially passed.
\n\n
If people provide additional Spark options not covered by the util function arguments (get_spark_jdbc_options), we need to consider them. Thus, we update the options retrieved by the utils by checking if there is any Spark option initially provided that is not yet considered in the retrieved options or function arguments and whose value is not None. If these conditions are met, we add the options and return the complete dict.
\n\n
Arguments:
\n\n
\n
input_spec: the input specification.
\n
options: dict with Spark options.
\n
ignore_options: list of options to be ignored by the process. Spark read has two different approaches to parallelize the reading process: one of them uses upper/lower bound, the other one uses predicates. These cannot be used at the same time; you must choose one of them. By choosing predicates you can't pass lower and upper bound, and you also can't pass the number of partitions and the partition column, otherwise Spark will interpret the execution as partitioned by upper and lower bound and will expect all those variables to be filled. To avoid hardcoding all predicates in the acon, there is a feature that automatically generates all predicates for the init or delta load based on the input partition column, but at the end of the process the partition column can't be passed to the options, because we are choosing the predicates execution. That is why, to generate predicates, we need to pass some options to ignore.
\n
\n\n
Returns:
\n\n
\n
a dict with all the options passed as argument, plus the options that\n were initially provided, but were not used in the util\n (get_spark_jdbc_options).
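A minimal sketch of the merging behaviour described above, using hypothetical function and parameter names (not the engine's actual helper):

# Hypothetical sketch: keep the options computed by the util, then add any Spark option
# initially provided in the input spec that is not already covered, is not in the ignore
# list, and whose value is not None.
def merge_additional_spark_options(input_spec_options: dict, options: dict, ignore_options: list) -> dict:
    merged = dict(options)
    for key, value in (input_spec_options or {}).items():
        if key not in merged and key not in ignore_options and value is not None:
            merged[key] = value
    return merged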
Configurations available for an Extraction from SAP B4.
\n\n
It inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.
\n\n
These configurations cover:
\n\n
\n
latest_timestamp_input_col: the column containing the request timestamps\nin the dataset in latest_timestamp_data_location. Default: REQTSN.
\n
request_status_tbl: the name of the SAP B4 table having information\nabout the extraction requests. Composed of database.table.\nDefault: SAPHANADB.RSPMREQUEST.
\n
request_col_name: name of the column having the request timestamp to join\nwith the request status table. Default: REQUEST_TSN.
\n
data_target: the data target to extract from. Used in the join operation with the request status table.
\n
act_req_join_condition: the join condition with the request status table can be changed using this property. Default: 'tbl.reqtsn = req.request_col_name'.
\n
include_changelog_tech_cols: whether to include the technical columns (usually coming from the changelog table) or not.
\n
extra_cols_req_status_tbl: columns to be added from request status table.\nIt needs to contain the prefix \"req.\". E.g. \"req.col1 as column_one,\nreq.col2 as column_two\".
\n
request_status_tbl_filter: filter to use for filtering the request status table,\ninfluencing the calculation of the max timestamps and the delta extractions.
\n
adso_type: the type of ADSO that you are extracting from. Can be \"AQ\" or \"CL\".
\n
max_timestamp_custom_schema: the custom schema to apply on the calculation of\nthe max timestamp to consider for the delta extractions.\nDefault: timestamp DECIMAL(23,0).
\n
default_max_timestamp: the timestamp to use as default, when it is not possible\nto derive one.
\n
custom_schema: specify custom_schema for particular columns of the\nreturned dataframe in the init/delta extraction of the source table.
Get the data_target from the data_target option or derive it.
\n\n
By definition data_target is the same for the table and changelog table and\nis the same string ignoring everything before / and the first and last\ncharacter after /. E.g. for a dbtable /BIC/abtable12, the data_target\nwould be btable1.
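A minimal sketch of that derivation (hypothetical helper name, following the rule described above):

def derive_data_target(dbtable: str) -> str:
    """Hypothetical sketch: keep only what comes after the last '/' and drop its first and last character."""
    name = dbtable.rsplit("/", 1)[-1]  # e.g. "/BIC/abtable12" -> "abtable12"
    return name[1:-1]                  # -> "btable1"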
Configurations available for an Extraction from SAP BW.
\n\n
It inherits from SAPBWExtraction configurations, so it can use\nand/or overwrite those configurations.
\n\n
These configurations cover:
\n\n
\n
latest_timestamp_input_col: the column containing the actrequest timestamp\nin the dataset in latest_timestamp_data_location. Default:\n\"actrequest_timestamp\".
\n
act_request_table: the name of the SAP BW activation requests table.\nComposed of database.table. Default: SAPPHA.RSODSACTREQ.
\n
request_col_name: name of the column having the request to join\nwith the activation request table. Default: actrequest.
\n
act_req_join_condition: the join condition into activation table\ncan be changed using this property.\nDefault: 'changelog_tbl.request = act_req.request_col_name'.
\n
odsobject: name of BW Object, used for joining with the activation request\ntable to get the max actrequest_timestamp to consider while filtering\nthe changelog table.
\n
include_changelog_tech_cols: whether to include the technical columns (usually coming from the changelog table) or not. Default: True.
\n
extra_cols_act_request: list of columns to be added from act request table.\nIt needs to contain the prefix \"act_req.\". E.g. \"act_req.col1\nas column_one, act_req.col2 as column_two\".
\n
get_timestamp_from_act_request: whether to get init timestamp\nfrom act request table or assume current/given timestamp.
\n
sap_bw_schema: sap bw schema. Default: SAPPHA.
\n
max_timestamp_custom_schema: the custom schema to apply on the calculation of\nthe max timestamp to consider for the delta extractions.\nDefault: timestamp DECIMAL(23,0).
\n
default_max_timestamp: the timestamp to use as default, when it is not possible\nto derive one.
With the table name we may also get the db name, so we need to split.\nMoreover, there might be the need for people to specify odsobject if\nit is different from the dbtable.
date_time_gt(str):\nFilter the files greater than the string datetime\nformatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".
\n
date_time_lt(str):\nFilter the files lower than the string datetime\nformatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".
\n
earliest_file(bool):\nFilter the earliest dated file in the directory.
\n
file_name_contains(str): Filter files that match the given pattern.
\n
latest_file(bool):\nFilter the most recent dated file in the directory.
\n
sub_dir(bool): When true, the engine will search for files in the subdirectories of the remote_path. It will consider one level below the remote_path. When sub_dir is used with the latest_file/earliest_file argument, the engine will retrieve the latest_file/earliest_file for each subdirectory.
\n
\n\n
Arguments:
\n\n
\n
sftp: the SFTP client object.
\n
remote_path: path of files to be filtered.
\n
options_args: options from the acon.
\n
\n\n
Returns:
\n\n
\n
A list containing the file names to be passed to Spark.
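For illustration, the filtering options described above could be combined in an options dict like the following (values are placeholders):

# Illustrative options for the SFTP file filtering described above (values are placeholders).
sftp_filter_options = {
    "date_time_gt": "2023-01-01",           # only files newer than this datetime
    "date_time_lt": "2023-02-01 00:00:00",  # only files older than this datetime
    "file_name_contains": "sales",          # only files matching this pattern
    "latest_file": True,                    # keep only the most recent dated file
    "sub_dir": True,                        # also look one level below remote_path
}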
\"gss_deleg_creds\" \u2013 optional - Delegate GSS-API client\ncredentials or not.
\n
\"gss_host\" \u2013 optional - The targets name in the kerberos database.
\n
\"gss_trust_dns\" \u2013 optional - Indicates whether or\nnot the DNS is trusted to securely canonicalize the name of the\nhost being connected to (default True).
\n
\"banner_timeout\" \u2013 an optional timeout (in seconds)\nto wait for the SSH banner to be presented.
\n
\"auth_timeout\" \u2013 an optional timeout (in seconds)\nto wait for an authentication response.
\n
\"disabled_algorithms\" \u2013 an optional dict passed directly to\nTransport and its keyword argument of the same name.
\n
\"transport_factory\" \u2013 an optional callable which is handed a\nsubset of the constructor arguments (primarily those related\nto the socket, GSS functionality, and algorithm selection)\nand generates a Transport instance to be used by this client.\nDefaults to Transport.__init__.
\n
\n\n
The parameter to specify the private key is expected to be in\nRSA format. Attempting a connection with a blank host key is\nnot allowed unless the argument \"add_auto_policy\" is explicitly\nset to True.
\n
\n\n
Returns:
\n\n
\n
sftp -> a new SFTPClient session object.\n transport -> the Transport for this connection.
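The objects described above map to standard paramiko concepts. A minimal, hedged illustration of creating such a session with plain paramiko follows (host, port, user and key path are placeholders; this is not the engine's internal code):

import paramiko

# Plain paramiko illustration of the Transport/SFTPClient pair described above.
# Host, port, username and key path are placeholders.
key = paramiko.RSAKey.from_private_key_file("/path/to/private_key")  # RSA key, as noted above
transport = paramiko.Transport(("sftp.example.com", 22))
transport.connect(username="my_user", pkey=key)
sftp = paramiko.SFTPClient.from_transport(transport)

print(sftp.listdir("."))

sftp.close()
transport.close()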
A dictionary that corresponds to the conclusion of a cadence.
\n\n
For any end date provided by the user, we check whether that end date is actually the end of a cadence (YEAR, QUARTER, MONTH, WEEK). For example, if the user inputs 2024-03-31, this is both a month end and a quarter end, which means any use cases configured as month or quarter need to be calculated.
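A minimal sketch of that check (hypothetical helper using only the standard library; the week-end convention is an assumption):

import calendar
from datetime import date

def cadences_ending_on(end_date: date) -> set:
    """Hypothetical sketch: return which cadences end on the given date."""
    cadences = set()
    if end_date.day == calendar.monthrange(end_date.year, end_date.month)[1]:
        cadences.add("MONTH")
        if end_date.month in (3, 6, 9, 12):
            cadences.add("QUARTER")
        if end_date.month == 12:
            cadences.add("YEAR")
    if end_date.weekday() == 6:  # assuming weeks end on Sunday
        cadences.add("WEEK")
    return cadences

# e.g. cadences_ending_on(date(2024, 3, 31)) contains "MONTH" and "QUARTER"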
Hide sensitive information from being shown in the logs.
\n\n
Based on the configured regex and replace strings, the content of the log\nrecords is replaced and then all the records are allowed to be logged\n(return True).
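A minimal sketch of such a filter using Python's logging module (the regex and replacement shown are placeholders, not the engine's configured values):

import logging
import re

class SensitiveDataFilter(logging.Filter):
    """Illustrative filter: mask configured patterns and let every record through."""

    def __init__(self, pattern: str = r"password=\S+", replace: str = "password=****"):
        super().__init__()
        self._pattern = re.compile(pattern)
        self._replace = replace

    def filter(self, record: logging.LogRecord) -> bool:
        record.msg = self._pattern.sub(self._replace, str(record.msg))
        return True  # always allow the (now masked) record to be logged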
This covers scenarios where the schema is provided as part of the input specification of the algorithm. The schema can come from the table specified in the input specification (enforce_schema_from_table) or from the dict with the Spark schema also provided there.
How to configure a DataLoader algorithm in the lakehouse-engine by using an ACON file?
\n\n
An algorithm (e.g., data load) in the lakehouse-engine is configured using an ACON. The lakehouse-engine is a configuration-driven framework, so people don't have to write code to execute a Spark algorithm. Instead, the algorithm is written in pyspark and accepts configurations through a JSON file (an ACON - algorithm configuration). The ACON is the configuration providing the behaviour of a lakehouse engine algorithm. You can check the algorithm code, and how it interprets the ACON, here. In this page we will go through the structure of an ACON file and which ACON files are most suitable for common data engineering scenarios. Check the pages underneath to find several ACON examples that cover many data extraction, transformation and loading scenarios.
\n\n
Overview of the Structure of the ACON file for DataLoads
\n\n
An ACON-based algorithm needs several specifications to work properly, but some of them might be optional. The available\nspecifications are:
\n\n
\n
Input specifications (input_specs): specify how to read data. This is a mandatory keyword.
\n
Transform specifications (transform_specs): specify how to transform data.
\n
Data quality specifications (dq_specs): specify how to execute the data quality process.
\n
Output specifications (output_specs): specify how to write data to the target. This is a mandatory keyword.
\n
Terminate specifications (terminate_specs): specify what to do after writing into the target (e.g., optimising target table, vacuum, compute stats, expose change data feed to external location, etc.).
\n
Execution environment (exec_env): custom Spark session configurations to be provided for your algorithm (configurations can also be provided from your job/cluster configuration, which we highly advise you to do instead of passing performance related configs here for example).
\n
\n\n
Below is an example of a complete ACON file that reads from a s3 folder with CSVs and incrementally loads that data (using a merge) into a delta lake table.
\n\n
\n\n
spec_id is one of the main concepts to ensure you can chain the steps of the algorithm, so, for example, you can specify the transformations (in transform_specs) of a DataFrame that was read in the input_specs. Check ACON below to see how the spec_id of the input_specs is used as input_id in one transform specification.
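A minimal sketch of such an ACON is shown below. Spec ids, paths, columns and predicates are illustrative placeholders, and the transformer and dq function arguments should be checked against the code documentation.

from lakehouse_engine.engine import load_data

# Illustrative sketch only: spec ids, paths, columns and predicates are placeholders.
acon = {
    "input_specs": [
        {
            "spec_id": "orders_bronze",
            "read_type": "batch",
            "data_format": "csv",
            "options": {"header": True, "delimiter": ","},
            "location": "s3://my_data_product_bucket/bronze/orders/",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "orders_transformed",
            "input_id": "orders_bronze",
            "transformers": [
                # with_row_id and with_regex_value are the transformers referenced in this
                # page; their arguments are omitted here on purpose - check
                # lakehouse_engine.transformers for the exact parameter names.
                {"function": "with_row_id"},
                {"function": "with_regex_value"},
            ],
        }
    ],
    "dq_specs": [
        {
            "spec_id": "orders_quality",
            "input_id": "orders_transformed",
            "dq_type": "validator",
            "bucket": "my_dq_bucket",
            "dq_functions": [
                {"function": "expect_column_values_to_not_be_null", "args": {"column": "order_id"}},
                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 1}},
            ],
        }
    ],
    "output_specs": [
        {
            "spec_id": "orders_silver",
            "input_id": "orders_quality",
            "write_type": "merge",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/silver/orders/",
            "merge_opts": {"merge_predicate": "new.order_id = current.order_id"},
        }
    ],
}

load_data(acon=acon)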
You specify how to read the data by providing a list of Input Specifications. Usually there's just one element in that list, as, in the lakehouse, you are generally focused on reading data from one layer (e.g., source, bronze, silver, gold) and putting it into the next layer. However, there may be scenarios where you would like to combine two datasets (e.g., joins or incremental filtering on one dataset based on the values of another one), therefore you can use one or more elements. More information about InputSpecs.
\n\n
Relevant notes
\n\n
\n
A spec id is fundamental, so you can use the input data later on in any step of the algorithm (transform, write, dq process, terminate).
\n
You don't have to specify db_table and location at the same time. Depending on the data_format sometimes you read from a table (e.g., jdbc or deltalake table) sometimes you read from a location (e.g., files like deltalake, parquet, json, avro... or kafka topic).
\n
\n\n
Transform Specifications
\n\n
In the lakehouse engine, you transform data by providing a transform specification, which contains a list of transform functions (transformers). So the transform specification acts upon one input, and it can execute multiple lakehouse engine transformation functions (transformers) upon that input.
\n\n
If you look into the example above we ask the lakehouse engine to execute two functions on the orders_bronze input\ndata: with_row_id and with_regex_value. Those functions can of course receive arguments. You can see a list of all\navailable transformation functions (transformers) here lakehouse_engine.transformers. Then, you just invoke them in\nyour ACON as demonstrated above, following exactly the same function name and parameters name as described in the code\ndocumentation. \nMore information about TransformSpec.
\n\n
Relevant notes
\n\n
\n
This stage is fully optional, you can omit it from the ACON.
\n
There is one relevant option force_streaming_foreach_batch_processing that can be used to force the transform to be\nexecuted in the foreachBatch function to ensure non-supported streaming operations can be properly executed. You don't\nhave to worry about this if you are using regular lakehouse engine transformers. But if you are providing your custom\nlogic in pyspark code via our lakehouse engine\ncustom_transformation (lakehouse_engine.transformers.custom_transformers) then sometimes your logic may contain\nSpark functions that are not compatible with Spark Streaming, and therefore this flag can enable all of your\ncomputation to be streaming-compatible by pushing down all the logic into the foreachBatch() function.
\n
\n\n
Data Quality Specifications
\n\n
One of the most relevant features of the lakehouse engine is that you can have data quality guardrails that prevent you\nfrom loading bad data into your target layer (e.g., bronze, silver or gold). The lakehouse engine data quality process\nincludes one main feature at the moment:
\n\n
\n
Validator: The capability to perform data quality checks on that data (e.g., is the max value of a column bigger\nthan x?) and even tag your data with the results of the DQ checks.
\n
\n\n
The output of the data quality process can be written into a Result Sink target (e.g. table or files) and is integrated with a Data Docs website, which can be a company-wide available website for people to check the quality of their data and share with others.
\n\n
To achieve all of this functionality the lakehouse engine uses Great Expectations internally. To hide the Great Expectations internals from our user base and provide friendlier abstractions using the ACON, we have developed the concept of DQSpec that can contain many DQFunctionSpec objects, which is very similar to the relationship between the TransformSpec and TransformerSpec, which means you can have multiple Great Expectations functions executed inside a single data quality specification (as in the ACON above).
You can write the outputs of the DQ process to a sink through the result_sink* parameters of the\nDQSpec. result_sink_options takes any Spark options for a DataFrame writer, which means you can specify the options\naccording to your sink format (e.g., delta, parquet, json, etc.). We usually recommend using \"delta\" as format.
\n
You can use the results of the DQ checks to tag the data that you are validating. When configured, these details will\nappear as a new column (like any other), as part of the tables of your Data Product.
\n
To be able to analyse the data written by result_sink*, there is an approach in which you set result_sink_explode to true (which is the default) and then some columns are expanded. Those are:
\n
General columns: Those are columns that have the basic information regarding dq_specs; they will always have values and do not depend on the expectation types chosen. Columns: checkpoint_config, run_name, run_time, run_results, success, validation_result_identifier, spec_id, input_id, validation_results, run_time_year, run_time_month, run_time_day.
\n
Statistics columns: Those are columns that have information about the runs of expectations; these values refer to the run as a whole, not to each individual expectation. Those columns come from run_results.validation_result.statistics.*.
Expectations columns: Those are columns that have information about the expectation executed.\n
\n
Columns: expectation_type, batch_id, expectation_success, exception_info. Those columns are exploded\nfrom run_results.validation_result.results\ninside expectation_config.expectation_type, expectation_config.kwargs.batch_id, success as expectation_success,\nand exception_info. Moreover, we also include unexpected_index_list, observed_value and kwargs.
\n
\n
Arguments of Expectations columns: Those are columns that will depend on the expectation_type selected. Those\ncolumns are exploded from run_results.validation_result.results inside expectation_config.kwargs.*.\n
\n
We can have for\nexample: column, column_A, column_B, max_value, min_value, value, value_pairs_set, value_set,\nand others.
\n
\n
More columns desired? Those can be added, using result_sink_extra_columns in which you can select columns\nlike <name> and/or explode columns like <name>.*.
\n
\n
Use the parameter \"source\" to identify the data used for an easier analysis.
\n
By default, Great Expectations will also provide a site presenting the history of the DQ validations that you have performed on your data.
\n
You can make an analysis of all your expectations and create a dashboard aggregating all that information.
\n
This stage is fully optional, you can omit it from the ACON.
\n
\n\n
Output Specifications
\n\n
The output_specs section of an ACON is relatively similar to the input_specs section, but of course focusing on how to write the results of the algorithm, instead of specifying the input for the algorithm, hence the name output_specs (output specifications). More information about OutputSpec.
\n\n
Relevant notes
\n\n
\n
Respect the supported write types and output formats.
\n
One of the most relevant options to specify in the options parameter is the checkpoint_location when in streaming\nread mode, because that location will be responsible for storing which data you already read and transformed from the\nsource, when the source is a Spark Streaming compatible source (e.g., Kafka or S3 files).
\n
\n\n
Terminate Specifications
\n\n
The terminate_specs section of the ACON is responsible for some \"wrapping up\" activities like optimising a table,\nvacuuming old files in a delta table, etc. With time the list of available terminators will likely increase (e.g.,\nreconciliation processes), but for now we have the following terminators.\nThis stage is fully optional, you can omit it from the ACON.\nThe most relevant now in the context of the lakehouse initiative are the following:
In the exec_env section of the ACON you can pass any Spark Session configuration that you want to define for the\nexecution of your algorithm. This is basically just a JSON structure that takes in any Spark Session property, so no\ncustom lakehouse engine logic. This stage is fully optional, you can omit it from the ACON.
\n\n
\n\n
Please be aware that Spark Session configurations that are not allowed to be changed when the Spark cluster is already\nrunning need to be passed in the configuration of the job/cluster that runs this algorithm, not here in this section.\nThis section only accepts Spark Session configs that can be changed in runtime. Whenever you introduce an option make\nsure that it takes effect during runtime, as to the best of our knowledge there's no list of allowed Spark properties\nto be changed after the cluster is already running. Moreover, typically Spark algorithms fail if you try to modify a\nconfig that can only be set up before the cluster is running.
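For illustration, such a section could look like the sketch below (the properties shown are only examples of configs that can typically be changed at runtime; always confirm the behaviour against your Spark/cluster setup):

# Illustrative exec_env section: plain Spark Session properties, no custom engine logic.
acon = {
    # ... input_specs, transform_specs, dq_specs, output_specs ...
    "exec_env": {
        "spark.sql.shuffle.partitions": "200",
        "spark.databricks.delta.optimizeWrite.enabled": True,
    },
}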
The ReadMode is PERMISSIVE in this scenario, which is the default in Spark, hence we don't need to specify it. Permissive means don't enforce any schema on the input data.
\n
From a JDBC source the ReadType always needs to be "batch", as "streaming" is not available for JDBC sources.
\n
In this scenario we do an append load by getting the max date on bronze (transformer_spec "get_max_value") and using that date to filter the source, so we only get data with a date greater than that max date on bronze (transformer_spec "incremental_filter"), as sketched below. That is the standard way we do incremental batch loads in the lakehouse engine. For streaming incremental loads we rely on the Spark Streaming checkpoint feature (check a streaming append load ACON example).
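A hedged sketch of what such a chained transform specification could look like is shown below. The spec ids, the argument names and the way the max-value dataframe is referenced are illustrative assumptions; confirm them in the transformers' code documentation.

# Hedged sketch of the incremental batch pattern described above: one transform spec derives
# the max date already on bronze, another uses it to filter the source. Spec ids and argument
# names are illustrative only.
transform_specs = [
    {
        "spec_id": "max_date_on_bronze",
        "input_id": "orders_bronze",
        "transformers": [{"function": "get_max_value", "args": {"input_col": "order_date"}}],
    },
    {
        "spec_id": "new_orders_only",
        "input_id": "orders_source",
        "transformers": [
            {
                "function": "incremental_filter",
                "args": {"input_col": "order_date", "increment_df": "max_date_on_bronze"},
            }
        ],
    },
]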
This scenario is an append load enforcing the schema (using the schema of the target table to enforce the schema of the source, i.e., the schema of the source needs to exactly match the schema of the target table) and failing fast (FAILFAST) if the schema of the input data does not match the one we specified.
The ReadMode is FAILFAST in this scenario, i.e., fail the algorithm if the schema of the input data does not match the one we specified via schema_path, read_schema_from_table or schema Input_specs variables.
\n
In this scenario we do an append load by getting the max date (transformer_spec \"get_max_value\") on bronze and use that date to filter the source to only get data with a date greater than that max date on bronze (transformer_spec \"incremental_filter\"). That is the standard way we do incremental batch loads in the lakehouse engine. For streaming incremental loads we rely on Spark Streaming checkpoint feature (check a streaming append load ACON example).
Batch Delta Load Init, Delta and Backfill with Merge
\n\n
This scenario illustrates the process of implementing a delta load algorithm by first using an ACON to perform an initial load, then another one to perform the regular deltas that will be triggered on a recurrent basis, and finally an ACON for backfilling specific parcels if ever needed.
We can see that even though this is an init load we still have chosen to condense the records through our \"condense_record_mode_cdc\" transformer. This is a condensation step capable of handling SAP BW style changelogs based on actrequest_timestamps, datapakid, record_mode, etc...
\n
In the init load we actually did a merge in this case because we wanted to test locally whether a merge with an empty target table works, but you don't have to do it, as an init load can usually be just a full load. Whether a merge of init data with an empty table has any performance implications compared to a regular insert remains to be tested, but we don't have any reason to recommend a merge over an insert for an init load; as said, this was done solely for local testing purposes, so you can just use write_type: "overwrite".
The merge predicate and the insert, delete or update predicates should reflect the reality of your data, and it's up to each data product to figure out which predicates better match their reality:
\n\n
\n
The merge predicate usually involves making sure that the \"primary key\" for your data matches.\n
\n
Performance Tip!!! Ideally, in order to get a performance boost in your merges, you should also place a filter in your merge predicate (e.g., certain technical or business date in the target table >= x days ago), based on the assumption that the rows in that specified interval will never change in the future. This can drastically decrease the merge times of big tables.
\n\n\n
\n\n
\n\n
The insert, delete and update predicates will always depend on the structure of your changelog, and also on how you expect your updates to arrive (e.g., in certain data products you know that you will never get out of order data or late arriving data, while in others you can never ensure that). These predicates should reflect that, in order to prevent you from doing unwanted changes to the target delta lake table.
\n
For example, in this scenario, we delete rows that have the R, D or X record_mode values, because we know that if, after condensing the rows, that is the latest status of a row in the changelog, it should be deleted, and we never insert rows with those statuses (note: we use this guardrail in the insert to prevent out of order changes, which is likely not the case in SAP BW).
\n
Because the insert_predicate is fully optional, in your scenario you may not require that.
\n
\n\n
In this scenario, we don't pass an update_predicate in the ACON, because both insert_predicate and update_predicate are fully optional, i.e., if you don't pass them the algorithm will update any data that matches the merge_predicate and insert any data that does not match it. The predicates in these cases just make sure the algorithm does not insert or update any data that you don't want. For instance, in the late arriving changes scenario, a deleted row may arrive from the changelog before the corresponding update row; to prevent your target table from having inconsistent data for a certain period of time (it will eventually get consistent when you receive the latest correct status from the changelog), you can have this guardrail in the insert or update predicates. Again, for most sources this will not happen, but sources like Kafka, for example, cannot 100% ensure order.
\n
In order to understand how we can cover different scenarios (e.g., late arriving changes, out of order changes, etc.), please go here.
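Putting these notes together, the merge options of an output specification could look like the hedged sketch below. Column names and the date filter are placeholders, and the exact option keys should be confirmed against the OutputSpec code documentation; the record_mode values follow the SAP BW changelog convention mentioned above.

# Hedged sketch of merge options reflecting the notes above. Column names, the date filter
# and the option keys are placeholders to adapt to your data.
merge_opts = {
    # "primary key" match plus a date filter, following the performance tip above
    "merge_predicate": "new.item_id = current.item_id AND current.order_date >= date_sub(current_date(), 30)",
    # never insert rows whose latest condensed status is a deletion marker
    "insert_predicate": "new.record_mode NOT IN ('R', 'D', 'X')",
    # delete rows whose latest condensed status is a deletion marker
    "delete_predicate": "new.record_mode IN ('R', 'D', 'X')",
}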
The backfilling process depicted here is fairly similar to the init load, but it is relevant to highlight the use of a static value (that can be adjusted according to the backfilling needs) in the incremental_filter function.
\n
Other relevant functions for backfilling may include the expression_filter function, where you can use a custom SQL filter to filter the input data.
There may be a scenario where the data product dev team faces the need to perform complex data transformations that are either not yet available in the lakehouse engine or whose logic is just too complex to chain in an ACON file. In the context of the lakehouse, the only layers that usually impose that complexity are silver+ and gold. This page targets exactly those cases.
\n\n
Below you'll find a notebook where you can pass your own PySpark or Spark SQL logic into the ACON, by dynamically injecting a python function into the ACON dictionary. The lakehouse engine will take care of executing those transformations in the transformation step of the data loader algorithm. Please read the notebook's comments carefully to understand how it works, or simply open it in your notebook environment, which will make the notebook's code and comments more readable.
\n\n
\n\n
Force Streaming Micro Batch Processing.
\n\n
When you use streaming mode with a custom transformer, it's highly advisable that you set the force_streaming_foreach_batch_processing flag to True in the transform specification, as explained above!
\n\n
\n\n
What is a custom transformer in the Lakehouse Engine and how you can use it to write your own pyspark logic?
\n\n
We highly promote the Lakehouse Engine for creating Data Products aligned with the data source (bronze/silver layer), pumping data into silver so our Data Scientists and Analysts can leverage the value of the data in silver, as close as it comes from the source.\nThe low-code and configuration-driven nature of the lakehouse engine makes it a compelling framework to use in such cases, where the transformations that are done from bronze to silver are not that many, as we want to keep the data close to the source.
\n\n
However, when it comes to Data Products enriched in some way or for insights (silver+, gold), they are typically heavy on transformations (they are the T of the overall ELT process), so the nature of the lakehouse engine might get in the way of adequately building them. Considering this, and considering our user base that prefers an ACON-based approach and all the nice off-the-shelf features of the lakehouse engine, we have developed a feature that allows us to pass custom transformers, where you put your entire pyspark logic and pass it as an argument in the ACON (the configuration file that configures every lakehouse engine algorithm).
\n\n
Motivation:
\n\n
Doing that, you let the ACON guide your read, data quality, write and terminate processes, and you just focus on transforming data :)
\n\n
Custom transformation Function
\n\n
The function below is the one that encapsulates all your defined pyspark logic and sends it as a python function to the lakehouse engine. This function will then be invoked internally in the lakehouse engine via a df.transform() function. If you are interested in checking the internals of the lakehouse engine, our codebase is openly available here: https://github.com/adidas/lakehouse-engine
\n\n
\n\n
Attention!!!
\n\n
For this process to work, your function defined below needs to receive a DataFrame and return a DataFrame. Attempting any other method signature (e.g., defining more parameters) will not work, unless you use something like python partials, for example.
\n\n
\n\n
\n
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, sum, when  # note: pyspark's sum, not the Python builtin

from lakehouse_engine.core.exec_env import ExecEnv


def get_new_data(df: DataFrame) -> DataFrame:
    """Get the new data from the lakehouse engine reader and prepare it."""
    return (
        df.withColumn("amount", when(col("_change_type") == "delete", lit(0)).otherwise(col("amount")))
        .select("article_id", "order_date", "amount")
        .groupBy("article_id", "order_date")
        .agg(sum("amount").alias("amount"))
    )


def get_joined_data(new_data_df: DataFrame, current_data_df: DataFrame) -> DataFrame:
    """Join the new data with the current data already existing in the target dataset."""
    return (
        new_data_df.alias("new_data")
        .join(
            current_data_df.alias("current_data"),
            [
                new_data_df.article_id == current_data_df.article_id,
                new_data_df.order_date == current_data_df.order_date,
            ],
            "left_outer",
        )
        .withColumn(
            "current_amount", when(col("current_data.amount").isNull(), lit(0)).otherwise(col("current_data.amount"))
        )
        .withColumn("final_amount", col("current_amount") + col("new_data.amount"))
        .select(col("new_data.article_id"), col("new_data.order_date"), col("final_amount").alias("amount"))
    )


def calculate_kpi(df: DataFrame) -> DataFrame:
    """Calculate KPI through a custom transformer that will be provided in the ACON.

    Args:
        df: DataFrame passed as input.

    Returns:
        DataFrame: the transformed DataFrame.
    """
    new_data_df = get_new_data(df)

    # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because it is the internal object the
    # lakehouse engine uses to refer to the spark session. But if you use 'spark' it should also be fine.
    current_data_df = ExecEnv.SESSION.table("my_database.my_table")

    transformed_df = get_joined_data(new_data_df, current_data_df)

    return transformed_df
\n
\n\n
Don't like pyspark API? Write SQL
\n\n
You don't have to comply with the pyspark API if you prefer SQL. Inside the function above (or any of the auxiliary functions you decide to develop) you can write something like:
\n\n
\n
from pyspark.sql import DataFrame

from lakehouse_engine.core.exec_env import ExecEnv


def calculate_kpi(df: DataFrame) -> DataFrame:
    df.createOrReplaceTempView("new_data")

    # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because it is the internal object the
    # lakehouse engine uses to refer to the spark session. But if you use 'spark' it should also be fine.
    ExecEnv.SESSION.sql(
        """
        CREATE OR REPLACE TEMP VIEW my_kpi AS
        SELECT ... FROM new_data ...
        """
    )

    return ExecEnv.SESSION.table("my_kpi")
\n
\n\n
Just your regular ACON
\n\n
If you notice the ACON below, everything is the same as you would do in a Data Product, but the transform_specs section of the ACON has a difference, which is a function called \"custom_transformation\" where we supply as argument the function defined above with the pyspark code.
\n\n
\n\n
Attention!!!
\n\n
Do not pass the function as calculate_kpi(), but as calculate_kpi, otherwise you are telling python to invoke the function right away, as opposed to passing it as an argument to be invoked later by the lakehouse engine.
\n\n
\n\n
\n
from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "sales",
            "read_type": "streaming",
            "data_format": "delta",
            "db_table": "my_database.dummy_sales",
            "options": {"readChangeFeed": "true"},
        }
    ],
    "transform_specs": [
        {
            "spec_id": "transformed_sales_kpi",
            "input_id": "sales",
            # because we are using streaming, this allows us to make sure that
            # all the computation in our custom transformer gets pushed to
            # Spark's foreachBatch method in a stream, which allows us to
            # run all Spark functions in a micro batch DataFrame, as there
            # are some Spark functions that are not supported in streaming.
            "force_streaming_foreach_batch_processing": True,
            "transformers": [
                {
                    "function": "custom_transformation",
                    "args": {"custom_transformer": calculate_kpi},
                },
            ],
        }
    ],
    "dq_specs": [
        {
            "spec_id": "my_table_quality",
            "input_id": "transformed_sales_kpi",
            "dq_type": "validator",
            "bucket": "my_dq_bucket",
            "data_docs_bucket": "my_data_product_bucket",
            "data_docs_prefix": "dq/my_data_product/data_docs/site/",
            "expectations_store_prefix": "dq/expectations/",
            "validations_store_prefix": "dq/validations/",
            "checkpoint_store_prefix": "dq/checkpoints/",
            "tbl_to_derive_pk": "my_table",
            "dq_functions": [
                {"function": "expect_column_values_to_not_be_null", "args": {"column": "article_id"}},
            ],
        },
    ],
    "output_specs": [
        {
            "spec_id": "sales_kpi",
            "input_id": "transformed_sales_kpi",
            "write_type": "merge",
            "data_format": "delta",
            "db_table": "my_database.my_table",
            "options": {
                "checkpointLocation": "s3://my_data_product_bucket/gold/my_table",
            },
            "merge_opts": {
                "merge_predicate": "new.article_id = current.article_id AND new.order_date = current.order_date"
            },
        }
    ],
}

load_data(acon=acon)
A custom sap_b4 reader and a few utils are offered in the lakehouse-engine framework so that consumption of data from\nSAP B4 DSOs can be easily created. The framework abstracts all the logic behind the init/delta extractions\n(AQ vs CL, active table, changelog table, requests status table, how to identify the next delta timestamp...),\nonly requiring a few parameters that are explained and exemplified in the\ntemplate scenarios that we have created.
\n\n
\n\n
This custom reader is very similar to, and uses most features from, the sap_bw reader, so if you were using specific filters/parameters with the sap_bw reader, there is a high chance you can keep using them in a very similar way with the sap_b4 reader. The main concepts apply to both readers, such as the strategies on how to parallelize the extractions.
There are no limits imposed by the Lakehouse-Engine framework, but you need to consider that there might be differences imposed by the source.
\n\n
E.g. each user might be restricted to using about 100GB of memory at a time on the source.
\n\n
Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.
\n\n
\n\n
\n\n
In case you want to perform further filtering in the REQTSN field, please be aware that it is not being pushed down to SAP B4 by default (meaning it will have bad performance).
\n\n
In that case, you will need to use the customSchema option while reading, so that you are able to enable filter pushdown for those filters.
\n\n
\n\n
You can check the code documentation of the reader below:
For extractions using the SAP B4 reader, you can use the arguments listed in the SAP B4 arguments, but also the ones listed in the JDBC extractions, as those are inherited as well.
\n\n
\n\n
Extraction from SAP B4 ADSOs Template
\n\n
This template covers the following scenarios of extractions from the SAP B4Hana ADSOs:
\n\n
\n
1 - The Simplest Scenario (Not parallel - Not Recommended)
\n
2 - Parallel extraction\n
\n
2.1 - Simplest Scenario
\n
2.2 - Provide upperBound (Recommended)
\n
2.3 - Automatic upperBound (Recommended)
\n
2.4 - Provide predicates (Recommended)
\n
2.5 - Generate predicates (Recommended)
\n
\n
\n\n
\n\n
Note: the template will cover two ADSO Types:
\n\n
\n
AQ: ADSO which is of append type and for which a single ADSO/table holds all the information, like an event table. For this type, the same ADSO is used for reading data both for the inits and deltas. Usually, these ADSOs end with the digit "6".
\n
CL: ADSO which is split into two ADSOs, one holding the change log events, the other having the active\ndata (current version of the truth for a particular source). For this type, the ADSO having the active data\nis used for the first extraction (init) and the change log ADSO is used for the subsequent extractions (deltas).\nUsually, these ADSOs are split into active table ending with the digit \"2\" and changelog table ending with digit \"3\".
\n
\n\n
\n\n
For each of these ADSO types, the lakehouse-engine abstracts the logic to get the delta extractions. This logic\nbasically consists of joining the db_table (for AQ) or the changelog_table (for CL) with the table\nhaving the requests status (my_database.requests_status_table).\nOne of the fields used for this joining is the data_target, which has a relationship with the ADSO\n(db_table/changelog_table), being basically the same identifier without considering parts of it.
\n\n
Based on the previous insights, the queries that the lakehouse-engine generates under the hood translate to\n(this is a simplified version, for more details please refer to the lakehouse-engine code documentation):\nAQ Init Extraction:\nSELECT t.*, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table t
\n\n
AQ Delta Extraction:\nSELECT tbl.*, CAST({self._B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table AS tbl\nJOIN my_database.requests_status_table AS req\nWHERE STORAGE = 'AQ' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')\nAND REQUEST_STATUS IN ('GG', 'GR') AND UPPER(DATATARGET) = UPPER('my_identifier')\nAND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table
\n\n
CL Init Extraction:\nSELECT t.*,\n {self._SAP_B4_EXTRACTION.extraction_timestamp}000000000 AS reqtsn,\n '0' AS datapakid,\n 0 AS record,\n CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table_2 t
\n\n
CL Delta Extraction:\nSELECT tbl.*,\nCAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp\nFROM my_database.my_table_3 AS tbl\nJOIN my_database.requests_status_table AS req\nWHERE STORAGE = 'AT' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')\nAND REQUEST_STATUS IN ('GG') AND UPPER(DATATARGET) = UPPER('my_data_target')\nAND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table`
\n\n
\n\n
Introductory Notes: If you want to have a better understanding about JDBC Spark optimizations, here you have a few useful links:
1 - The Simplest Scenario (Not parallel - Not Recommended)
\n\n
This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques\nand using a single connection to retrieve all the data from the source. It should only be used in case the ADSO\nyou want to extract from SAP B4Hana is a small one, with no big requirements in terms of performance to fulfill.\nWhen extracting from the source ADSO, there are two options:
\n\n
\n
Delta Init - full extraction of the source ADSO. You should use it the first time you extract from the ADSO or any time you want to re-extract completely. Similar to a so-called full load.
\n
Delta - extracts the portion of the data that is new or has changed in the source, since the last\nextraction (using the max_timestamp value in the location of the data already extracted\nlatest_timestamp_data_location).
\n
\n\n
The example below is composed of two cells.
\n\n
\n
The first cell is only responsible for defining the variables extraction_type and write_type, depending on the extraction type: Delta Init (load_type = "init") or Delta (load_type = "delta"). The variables in this cell will also be referenced by other acons/examples in this notebook, similar to what you would do in your pipelines/jobs, defining this centrally and then re-using it.
\n
The second cell is where the acon to be used is defined (which uses the two variables extraction_type and\nwrite_type defined) and the load_data algorithm is executed to perform the extraction.
\n
\n\n
\n\n
There may be cases where you might want to always extract fully from the source ADSO. In these cases, you only need to use a Delta Init every time, meaning you would use "extraction_type": "init" and "write_type": "overwrite" as shown below. The explanation of what a Delta Init/Delta is applies to all the scenarios presented in this notebook.
In this section, 5 possible scenarios for parallel extractions from SAP B4Hana ADSOs are presented.
\n\n
2.1 - Parallel Extraction, Simplest Scenario
\n\n
This scenario provides the simplest example you can have for a parallel extraction from SAP B4Hana, only using\nthe property numPartitions. The goal of the scenario is to cover the case in which people do not have\nmuch knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can\nbe used to split the extraction in several tasks. This scenario can also be used if the use case does not\nhave big performance requirements/concerns, meaning you do not feel the need to optimize the performance of\nthe extraction to its maximum potential.
\n\n
In the example below, "numPartitions": 10 is specified, meaning that Spark will open 10 parallel connections to the source ADSO and automatically decide how to parallelize the extraction upon that requirement. This is the only change compared to the example provided in scenario 1.
2.2 - Parallel Extraction, Provide upper_bound (Recommended)
\n\n
This scenario performs the extraction from the SAP B4 ADSO in parallel, but is more concerned with trying to\noptimize and have more control (compared to 2.1 example) on how the extraction is split and performed,\nusing the following options:
\n\n
\n
numPartitions - number of Spark partitions to split the extraction.
\n
partitionColumn - column used to split the extraction. It must be a numeric, date, or timestamp.\nIt should be a column that is able to split the extraction evenly in several tasks. An auto-increment\ncolumn is usually a very good candidate.
\n
lowerBound - lower bound to decide the partition stride.
\n
upperBound - upper bound to decide the partition stride.
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as\nthe partitionColumn. If you compare with the previous example, you'll notice that now numPartitions and\nthree additional options are provided to fine tune the extraction (partitionColumn, lowerBound,\nupperBound).
\n\n
When these 4 properties are used, Spark will use them to build several queries to split the extraction.
\n\n
Example: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound: 1\", \"upperBound: 100\",\nSpark will generate 10 queries like this:
\n\n
\n
SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL
\n
SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20
\n
SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30
This scenario is very similar to 2.2, the only difference being that upperBound is not provided. Instead, the property calculate_upper_bound set to true is used to benefit from the automatic calculation of the upperBound (derived from the partitionColumn) offered by the lakehouse-engine framework, which is useful, as in most cases you will probably not be aware of the max value for the column. The only thing you need to consider is that if you use this automatic calculation of the upperBound, you will be doing an initial query to the SAP B4 ADSO to retrieve the max value of the partitionColumn, before doing the actual query to perform the extraction.
2.4 - Parallel Extraction, Provide Predicates (Recommended)
\n\n
This scenario performs the extraction from SAP B4 ADSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction (e.g. when extracting from ADSO of Type CL,\nthe active table does not have the RECORD column, which is usually a good option for scenarios 2.2 and 2.3):
\n\n
\n
partitionColumn - column used to split the extraction. It can be of any type.
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the partitionColumn, especially if those columns do not comply with scenarios 2.2 or 2.3.
\n\n
When this property is used all predicates need to be provided to Spark, otherwise it will leave data behind.
\n\n
Below, the lakehouse function to automatically generate the predicates list is presented.
\n\n
This function needs to be used carefully, especially regarding the predicates_query and predicates_add_null variables.
\n\n
predicates_query: In the sample below the whole table is being considered (select distinct(x) from table), but it is possible to filter the predicates list here, especially if you are applying a filter in the transformations spec and you know the entire table won't be necessary, so you can change it to something like: select distinct(x) from table where x > y.
\n\n
predicates_add_null: You can decide whether you want to consider null in the predicates list or not; by default this property is True.
\n\n
Example: for \"partition_column\": \"CALMONTH\"
\n\n
\n
from lakehouse_engine.engine import load_data

LOAD_TYPE = "INIT" or "DELTA"

if LOAD_TYPE == "INIT":
    extraction_type = "init"
    write_type = "overwrite"
else:
    extraction_type = "delta"
    write_type = "append"

# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers
# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
    JDBCExtraction,
    JDBCExtractionUtils,
)

ExecEnv.get_or_create()

partition_column = "CALMONTH"
dbtable = "my_database.my_table_3"

predicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""
user = "my_user"
password = "my_b4_hana_pwd"
url = "my_sap_b4_url"
predicates_add_null = True

jdbc_util = JDBCExtractionUtils(
    JDBCExtraction(
        user=user,
        password=password,
        url=url,
        predicates_add_null=predicates_add_null,
        partition_column=partition_column,
        dbtable=dbtable,
    )
)

predicates = jdbc_util.get_predicates(predicates_query)

acon = {
    "input_specs": [
        {
            "spec_id": "my_identifier_2_source",
            "read_type": "batch",
            "data_format": "sap_b4",
            "options": {
                "url": "my_sap_b4_url",
                "user": "my_user",
                "password": "my_b4_hana_pwd",
                "driver": "com.sap.db.jdbc.Driver",
                "dbtable": "my_database.my_table_2",
                "changelog_table": "my_database.my_table_3",
                "extraction_type": extraction_type,
                "latest_timestamp_data_location": "s3://my_path/my_identifier_2_prov_predicates/",
                "adso_type": "CL",
                "predicates": predicates,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_identifier_2_bronze",
            "input_id": "my_identifier_2_source",
            "write_type": write_type,
            "data_format": "delta",
            "partitions": ["REQTSN"],
            "location": "s3://my_path/my_identifier_2_prov_predicates/",
        }
    ],
    "exec_env": {
        "spark.databricks.delta.schema.autoMerge.enabled": True,
        "spark.databricks.delta.optimizeWrite.enabled": True,
        "spark.databricks.delta.autoCompact.enabled": True,
    },
}

load_data(acon=acon)
\n
\n\n
2.5 - Parallel Extraction, Generate Predicates
\n\n
This scenario is very similar to the scenario 2.4, with the only difference that it automatically\ngenerates the predicates (\"generate_predicates\": True).
\n\n
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the partitionColumn, especially if those columns do not comply with scenarios 2.2 and 2.3 (otherwise those would probably be recommended).
\n\n
When this property is used, the lakehouse engine will generate the predicates to be used to extract data from the source. What the lakehouse engine does is to check, for the init/delta portion of the data, what the distinct values of the partitionColumn serving that data are. Then, these values will be used by Spark to generate several queries to extract from the source in a parallel fashion. Each distinct value of the partitionColumn will be a query, meaning that you will not have control over the number of partitions used for the extraction. For example, if you face a scenario in which you are using a partitionColumn LOAD_DATE and, for today's delta, all the data (let's suppose 2 million rows) is served by a single LOAD_DATE = 20200101, that would mean Spark would use a single partition to extract everything. In this extreme case you would probably need to change your partitionColumn. Note: these extreme cases are harder to happen when you use the strategy of scenarios 2.2/2.3.
\n\n
Example: for \"partitionColumn\": \"record\"\nGenerate predicates:
\n\n
\n
SELECT DISTINCT(RECORD) as RECORD FROM dummy_table
Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.
\n\n
\n\n
A custom sap_bw reader and a few utils are offered in the lakehouse-engine framework so that consumption of data from \nSAP BW DSOs can be easily created. The framework abstracts all the logic behind the init/delta extractions \n(active table, changelog table, activation requests table, how to identify the next delta timestamp...), \nonly requiring a few parameters that are explained and exemplified in the \ntemplate scenarios that we have created.
For extractions using the SAP BW reader, you can use the arguments listed in the SAP BW arguments, but also \nthe ones listed in the JDBC extractions, as those are inherited as well.
\n\n
\n\n
Extraction from SAP-BW template
\n\n
This template covers the following scenarios of extractions from the SAP BW DSOs:
\n\n
\n
1 - The Simplest Scenario (Not parallel - Not Recommended)
\n
2 - Parallel extraction\n
\n
2.1 - Simplest Scenario
\n
2.2 - Provide upperBound (Recommended)
\n
2.3 - Automatic upperBound (Recommended)
\n
2.4 - Backfilling
\n
2.5 - Provide predicates (Recommended)
\n
2.6 - Generate predicates (Recommended)
\n
\n
3 - Extraction from Write Optimized DSO\n
\n
3.1 - Get initial actrequest_timestamp from Activation Requests Table
\n
\n
\n\n
\n\n
Introductory Notes: if you want to have a better understanding about JDBC Spark optimizations, \nhere you have a few useful links:
1 - The Simplest Scenario (Not parallel - Not Recommended)
\n\n
This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques \nand using a single connection to retrieve all the data from the source. It should only be used in case the DSO \nyou want to extract from SAP BW is a small one, with no big requirements in terms of performance to fulfill.\nWhen extracting from the source DSO, there are two options:
\n\n
\n
Delta Init - full extraction of the source DSO. You should use it the first time you extract from the DSO or any time you want to re-extract completely. Similar to a so-called full load.
\n
Delta - extracts the portion of the data that is new or has changed in the source, since the last\nextraction (using the max actrequest_timestamp value in the location of the data already extracted,\nby default).
\n
\n\n
The example below is composed of two cells.
\n\n
\n
The first cell is only responsible for defining the variables extraction_type and write_type, depending on the extraction type: Delta Init (LOAD_TYPE = INIT) or Delta (LOAD_TYPE = DELTA). The variables in this cell will also be referenced by other acons/examples in this notebook, similar to what you would do in your pipelines/jobs, defining this centrally and then re-using it.
\n
The second cell is where the acon to be used is defined (which uses the two variables extraction_type and\nwrite_type defined) and the load_data algorithm is executed to perform the extraction.
\n
\n\n
\n\n
There may be cases where you might want to always extract fully from the source DSO. In these cases, you only need to use a Delta Init every time, meaning you would use "extraction_type": "init" and "write_type": "overwrite" as shown below. The explanation of what a Delta Init/Delta is applies to all the scenarios presented in this notebook.
\n\n
\n\n
\n
from lakehouse_engine.engine import load_data

LOAD_TYPE = "INIT" or "DELTA"

if LOAD_TYPE == "INIT":
    extraction_type = "init"
    write_type = "overwrite"
else:
    extraction_type = "delta"
    write_type = "append"

acon = {
    "input_specs": [
        {
            "spec_id": "my_identifier_source",
            "read_type": "batch",
            # You should use this custom reader to benefit from the lakehouse-engine utils for extractions from SAP BW
            "data_format": "sap_bw",
            "options": {
                "user": "my_user",
                "password": "my_hana_pwd",
                "url": "my_sap_bw_url",
                "dbtable": "my_database.my_table",
                "odsobject": "my_ods_object",
                "changelog_table": "my_database.my_changelog_table",
                "latest_timestamp_data_location": "s3://my_path/my_identifier/",
                "extraction_type": extraction_type,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_identifier_bronze",
            "input_id": "my_identifier_source",
            "write_type": write_type,
            "data_format": "delta",
            "partitions": ["actrequest_timestamp"],
            "location": "s3://my_path/my_identifier/",
        }
    ],
    "exec_env": {
        "spark.databricks.delta.schema.autoMerge.enabled": True,
        "spark.databricks.delta.optimizeWrite.enabled": True,
        "spark.databricks.delta.autoCompact.enabled": True,
    },
}

load_data(acon=acon)
\n
\n\n
2 - Parallel extraction
\n\n
In this section, 6 possible scenarios for parallel extractions from SAP BW DSOs are presented.
\n\n
2.1 - Parallel Extraction, Simplest Scenario
\n\n
This scenario provides the simplest example you can have for a parallel extraction from SAP BW, only using the property numPartitions. The goal of the scenario is to cover the case in which people do not have much knowledge around how to optimize the extraction from JDBC sources or cannot identify a column that can be used to split the extraction in several tasks. This scenario can also be used if the use case does not have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of the extraction to its maximum potential. In the example below, "numPartitions": 10 is specified, meaning that Spark will open 10 parallel connections to the source DSO and automatically decide how to parallelize the extraction upon that requirement. This is the only change compared to the example provided in example 1.
2.2 - Parallel Extraction, Provide upper_bound (Recommended)
\n\n
This scenario performs the extraction from the SAP BW DSO in parallel, but is more concerned with trying to\noptimize and have more control (compared to 2.1 example) on how the extraction is split and performed, using\nthe following options:
\n\n
\n
numPartitions - number of Spark partitions to split the extraction.
\n
partitionColumn - column used to split the extraction. It must be a numeric, date, or timestamp.\nIt should be a column that is able to split the extraction evenly in several tasks. An auto-increment\ncolumn is usually a very good candidate.
\n
lowerBound - lower bound to decide the partition stride.
\n
upperBound - upper bound to decide the partition stride. It can either be provided (as it is done in\nthis example) or derived automatically by our upperBound optimizer (example 2.3).
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as\nthe partitionColumn. If you compare with the previous example, you'll notice that now numPartitions and\nthree additional options are provided to fine tune the extraction (partitionColumn, lowerBound,\nupperBound).
\n\n
When these 4 properties are used, Spark will use them to build several queries to split the extraction.
\n\n
Example: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound: 1\", \"upperBound: 100\",\nSpark will generate 10 queries like this:
\n\n
\n
SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL
\n
SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20
\n
SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30
This scenario is very similar to 2.2, the only difference being that upper_bound is not provided. Instead, the property calculate_upper_bound set to true is used to benefit from the automatic calculation of the upperBound (derived from the partitionColumn) offered by the lakehouse-engine framework, which is useful, as in most cases you will probably not be aware of the max value for the column. The only thing you need to consider is that if you use this automatic calculation of the upperBound, you will be doing an initial query to the SAP BW DSO to retrieve the max value of the partitionColumn, before doing the actual query to perform the extraction.
This scenario covers the case, in which you might want to backfill the data extracted from a SAP BW DSO and\nmade available in the bronze layer. By default, the delta extraction considers the max value of the column\nactrequest_timestamp on the data already extracted. However, there might be cases, in which you might want\nto extract a delta from a particular timestamp onwards or for a particular interval of time. For this, you\ncan use the properties min_timestamp and max_timestamp.
\n\n
Below, a very similar example to the previous one is provided, the only differences being that the properties "min_timestamp": "20210910000000" and "max_timestamp": "20210913235959" are provided, meaning it will extract the data from the changelog table using the filter actrequest_timestamp > "20210910000000" and actrequest_timestamp <= "20210913235959", regardless of whether some of the data is already available in the destination or not. Moreover, note that the property latest_timestamp_data_location does not need to be provided, as the timestamps to be considered are directly provided (if both the timestamps and the latest_timestamp_data_location are provided, the latter will have no effect). Additionally, "extraction_type": "delta" and "write_type": "append" are forced, instead of using the variables as in the other examples, because the backfilling scenario only makes sense for delta extractions.
\n\n
\n\n
Note: be aware that the backfilling example being shown has no mechanism to enforce that\nyou don't generate duplicated data in bronze. For your scenarios, you can either use this example and solve\nany duplication in the silver layer or extract the delta with a merge strategy while writing to bronze,\ninstead of appending.
2.5 - Parallel Extraction, Provide Predicates (Recommended)
\n\n
This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction:
\n\n
\n
partitionColumn - column used to split the extraction. It can be of any type.
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the partitionColumn, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise those would probably be recommended).
\n\n
When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.
\n\n
Below, the lakehouse function to generate the predicates list automatically is presented.
\n\n
This function needs to be used carefully, especially regarding the predicates_query and predicates_add_null variables.
\n\n
predicates_query: in the sample below the whole table is being considered (select distinct(x) from table), but it is possible to filter the predicates list here, especially if you are applying a filter in the transformations spec and you know the entire table won't be necessary, so you can change it to something like this: select distinct(x) from table where x > y.
\n\n
predicates_add_null: you can decide whether you want to consider null in the predicates list or not. By default, this property is True.
\n\n
\n
```python
from lakehouse_engine.engine import load_data

LOAD_TYPE = "INIT" or "DELTA"

if LOAD_TYPE == "INIT":
    extraction_type = "init"
    write_type = "overwrite"
else:
    extraction_type = "delta"
    write_type = "append"

# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers
# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
    JDBCExtraction,
    JDBCExtractionUtils,
)

ExecEnv.get_or_create()

partition_column = "my_partition_column"
dbtable = "my_database.my_table"

predicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""
column_for_predicates = partition_column
user = "my_user"
password = "my_hana_pwd"
url = "my_bw_url"
predicates_add_null = True

jdbc_util = JDBCExtractionUtils(
    JDBCExtraction(
        user=user,
        password=password,
        url=url,
        dbtable=dbtable,
        partition_column=partition_column,
    )
)

predicates = jdbc_util.get_predicates(predicates_query)

acon = {
    "input_specs": [
        {
            "spec_id": "my_identifier_source",
            "read_type": "batch",
            "data_format": "sap_bw",
            "options": {
                "user": "my_user",
                "password": "my_hana_pwd",
                "url": "my_sap_bw_url",
                "dbtable": "my_database.my_table",
                "odsobject": "my_ods_object",
                "latest_timestamp_data_location": "s3://my_path/my_identifier/",
                "extraction_type": extraction_type,
                "predicates": predicates,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_identifier_bronze",
            "input_id": "my_identifier_source",
            "write_type": write_type,
            "data_format": "delta",
            "partitions": ["actrequest_timestamp"],
            "location": "s3://my_path/my_identifier/",
        }
    ],
    "exec_env": {
        "spark.databricks.delta.schema.autoMerge.enabled": True,
        "spark.databricks.delta.optimizeWrite.enabled": True,
        "spark.databricks.delta.autoCompact.enabled": True,
    },
}

load_data(acon=acon)
```
This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction:
\n\n
\n
partitionColumn - column used to split the extraction. It can be of any type.
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the partitionColumn, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise those would probably be recommended).
\n\n
When this property is used, the lakehouse engine will generate the predicates to be used to extract data from the source. What the lakehouse engine does is check, for the init/delta portion of the data, what the distinct values of the partitionColumn serving that data are. Then, these values will be used by Spark to generate several queries to extract from the source in a parallel fashion. Each distinct value of the partitionColumn will be a query, meaning that you will not have control over the number of partitions used for the extraction. For example, if you face a scenario in which you are using a partitionColumn LOAD_DATE and, for today's delta, all the data (let's suppose 2 million rows) is served by a single LOAD_DATE = 20200101, that would mean Spark would use a single partition to extract everything. In this extreme case you would probably need to change your partitionColumn. Note: these extreme cases are less likely to happen when you use the strategy of scenarios 2.2/2.3.
\n\n
Example: for \"partitionColumn\": \"record\"\nGenerate predicates:
\n\n
\n
SELECT DISTINCT(RECORD) as RECORD FROM dummy_table
This scenario is based on the best practices of scenario 2.2, but it is ready to extract data from Write Optimized DSOs, which have the changelog embedded in the active table, instead of having a separate changelog table. For this reason, you need to specify that the changelog_table parameter value is equal to the dbtable parameter value.
Moreover, these tables usually already include the changelog technical columns like RECORD and DATAPAKID, for example, which the framework adds by default. Thus, you need to specify "include_changelog_tech_cols": False to change this behaviour.
Finally, you also need to specify the name of the column in the table that can be used to join with the activation requests table to get the timestamp of the several requests/deltas, which is "actrequest" by default (e.g. "request_col_name": 'request').
3.1 - Extraction from Write Optimized DSOs, Get ACTREQUEST_TIMESTAMP from Activation Requests Table
\n\n
By default, the act_request_timestamp has been hardcoded (it either assumes a given extraction_timestamp or the current timestamp) in the init extraction; however, this may cause problems when merging changes in silver for Write Optimized DSOs. So, a new possibility to choose when to retrieve this timestamp from the act_req_table was added.
\n\n
This scenario performs the data extraction from Write Optimized DSOs, forcing the actrequest_timestamp to\nassume the value from the activation requests table (timestamp column).
\n\n
This feature is only available for WODSOs and to use it you need to specify \"get_timestamp_from_actrequest\": True.
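As a hedged sketch, the options block for such a Write Optimized DSO could combine the parameters described above as follows (table names, credentials and paths are placeholders; only the parameter names come from this page):

```python
# Sketch of the sap_bw options for a Write Optimized DSO (WODSO).
wodso_options = {
    "user": "my_user",
    "password": "my_hana_pwd",
    "url": "my_sap_bw_url",
    "odsobject": "my_ods_object",
    "dbtable": "my_database.my_wodso_table",
    # WODSOs embed the changelog in the active table, so changelog_table == dbtable:
    "changelog_table": "my_database.my_wodso_table",
    # the changelog technical columns (RECORD, DATAPAKID, ...) already exist in the table:
    "include_changelog_tech_cols": False,
    # column used to join with the activation requests table ("actrequest" by default):
    "request_col_name": "request",
    # take the actrequest_timestamp from the activation requests table:
    "get_timestamp_from_actrequest": True,
    "latest_timestamp_data_location": "s3://my_path/my_identifier/",
    "extraction_type": "delta",
}
```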
One of the most important parameters to optimise the extraction is the partitionColumn, as you can see in the template. Thus, this section helps you figure out if a column is a good candidate or not.
\n\n
Basically the partition column needs to be a column which is able to adequately split the processing, which means we can use it to \"create\" different queries with intervals/filters, so that the Spark tasks process similar amounts of rows/volume. Usually a good candidate is an integer auto-increment technical column.
\n\n
\n\n
Although RECORD is usually a good candidate, it is usually available on the changelog table only, meaning that you would need to use a different strategy for the init. In case you don't have good candidates for partitionColumn, you can use the sample acon provided in the scenario 2.1 in the template above. It might make sense to use scenario 2.1 for the init and then scenario 2.2 or 2.3 for the subsequent deltas.
\n\n
\n\n
When there is no int, date or timestamp good candidate for partitionColumn:
\n\n
In this case you can opt for the scenario 2.5 - Generate Predicates, which supports any kind of column to be defined as partitionColumn.
\n\n
However, you should still analyse whether the column you have in mind is a good candidate or not. In this scenario, Spark will create one query per distinct value of the partitionColumn, so you can perform some analysis on how many distinct values there are and how evenly they split the data.
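As a sketch of that analysis, assuming a Spark session and a table you can already query (the names below are hypothetical), you could check how many distinct values the candidate column has and how evenly they split the rows:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

candidate = "my_candidate_column"  # hypothetical candidate partitionColumn

# Rows per distinct value: a good candidate has enough distinct values and a
# reasonably even row count per value (no single value holding most of the rows).
distribution = (
    spark.table("my_database.my_table")
    .groupBy(candidate)
    .count()
    .orderBy(F.desc("count"))
)
distribution.show(20, truncate=False)
print("distinct values:", distribution.count())
```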
Secure File Transfer Protocol (SFTP) is a file protocol for transferring files over a secure (SSH) connection.
\n\n
This feature is available in the Lakehouse Engine with the purpose of having a mechanism to read data directly from SFTP directories without moving those files manually/physically to an S3 bucket.
\n\n
The engine uses Pandas to read the files and converts them into a Spark dataframe, which makes the usual resources of an ACON available, such as dq_specs, output_specs, terminator_specs and transform_specs.
\n\n
Furthermore, this feature provides several filters on the directories that make it easier to control the extractions.
\n\n
Introductory Notes:
\n\n
There are important parameters that must be added to input specs in order to make the SFTP extraction work properly:
\n\n
\n\n
read_type - The engine supports only batch mode for this feature.
\n\n
\n\n
sftp_files_format - File format that will be used to read data from SFTP. The engine supports: CSV, FWF, JSON and XML.
\n\n
location - The SFTP directory to be extracted. If it is necessary to filter a specific file, it can be made using the file_name_contains option.
\n\n
options - Arguments used to set the Paramiko SSH client connection (hostname, username, password, port...), set the filter to retrieve files and set the file parameters (separators, headers, cols...). For more information about the file parameters, please go to the Pandas link in the useful links section.
\n\n
The options allowed are:
\n\n
\n\n
\n
| Property type | Detail | Example | Comment |
| --- | --- | --- | --- |
| Connection | add_auto_policy (str) | true or false | Indicates whether to allow an SFTP connection using no host key. When a connection attempt is made using no host key and add_auto_policy is false, the engine will throw an exception. The purpose of this flag is to make the user consciously choose a less secure connection. |
| Connection | key_type (str) | "Ed25519" or "RSA" | Indicates the key type to be used for the connection (SSH, Ed25519). |
| Connection | key_filename (str) | "/path/to/private_key/private_key.ppk" | The filename, or list of filenames, of optional private key(s) and/or certs to try for authentication. It must be used with a pkey in order to add a policy. If a pkey is not provided, then use add_auto_policy. |
| Connection | | | Value to use for the host key when connecting to the remote SFTP server. |
| Filter | date_time_gt (str) | "1900-01-01" or "1900-01-01 08:59:59" | Filter the files greater than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS". |
| Filter | date_time_lt (str) | "3999-12-31" or "3999-12-31 20:59:59" | Filter the files lower than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS". |
| Filter | earliest_file (bool) | true or false | Filter the earliest dated file in the directory. |
| Filter | file_name_contains (str) | "part_of_filename" | Filter files that match the pattern. |
| Filter | latest_file (bool) | true or false | Filter the most recent dated file in the directory. |
| Read data from subdirectories | sub_dir (bool) | true or false | The engine will search for files in subdirectories of the location, considering one level below the root location given. When sub_dir is used with the latest_file/earliest_file argument, the engine will retrieve the latest/earliest file for each subdirectory. |
| Add metadata info | file_metadata (bool) | true or false | When this option is set to True, the dataframe retrieves the filename (with location) and the modification_time from the original files in SFTP, attaching these two columns to the respective records. |
The scenario below shows the extraction of a CSV file using most of the available filter options. Also, as an example, the column "created_on" is created in the transform_specs in order to store the processing date for every record. As a result, the output table will have the original file date (provided by the option file_metadata) and the processing date from the engine.
\n\n
For an incremental load approach, it is advised to use the \"modification_time\" column created by the option file_metadata. Since it has the original file date of modification, this date can be used in the logic to control what is new and has been changed recently.
\n\n
\n\n
The scenario below uses "add_auto_policy": true, which is not recommended.
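A minimal sketch of what such an ACON could look like, using a subset of the options from the table above (host, credentials, paths and filter values are placeholders, and the CSV/Pandas file parameters are omitted):

```python
from lakehouse_engine.engine import load_data

# Sketch only: SFTP CSV read using some of the filter options described above.
acon = {
    "input_specs": [
        {
            "spec_id": "my_sftp_csv_source",
            "read_type": "batch",
            "data_format": "sftp",
            "sftp_files_format": "csv",
            "location": "my_sftp_directory/",
            "options": {
                "hostname": "my_sftp_host",
                "username": "my_user",
                "password": "my_pwd",
                "port": 22,
                "add_auto_policy": True,  # not recommended, see the note above
                "file_name_contains": "part_of_filename",
                "latest_file": True,
                "file_metadata": True,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_sftp_csv_bronze",
            "input_id": "my_sftp_csv_source",
            "write_type": "append",
            "data_format": "delta",
            "location": "s3://my_path/my_identifier/",
        }
    ],
}

load_data(acon=acon)
```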
The following scenario shows the extraction of a JSON file using RSA pkey authentication instead of add_auto_policy. The engine supports Ed25519Key and RSA for pkeys.
\n\n
For the pkey file location, it is important to have the file in a location accessible by the cluster. This can be achieved either by mounting the location or with volumes.
\n\n
\n\n
This scenario uses a more secure authentication, thus it is the recommended option, instead of the previous scenario.
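For this key-based scenario, a hedged sketch of how the connection options could change (paths and host values are placeholders; pkey stands for the host key value referred to in the options table above):

```python
# Sketch of the connection options for the key-based (recommended) scenario.
sftp_key_options = {
    "hostname": "my_sftp_host",
    "username": "my_user",
    "port": 22,
    "key_type": "RSA",  # the engine supports Ed25519 and RSA
    "key_filename": "/path/to/private_key/private_key.ppk",
    "pkey": "my_host_key_value",  # host key value, used together with key_filename
    "file_name_contains": "part_of_filename",
    "file_metadata": True,
}
```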
Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.
\n\n
\n\n
Introduction
\n\n
Many databases allow a JDBC connection to extract data. Our engine has one reader where you can configure all the necessary definitions to connect to a database using JDBC.
\n\n
In the next section you will find several examples about how to do it.
\n\n
The Simplest Scenario using sqlite
\n\n
\n\n
Not parallel - Recommended for smaller datasets only, or when stressing the source system is a high concern
\n\n
\n\n
This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source.
\n\n
Here we use a sqlite database where any connection is allowed. Due to that, we do not specify any username or password.
\n\n
As in Spark, we provide two different ways to run the JDBC reader.
\n\n
1 - We can use the jdbc() function, passing inside all the arguments needed for Spark to work, and we can even combine this with additional options passed through .options().
\n\n
2 - The other way is using .format("jdbc") and passing all necessary arguments through .options(). It is important to note that by choosing jdbc() we can also add options() to the execution.
\n\n
You can find and run the following code in our local test for the engine.
\n\n
jdbc() function
\n\n
As we can see in the next cell, all the arguments necessary to establish the jdbc connection are passed inside the jdbc_args object. Here we find the url, the table, and the driver. Besides that, we can add options, such as the partition number. The partition number will impact the queries' parallelism.
\n\n
The code below is an example of how to use the jdbc() function in our ACON. As for other cases, the acon configuration should be executed with load_data using:
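A minimal sketch, assuming a local sqlite database as in our tests (url, table, driver and paths are placeholders, and the exact nesting inside jdbc_args mirrors Spark's jdbc() arguments):

```python
from lakehouse_engine.engine import load_data

# Sketch only: JDBC read through the jdbc() function (jdbc_args), combined with extra options.
acon = {
    "input_specs": [
        {
            "spec_id": "my_jdbc_source",
            "read_type": "batch",
            "data_format": "jdbc",
            "jdbc_args": {
                "url": "jdbc:sqlite:/app/tests/my_database.db",
                "table": "my_table",
                "properties": {"driver": "org.sqlite.JDBC"},
            },
            "options": {
                "numPartitions": 1,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_jdbc_bronze",
            "input_id": "my_jdbc_source",
            "write_type": "overwrite",
            "data_format": "delta",
            "location": "file:///app/tests/my_jdbc_bronze/data",
        }
    ],
}

load_data(acon=acon)
```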
In this example we do not use the jdbc_args object. All the JDBC connection parameters are inside the options object instead. As for other cases, the acon configuration should be executed with load_data using:
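A minimal sketch of the same read expressed only through options (again with placeholder values):

```python
from lakehouse_engine.engine import load_data

# Sketch only: every JDBC connection parameter lives inside "options",
# as when using .format("jdbc") with .options() directly in Spark.
acon = {
    "input_specs": [
        {
            "spec_id": "my_jdbc_source",
            "read_type": "batch",
            "data_format": "jdbc",
            "options": {
                "url": "jdbc:sqlite:/app/tests/my_database.db",
                "dbtable": "my_table",
                "driver": "org.sqlite.JDBC",
                "numPartitions": 1,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_jdbc_bronze",
            "input_id": "my_jdbc_source",
            "write_type": "overwrite",
            "data_format": "delta",
            "location": "file:///app/tests/my_jdbc_bronze/data",
        }
    ],
}

load_data(acon=acon)
```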
In this template we will use SAP to provide a more complete and runnable example. These definitions can be used with several databases that allow a JDBC connection.
\n\n
The following scenarios of extractions are covered:
\n\n
\n
1 - The Simplest Scenario (Not parallel - Recommended for smaller datasets only,\nor when stressing the source system is a high concern)
\n
2 - Parallel extraction \n
\n
2.1 - Simplest Scenario
\n
2.2 - Provide upperBound (Recommended)
\n
2.3 - Provide predicates (Recommended)
\n
\n
\n\n
\n\n
Disclaimer: this template only uses SAP as a demonstration example for a JDBC connection. This isn't an SAP template! If you are looking to extract data from SAP, please use our sap_b4 reader or the sap_bw reader.
\n\n
\n\n
The JDBC connection has 2 main sections to be filled, the jdbc_args and options:
\n\n
\n
jdbc_args - Here you need to fill everything related to jdbc connection itself, like table/query, url, user,\n..., password.
\n
options - This section is more flexible, and you can provide additional options like "fetchSize", "batchSize", "numPartitions", ..., "upperBound" and "lowerBound".
\n
\n\n
If you want to know more regarding jdbc spark options you can follow the link below:
1 - The Simplest Scenario (Not parallel - Recommended for smaller datasets, or for not stressing the source)
\n\n
This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source. It should only be used in case the dataset you want to extract is small, with no big requirements in terms of performance to fulfill. When extracting from the source, we can have two options:
\n\n
\n
Delta Init - full extraction of the source. You should use it the first time you extract from the source or any time you want to re-extract completely. Similar to a so-called full load.
\n
Delta - extracts the portion of the data that is new or has changed in the source since the last extraction (for that, the logic at the transformation step needs to be applied). In the examples below, the logic using the REQTSN column is applied, which means that the maximum value on bronze is computed and used to filter the incoming data from the data source.
In this section we present 3 possible scenarios for parallel extractions from JDBC sources.
\n\n
\n\n
Disclaimer for parallel extraction: Parallel extractions can bring a jdbc source down if a lot of stress\nis put on the system. Be careful when choosing the number of partitions. \nSpark is a distributed system and can lead to many connections.
\n\n
\n\n
2.1 - Parallel Extraction, Simplest Scenario
\n\n
This scenario provides the simplest example you can have for a parallel extraction from JDBC sources, only using\nthe property numPartitions. The goal of the scenario is to cover the case in which people do not have\nmuch experience around how to optimize the extraction from JDBC sources or cannot identify a column that can\nbe used to split the extraction in several tasks. This scenario can also be used if the use case does not\nhave big performance requirements/concerns, meaning you do not feel the need to optimize the performance of\nthe extraction to its maximum potential.
\n\n
In the example below, "numPartitions": 10 is specified, meaning that Spark will open 10 parallel connections to the source and automatically decide how to parallelize the extraction upon that requirement. This is the only change compared to the example provided in scenario 1.
2.2 - Parallel Extraction, Provide upper_bound (Recommended)
\n\n
This scenario performs the extraction from the JDBC source in parallel, but is more concerned with optimizing and having more control (compared to example 2.1) over how the extraction is split and performed, using the following options:
\n\n
\n
numPartitions - number of Spark partitions to split the extraction.
\n
partitionColumn - column used to split the extraction. It must be a numeric, date, or timestamp.\nIt should be a column that is able to split the extraction evenly in several tasks. An auto-increment\ncolumn is usually a very good candidate.
\n
lowerBound - lower bound to decide the partition stride.
\n
upperBound - upper bound to decide the partition stride.
\n
\n\n
This is an adequate example to be followed if there is a column in the data source that is good to\nbe used as the partitionColumn. Comparing with the previous example,\nthe numPartitions and three additional options to fine tune the extraction (partitionColumn, lowerBound,\nupperBound) are provided.
\n\n
When these 4 properties are used, Spark will use them to build several queries to split the extraction. Example: for "numPartitions": 10, "partitionColumn": "record", "lowerBound": 1, "upperBound": 100, Spark will generate 10 queries like:
\n\n
\n
SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL
\n
SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20
\n
SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30
2.3 - Parallel Extraction with Predicates (Recommended)
\n\n
This scenario performs the extraction from JDBC source in parallel, useful in contexts where there aren't\nnumeric, date or timestamp columns to parallelize the extraction:
\n\n
\n
partitionColumn - column used to split the extraction (can be of any type).
\n
\n\n
\n
This is an adequate example to be followed if there is a column in the data source that is good to be used as the partitionColumn, especially if that column does not comply with scenario 2.2.
\n
\n\n
When this property is used, all predicates need to be provided to Spark, otherwise it will leave data behind.
\n\n
Below, a lakehouse function to generate the predicates list automatically is presented.
\n\n
When using this function one needs to be careful, especially with the predicates_query and predicates_add_null variables.
\n\n
predicates_query: in the sample below the whole table (select distinct(x) from table) is being considered, but it is possible to filter the predicates list here, especially if you are applying a filter in the transformations spec and you know the entire table won't be necessary, so you can change it to something like this: select distinct(x) from table where x > y.
\n\n
predicates_add_null: you can decide whether to consider null in the predicates list or not. By default, this property is True. Example: for "partitionColumn": "record"
This scenario is very similar to the full load, but it filters the data coming from the source, instead of doing a complete full load.\nAs for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"csv",\n"options":{\n"header":true,\n"delimiter":"|",\n"inferSchema":true\n},\n"location":"file:///app/tests/lakehouse/in/feature/full_load/with_filter/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"filtered_sales",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"expression_filter",\n"args":{\n"exp":"date like '2016%'"\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"filtered_sales",\n"write_type":"overwrite",\n"data_format":"parquet",\n"location":"file:///app/tests/lakehouse/out/feature/full_load/with_filter/data"\n}\n]\n}\n
\n
\n\n
Relevant notes:
\n\n
\n
As seen in the ACON, the filtering capabilities are provided by a transformer called expression_filter, where you can provide a custom Spark SQL filter.
This scenario is very similar to the Filtered Full Load, but we only replace a subset of the partitions, leaving the other ones untouched, so we don't replace the entire table. This capability is very useful for backfilling scenarios.\nAs for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"csv",\n"options":{\n"header":true,\n"delimiter":"|",\n"inferSchema":true\n},\n"location":"file:///app/tests/lakehouse/in/feature/full_load/with_filter_partition_overwrite/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"filtered_sales",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"expression_filter",\n"args":{\n"exp":"date like '2016%'"\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"filtered_sales",\n"write_type":"overwrite",\n"data_format":"delta",\n"partitions":[\n"date",\n"customer"\n],\n"location":"file:///app/tests/lakehouse/out/feature/full_load/with_filter_partition_overwrite/data",\n"options":{\n"replaceWhere":"date like '2016%'"\n}\n}\n]\n}\n
\n
\n\n
Relevant notes:
\n\n
\n
The key option for this scenario in the ACON is the replaceWhere, which we use to only overwrite a specific period of time, that realistically can match a subset of all the partitions of the table. Therefore, this capability is very useful for backfilling scenarios.
Related to the schema, we can perform two kinds of operations:
\n\n
\n
Flatten Schema: transformer named \"flatten_schema\" used to flatten the schema of dataframe.
\n\n
\n
Parameters to be defined:\n
\n
max_level: 2 => this sets the level down to which you want to flatten the schema.
\n
shorten_names: True => set this flag when you want to shorten the names of the prefixes of the fields.
\n
alias: True => this flag is used when you want to define a prefix for the column to be flattened.
\n
num_chars: 7 => this sets the number of characters to consider when shortening the names of the fields.
\n
ignore_cols: [...] => this list value should be set to specify the columns you don't want to flatten.
\n
\n
\n
Explode Columns: transformer named \"explode_columns\" used to explode columns with types ArrayType and MapType.
\n\n
\n
Parameters to be defined:\n
\n
explode_arrays: True => this flag should be set to true to explode all array columns present in the dataframe.
\n
array_cols_to_explode: ["sample_col"] => this list value should be set to specify the array columns you want to explode.
\n
explode_maps: True => this flag should be set to true to explode all map columns present in the dataframe.
\n
map_cols_to_explode: ["map_col"] => this list value should be set to specify the map columns you want to explode.
\n
\n
Recommendation: use array_cols_to_explode and map_cols_to_explode to specify the columns you want to explode, instead of exploding all of them.
\n
\n
\n\n
The scenario below uses flatten_schema to transform one or more columns, splitting the nested content into more columns, as desired. We define the number of levels of nested values we want to flatten in the schema. In this case, we are just setting a max_level of 2. As for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"json",\n"schema_path":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/source_schema.json",\n"location":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"sales_source",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"rename",\n"args":{\n"cols":{\n"date":"date2",\n"customer":"customer2"\n}\n}\n},\n{\n"function":"with_expressions",\n"args":{\n"cols_and_exprs":{\n"constant":"'just a constant'",\n"length_customer2":"length(customer2)"\n}\n}\n},\n{\n"function":"from_json",\n"args":{\n"input_col":"sample",\n"schema":{\n"type":"struct",\n"fields":[\n{\n"name":"field1",\n"type":"string",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field2",\n"type":"string",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field3",\n"type":"double",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field4",\n"type":{\n"type":"struct",\n"fields":[\n{\n"name":"field1",\n"type":"string",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field2",\n"type":"string",\n"nullable":true,\n"metadata":{}\n}\n]\n},\n"nullable":true,\n"metadata":{}\n}\n]\n}\n}\n},\n{\n"function":"to_json",\n"args":{\n"in_cols":[\n"item",\n"amount"\n],\n"out_col":"item_amount_json"\n}\n},\n{\n"function":"flatten_schema",\n"args":{\n"max_level":2\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"sales_source",\n"write_type":"append",\n"data_format":"delta",\n"location":"file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_schema/batch/data"\n}\n]\n}\n
\n
\n\n
The explode_arrays scenario transforms the array columns into one or more rows, depending on the number of elements, i.e., it replicates the row for each array value. In this case we are exploding all array columns, setting explode_arrays to true. As for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"json",\n"schema_path":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/source_schema.json",\n"location":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"sales_source",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"rename",\n"args":{\n"cols":{\n"date":"date2",\n"customer":"customer2"\n}\n}\n},\n{\n"function":"with_expressions",\n"args":{\n"cols_and_exprs":{\n"constant":"'just a constant'",\n"length_customer2":"length(customer2)"\n}\n}\n},\n{\n"function":"to_json",\n"args":{\n"in_cols":[\n"item",\n"amount"\n],\n"out_col":"item_amount_json"\n}\n},\n{\n"function":"explode_columns",\n"args":{\n"explode_arrays":true\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"sales_source",\n"write_type":"append",\n"data_format":"delta",\n"location":"file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/explode_arrays/batch/data"\n}\n]\n}\n
\n
\n\n
The flatten_and_explode_arrays_and_maps scenario uses flatten_schema and explode_columns to achieve the desired output. In this case, the desired output is to flatten the whole schema and explode maps and arrays, even when an array is inside a struct. Steps:
\n\n
1. In this case, we have an array column inside a struct column, so first we need to use the `flatten_schema` transformer to extract the columns inside that struct;\n2. Then, we are able to explode all the array columns desired and map columns, using `explode_columns` transformer.\n3. To be able to have the map column in 2 columns, we use again the `flatten_schema` transformer.\n
\n\n
As for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"json",\n"schema_path":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/source_schema.json",\n"location":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"sales_source",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"rename",\n"args":{\n"cols":{\n"date":"date2",\n"customer":"customer2"\n}\n}\n},\n{\n"function":"with_expressions",\n"args":{\n"cols_and_exprs":{\n"constant":"'just a constant'",\n"length_customer2":"length(customer2)"\n}\n}\n},\n{\n"function":"from_json",\n"args":{\n"input_col":"agg_fields",\n"schema":{\n"type":"struct",\n"fields":[\n{\n"name":"field1",\n"nullable":true,\n"metadata":{},\n"type":{\n"containsNull":true,\n"elementType":"string",\n"type":"array"\n}\n},\n{\n"name":"field2",\n"type":{\n"type":"struct",\n"fields":[\n{\n"name":"field1",\n"type":"string",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field2",\n"type":"string",\n"nullable":true,\n"metadata":{}\n}\n]\n},\n"nullable":true,\n"metadata":{}\n}\n]\n}\n}\n},\n{\n"function":"to_json",\n"args":{\n"in_cols":[\n"item",\n"amount"\n],\n"out_col":"item_amount_json"\n}\n},\n{\n"function":"flatten_schema",\n"args":{\n"max_level":2\n}\n},\n{\n"function":"explode_columns",\n"args":{\n"explode_arrays":true,\n"map_cols_to_explode":[\n"sample"\n]\n}\n},\n{\n"function":"flatten_schema",\n"args":{\n"max_level":2\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"sales_source",\n"write_type":"append",\n"data_format":"delta",\n"location":"file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/batch/data"\n}\n]\n}\n
This scenario reads CSV data from a path and writes in full to another path with delta lake files.
\n\n
Relevant notes
\n\n
\n
This ACON infers the schema automatically through the option inferSchema (we use it for local tests only). This is usually not a best practice when using CSV files, and you should provide a schema through the InputSpec variables schema_path, read_schema_from_table or schema.
\n
The transform_specs in this case are purely optional, and we basically use the repartition transformer to create one partition per combination of date and customer. This does not mean you have to use this in your algorithm.
\n
A full load is also adequate for an init load (initial load).
\n
\n\n
As for other cases, the acon configuration should be executed with load_data using:
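A hedged sketch of such an ACON, based on the notes above (paths are placeholders and the repartition arguments are an assumption of how one partition per date/customer combination could be requested):

```python
from lakehouse_engine.engine import load_data

# Sketch only: CSV full load with schema inference (local tests only) and a full overwrite to delta.
acon = {
    "input_specs": [
        {
            "spec_id": "sales_source",
            "read_type": "batch",
            "data_format": "csv",
            "options": {"header": True, "delimiter": "|", "inferSchema": True},
            "location": "file:///app/tests/lakehouse/in/feature/full_load/full_overwrite/data",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "repartitioned_sales",
            "input_id": "sales_source",
            "transformers": [
                # optional: one partition per date/customer combination (argument name assumed)
                {"function": "repartition", "args": {"cols": ["date", "customer"]}}
            ],
        }
    ],
    "output_specs": [
        {
            "spec_id": "sales_bronze",
            "input_id": "repartitioned_sales",
            "write_type": "overwrite",
            "data_format": "delta",
            "location": "file:///app/tests/lakehouse/out/feature/full_load/full_overwrite/data",
        }
    ],
}

load_data(acon=acon)
```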
Don't use this feature if the Lakehouse Engine already has a supported data format for your use case, as in that case it is preferred to use the dedicated data formats which are more extensively tested and predictable. Check the supported data formats here.
\n\n
\n\n
Reading from a Spark DataFrame is very simple using our framework. You just need to define the input_specs as follows:
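A minimal sketch, assuming my_df is a DataFrame already available in your session and that the DataFrame is handed over through a df_name parameter (the parameter name is an assumption here; the "dataframe" data format is the supported input format):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
my_df = spark.range(10)  # any DataFrame you already computed

# Sketch only: dataframe input spec; the rest of the ACON (transform/output specs) is defined as usual.
input_specs = [
    {
        "spec_id": "my_df_source",
        "read_type": "batch",
        "data_format": "dataframe",
        "df_name": my_df,  # parameter name assumed for illustration
    }
]
```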
This scenario illustrates an append load done via streaming instead of batch, providing an efficient way of picking up new files from an S3 folder, instead of relying on the incremental filtering from the source needed from a batch based process (see append loads in batch from a JDBC source to understand the differences between streaming and batch append loads). However, not all sources (e.g., JDBC) allow streaming.\nAs for other cases, the acon configuration should be executed with load_data using:
In this scenario, we use DROPMALFORMED read mode, which drops rows that do not comply with the provided schema;
\n
In this scenario, the schema is provided through the input_spec \"schema\" variable. This removes the need of a separate JSON Spark schema file, which may be more convenient in certain cases.
\n
As can be seen, we use the output_spec Spark option checkpointLocation to specify where to save the checkpoints indicating what we have already consumed from the input data. This allows fault-tolerance if the streaming job fails, but more importantly, it allows us to run a streaming job using AvailableNow and the next job automatically picks up the stream state since the last checkpoint, allowing us to do efficient append loads without having to manually specify incremental filters as we do for batch append loads.
Streaming Append Load with Optimize Dataset Terminator
\n\n
This scenario includes a terminator which optimizes a dataset (table), being able to vacuum the table, optimise it with z-order or not, compute table statistics and more. You can find more details on the Terminator here.
\n\n
As for other cases, the acon configuration should be executed with load_data using:
Streaming Delta Load with Group and Rank Condensation
\n\n
This scenario is useful for when we want to do delta loads based on changelogs that need to be first condensed based on a group by and then a rank only, instead of the record mode logic in the record mode based change data capture.\nAs for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_bronze",\n"read_type":"streaming",\n"data_format":"csv",\n"schema_path":"file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/source_schema.json",\n"with_filepath":true,\n"options":{\n"mode":"FAILFAST",\n"header":true,\n"delimiter":"|"\n},\n"location":"file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"sales_bronze_with_extraction_date",\n"input_id":"sales_bronze",\n"transformers":[\n{\n"function":"with_regex_value",\n"args":{\n"input_col":"lhe_extraction_filepath",\n"output_col":"extraction_date",\n"drop_input_col":true,\n"regex":".*WE_SO_SCL_(\\\\d+).csv"\n}\n},\n{\n"function":"with_auto_increment_id"\n},\n{\n"function":"group_and_rank",\n"args":{\n"group_key":[\n"salesorder",\n"item"\n],\n"ranking_key":[\n"extraction_date",\n"changed_on",\n"lhe_row_id"\n]\n}\n},\n{\n"function":"repartition",\n"args":{\n"num_partitions":1\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_silver",\n"input_id":"sales_bronze_with_extraction_date",\n"write_type":"merge",\n"data_format":"delta",\n"location":"file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data",\n"options":{\n"checkpointLocation":"file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/checkpoint"\n},\n"with_batch_id":true,\n"merge_opts":{\n"merge_predicate":"current.salesorder = new.salesorder and current.item = new.item",\n"update_predicate":"new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on",\n"delete_predicate":"new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'"\n}\n}\n]\n}\n
\n
\n\n
Relevant notes:
\n\n
\n
This type of delta load with this type of condensation is useful when the source changelog can be condensed based on dates, instead of technical fields like datapakid, record, record_mode, etc., as we see in SAP BW DSOs. An example of such a system is the Omnihub Tibco orders and deliveries files.
Streaming Delta Load with Late Arriving and Out of Order Events (with and without watermarking)
\n\n
How to Deal with Late Arriving Data without using Watermark
\n\n
This scenario covers a delta load in streaming mode that is able to deal with late arriving and out of order events.\nAs for other cases, the acon configuration should be executed with load_data using:
{\n "input_specs":[\n {\n "spec_id":"sales_source",\n "read_type":"streaming",\n "data_format":"csv",\n "options":{\n "header":true,\n "delimiter":"|"\n },\n "location":"file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data"\n }\n ],\n "transform_specs":[\n {\n "spec_id":"transformed_sales_source",\n "input_id":"sales_source",\n "transformers":[\n {\n "function":"condense_record_mode_cdc",\n "args":{\n "business_key":[\n "salesorder",\n "item"\n ],\n "ranking_key_desc":[\n "extraction_timestamp",\n "actrequest_timestamp",\n "datapakid",\n "partno",\n "record"\n ],\n "record_mode_col":"recordmode",\n "valid_record_modes":[\n "",\n "N",\n "R",\n "D",\n "X"\n ]\n }\n }\n ]\n }\n ],\n "output_specs":[\n {\n "spec_id":"sales_bronze",\n "input_id":"transformed_sales_source",\n "write_type":"merge",\n "data_format":"delta",\n "location":"file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data",\n "options":{\n "checkpointLocation":"file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/checkpoint"\n },\n "merge_opts":{\n "merge_predicate":"current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",\n "update_predicate":"new.extraction_timestamp > current.extraction_timestamp or new.actrequest_timestamp > current.actrequest_timestamp or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid > current.datapakid) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno > current.partno) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno = current.partno and new.record >= current.record)",\n "delete_predicate":"new.recordmode in ('R','D','X')",\n "insert_predicate":"new.recordmode is null or new.recordmode not in ('R','D','X')"\n }\n }\n ],\n "exec_env":{\n "spark.sql.streaming.schemaInference":true\n }\n}\n
\n
\n\n
Relevant notes:
\n\n
\n
The first question we can pose is: do we need such a complicated update predicate to handle late arriving and out of order events? The simple answer is no, because we expect that the latest event (e.g., latest status of a record in the source) will eventually arrive, and therefore the target delta lake table will eventually be consistent. However, when will that happen? Do we want to have our target table inconsistent until the next update comes along? This of course is only relevant when your source cannot ensure the order of the changes and cannot avoid late arriving changes (e.g., some changes that should have come in this changelog extraction will only arrive in the next changelog extraction). From previous experiences, this is not the case with SAP BW, for example (as SAP BW is ACID compliant, it will extract data from an SAP source and only have the updated changelog available when the extraction goes through, so theoretically we should not be able to extract data from the SAP BW changelog while SAP BW is still extracting data).
\n
However, when the source cannot fully ensure ordering (e.g., Kafka) and we want to make sure we don't load temporarily inconsistent data into the target table, we can pay extra special attention, as we do here, to our update and insert predicates, that will enable us to only insert or update data if the new event meets the respective predicates:\n
\n
In this scenario, we will only update if the update_predicate is true, and that long predicate we have here ensures that the change that we are receiving is likely the latest one;
\n
In this scenario, we will only insert the record if it is not marked for deletion. This matters when the new event is a record marked for deletion but the record was not yet in the target table (late arriving changes where the delete came before the insert); without the insert_predicate, the algorithm would still try to insert the row, even if the record_mode indicates that the row is for deletion. By using the insert_predicate above we avoid that. However, even in such a scenario, to prevent the algorithm from inserting the data that arrives later (which is old, as the delete came before the insert and was actually the latest status), we would need an even more complex predicate based on your data's nature. Therefore, please read the disclaimer below.
\n
\n
\n\n
\n\n
Disclaimer! The scenario illustrated in this page is purely fictional, designed for the Lakehouse Engine local tests specifically. Your data source changelogs may be different and the scenario and predicates discussed here may not make sense to you. Consequently, the data product team should reason about the adequate merge predicate and insert, update and delete predicates, that better reflect how they want to handle the delta loads for their data.
\n\n
\n\n
\n
We use spark.sql.streaming.schemaInference in our local tests only. We don't encourage you to use it in your data product.
How to Deal with Late Arriving Data using Watermark
\n\n
When building real-time pipelines, one of the realities that teams have to work with is that distributed data ingestion is inherently unordered. Additionally, in the context of stateful streaming operations, teams need to be able to properly track event time progress in the stream of data they are ingesting for the proper calculation of time-window aggregations and other stateful operations. While working with real-time streaming data there will be delays between event time and processing time due to how data is ingested and whether the overall application experiences issues like downtime. Due to these potential variable delays, the engine that you use to process this data needs to have some mechanism to decide when to close the aggregate windows and produce the aggregate result.
\n\n
Imagine a scenario where we will need to perform stateful aggregations on the streaming data to understand and identify problems in the machines. This is where we need to leverage Structured Streaming and Watermarking to produce the necessary stateful aggregations.
To explain this visually let\u2019s take a scenario where we are receiving data at various times from around 10:50 AM \u2192 11:20 AM. We are creating 10-minute tumbling windows that calculate the average of the temperature and pressure readings that came in during the windowed period.
\n\n
In this first picture, we have the tumbling windows trigger at 11:00 AM, 11:10 AM and 11:20 AM leading to the result tables shown at the respective times. When the second batch of data comes around 11:10 AM with data that has an event time of 10:53 AM this gets incorporated into the temperature and pressure averages calculated for the 11:00 AM \u2192 11:10 AM window that closes at 11:10 AM, which does not give the correct result.
\n\n
Approach 2 - Watermark
\n\n
We can define a watermark that will allow Spark to understand when to close the aggregate window and produce the correct aggregate result. In Structured Streaming applications, we can ensure that all relevant data for the aggregations we want to calculate is collected by using a feature called watermarking. In the most basic sense, by defining a watermark Spark Structured Streaming then knows when it has ingested all data up to some time, T, (based on a set lateness expectation) so that it can close and produce windowed aggregates up to timestamp T.
Unlike the first scenario where Spark will emit the windowed aggregation for the previous ten minutes every ten minutes (i.e. emit the 11:00 AM \u219211:10 AM window at 11:10 AM), Spark now waits to close and output the windowed aggregation once the max event time seen minus the specified watermark is greater than the upper bound of the window.
\n\n
In other words, Spark needed to wait until it saw data points where the latest event time seen minus 10 minutes was greater than 11:00 AM to emit the 10:50 AM \u2192 11:00 AM aggregate window. At 11:00 AM, it does not see this, so it only initialises the aggregate calculation in Spark\u2019s internal state store. At 11:10 AM, this condition is still not met, but we have a new data point for 10:53 AM so the internal state gets updated, just not emitted. Then finally by 11:20 AM Spark has seen a data point with an event time of 11:15 AM and since 11:15 AM minus 10 minutes is 11:05 AM which is later than 11:00 AM the 10:50 AM \u2192 11:00 AM window can be emitted to the result table.
\n\n
This produces the correct result by properly incorporating the data based on the expected lateness defined by the watermark. Once the results are emitted the corresponding state is removed from the state store.
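A minimal Structured Streaming sketch of the scenario above, using a hypothetical rate-based stream standing in for the sensor readings (in practice the source would be Kafka, files, etc.):

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Hypothetical stream of readings with an event_time and two sensor measures.
readings = (
    spark.readStream.format("rate").option("rowsPerSecond", 5).load()
    .select(
        F.col("timestamp").alias("event_time"),
        (F.col("value") % 100).cast("double").alias("temperature"),
        (F.col("value") % 10).cast("double").alias("pressure"),
    )
)

# 10-minute tumbling windows with a 10-minute watermark: a window is only emitted
# once the max event time seen minus 10 minutes passes the window's upper bound.
windowed_averages = (
    readings.withWatermark("event_time", "10 minutes")
    .groupBy(F.window("event_time", "10 minutes"))
    .agg(
        F.avg("temperature").alias("avg_temperature"),
        F.avg("pressure").alias("avg_pressure"),
    )
)

query = (
    windowed_averages.writeStream
    .outputMode("append")  # aggregates are emitted when the watermark closes the window
    .format("console")
    .start()
)
```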
\n\n
Watermarking and Different Output Modes
\n\n
It is important to understand how state, late-arriving records, and the different output modes could lead to different behaviours of your application running on Spark. The main takeaway here is that in both append and update modes, once the watermark indicates that all data is received for an aggregate time window, the engine can trim the window state. In append mode the aggregate is produced only at the closing of the time window plus the watermark delay while in update mode it is produced on every update to the window.
\n\n
Lastly, by increasing your watermark delay window you will cause the pipeline to wait longer for data and potentially drop less data \u2013 higher precision, but also higher latency to produce the aggregates. On the flip side, smaller watermark delay leads to lower precision but also lower latency to produce the aggregates.
\n\n
Watermarks can only be used when you are running your streaming application in append or update output modes. There is a third output mode, complete mode, in which the entire result table is written to storage. This mode cannot be used because it requires all aggregate data to be preserved, and hence cannot use watermarking to drop intermediate state.
\n\n
Joins With Watermark
\n\n
There are three types of stream-stream joins that can be implemented in Structured Streaming: inner, outer, and semi joins. The main problem with doing joins in streaming applications is that you may have an incomplete picture of one side of the join. Giving Spark an understanding of when there are no future matches to expect is similar to the earlier problem with aggregations where Spark needed to understand when there were no new rows to incorporate into the calculation for the aggregation before emitting it.
\n\n
To allow Spark to handle this, we can leverage a combination of watermarks and event-time constraints within the join condition of the stream-stream join. This combination allows Spark to filter out late records and trim the state for the join operation through a time range condition on the join.
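A hedged sketch of such a stream-stream join, with hypothetical impression/click streams and an event-time range condition of one hour:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Hypothetical streams; in practice these would come from Kafka, files, etc.
impressions = (
    spark.readStream.format("rate").load()
    .select(F.col("timestamp").alias("impression_time"), F.col("value").alias("impression_ad_id"))
)
clicks = (
    spark.readStream.format("rate").load()
    .select(F.col("timestamp").alias("click_time"), F.col("value").alias("click_ad_id"))
)

# Watermarks bound how late each side can be; the time range condition in the join
# lets Spark trim the join state for rows that can no longer find a match.
joined = (
    impressions.withWatermark("impression_time", "2 hours")
    .join(
        clicks.withWatermark("click_time", "3 hours"),
        F.expr(
            "click_ad_id = impression_ad_id "
            "AND click_time BETWEEN impression_time AND impression_time + INTERVAL 1 HOUR"
        ),
    )
)

query = joined.writeStream.format("console").start()
```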
\n\n
Spark has a policy for handling multiple watermark definitions. Spark maintains one global watermark that is based on the slowest stream to ensure the highest amount of safety when it comes to not missing data.
\n\n
We can change this behaviour by changing spark.sql.streaming.multipleWatermarkPolicy to max; however, this means that data from the slower stream will be dropped.
\n\n
State Store Performance Considerations
\n\n
As of Spark 3.2, Spark offers a RocksDB state store provider.
\n\n
If you have stateful operations in your streaming query (for example, streaming aggregation, streaming dropDuplicates, stream-stream joins, mapGroupsWithState, or flatMapGroupsWithState) and you want to maintain millions of keys in the state, then you may face issues related to large JVM garbage collection (GC) pauses causing high variations in the micro-batch processing times. This occurs because, in the implementation of HDFSBackedStateStore, the state data is maintained in the JVM memory of the executors, and a large number of state objects puts memory pressure on the JVM, causing high GC pauses.
\n\n
In such cases, you can choose to use a more optimized state management solution based on RocksDB. Rather than keeping the state in the JVM memory, this solution uses RocksDB to efficiently manage the state in the native memory and the local disk. Furthermore, any changes to this state are automatically saved by Structured Streaming to the checkpoint location you have provided, thus providing full fault-tolerance guarantees (the same as default state management).
\n\n
To enable the new built-in state store implementation, set spark.sql.streaming.stateStore.providerClass to org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider.
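For example, this could be set when building the Spark session (or, in an ACON, under exec_env like the other Spark configurations shown on this page):

```python
from pyspark.sql import SparkSession

# Enable the RocksDB-backed state store for the streaming queries of this session.
spark = (
    SparkSession.builder
    .config(
        "spark.sql.streaming.stateStore.providerClass",
        "org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider",
    )
    .getOrCreate()
)
```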
The DataFrame writer can give us some advantages by returning a dictionary containing the spec_id and the computed dataframe.
In these examples we will cover the following scenarios of using the dataframe output format:
Debugging purposes: as we can access any dataframe used in any part of our ACON\nwe can observe what is happening with the computation and identify what might be wrong\nor can be improved.
\n
Flexibility: in case we have some very specific need not covered yet by the lakehouse\nengine capabilities, example: return the Dataframe for further processing like using a machine\nlearning model/prediction.
\n
Simplify ACONs: instead of developing a single complex ACON, using the DataFrame writer we can compose our ACON from the output of another ACON. This allows us to identify and split the notebook logic across ACONs.
\n
\n\n
If you want/need, you can add as many dataframes as you want in the output spec\nreferencing the spec_id you want to add.
\n\n
\n\n
This is not intended to replace the other capabilities offered by the lakehouse-engine, and in case another feature can cover your use case, you should use it instead of the DataFrame writer, as those features are much more extensively tested on different types of operations.
\n\n
Additionally, please always introspect if the problem that you are trying to resolve and for which no lakehouse-engine feature is available, could be a common problem and thus deserve a common solution and feature.
\n\n
Moreover, Dataframe writer is not supported for the streaming trigger\ntypes processing time and continuous.
\n\n
\n\n
1. Write to dataframe: Consuming the output spec as DataFrame
\n\n
Silver Dummy Sales Write to DataFrame
\n\n
In this example we will cover the Dummy Sales write to a result containing the output DataFrame.
\n\n
\n
An ACON is used to read from bronze, apply silver transformations and write to a dictionary\ncontaining the output spec as key and the dataframe as value through the following steps:\n
\n
1 - Definition of how to read data (input data location, read type and data format);
\n
2 - Transformation of data (rename relevant columns);
\n
3 - Write the data to dict containing the dataframe;
\n
\n
\n\n
\n\n
If you try to retrieve the same data more than once using a checkpoint, it will return an empty dataframe with an empty schema, as there is no new data to read.
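A hedged sketch of such an ACON (locations and the rename transformation are placeholders; the dataframe output format and the dictionary access by spec_id are the behaviours described in this section):

```python
from lakehouse_engine.engine import load_data

# Sketch only: read bronze, apply a simple silver transformation and return the result as a DataFrame.
acon = {
    "input_specs": [
        {
            "spec_id": "dummy_sales_bronze",
            "read_type": "batch",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/bronze/dummy_sales",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "dummy_sales_transform",
            "input_id": "dummy_sales_bronze",
            "transformers": [
                {"function": "rename", "args": {"cols": {"item": "item_id"}}}
            ],
        }
    ],
    "output_specs": [
        {
            "spec_id": "dummy_sales_silver_df",
            "input_id": "dummy_sales_transform",
            "data_format": "dataframe",
        }
    ],
}

output = load_data(acon=acon)
silver_df = output.get("dummy_sales_silver_df")
```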
2. Write all dataframes: Consuming all DataFrames generated per specs
\n\n
Silver Dummy Sales Write to DataFrame
\n\n
In this example we will cover the Dummy Sales write to a result containing the specs and related DataFrame.
\n\n
\n
An ACON is used to read from bronze, apply silver transformations and write to a dictionary\ncontaining the spec id as key and the DataFrames as value through the following steps:\n
\n
Definition of how to read data (input data location, read type and data format);
\n
Transformation of data (rename relevant columns);
\n
Write the data to a dictionary containing all the spec ids and DataFrames computed per step;
Run the Load and Return the Dictionary with the related DataFrames by Spec
\n\n
This exploratory test will return a dictionary with all specs and the related dataframe.\nYou can access the DataFrame you need by output.get(<spec_id>) for future developments and tests.
3. Read from and Write to dataframe: Making use of the DataFrame output spec to compose silver data
\n\n
Silver Load Dummy Deliveries
\n\n
In this example we will cover the Dummy Deliveries table read and incremental load to silver composing the silver data to write using the DataFrame output spec:
\n\n
\n
First ACON is used to get the latest data from bronze, in this step we are using more than one output because we will need the bronze data with the latest data in the next step.
\n
Second ACON is used to consume the bronze data and the latest data to perform silver transformation, in this ACON we are using as input the two dataframes computed by the first ACON.
\n
Third ACON is used to write the silver computed data from the previous ACON to the target.
\n
\n\n
\n\n
This example is not a recommendation on how to deal with incremental loads; the ACON was split into 3 for demo purposes.
\n\n
\n\n
Consume bronze data, generate the latest data and return a dictionary with bronze and transformed dataframes:
Consume the previous dataframes generated by the first ACON (bronze and latest bronze data) to generate the silver data. In this acon we are using just one output, because we only need the dataframe from the output for the next step.
The console writer is an interesting feature to debug / validate what has been done in the lakehouse engine. Before moving forward and storing data somewhere, it is possible to show / print the final dataframe to the console, which means it is possible to transform the data as many times as you want and display the final result to validate whether it is as expected.
\n\n
Silver Dummy Sales Write to Console Example
\n\n
In this template we will cover the Dummy Sales write to console. An ACON is used to read from bronze, apply silver transformations and write on console through the following steps:
\n\n\n
Definition of how to read data (input data location, read type and data format);
\n
Transformation of data (rename relevant columns);
\n
Definition of how to print to console (limit, truncate, vertical options);
\n\n\n
For this, the ACON specs are:
\n\n
\n
input_specs (MANDATORY): specify how to read data;
\n
transform_specs (OPTIONAL): specify how to transform data;
\n
output_specs (MANDATORY): specify how to write data to the target.
\n
\n\n
\n\n
The console writer is a wrapper around Spark's show() function; if you want to know more about the function itself or the available options, please check the Spark documentation here.
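A minimal sketch of such an ACON follows (paths and spec ids are illustrative, and the silver transformations are omitted for brevity), with the limit/truncate/vertical options mentioned above passed in the output spec:

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "dummy_sales_bronze",
            "read_type": "batch",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/bronze/dummy_sales",
        }
    ],
    "output_specs": [
        {
            "spec_id": "dummy_sales_console",
            "input_id": "dummy_sales_bronze",
            "data_format": "console",
            # limit/truncate/vertical map to the options of the show() function.
            "options": {"limit": 10, "truncate": False, "vertical": False},
        }
    ],
}

load_data(acon=acon)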
The REST API writer is an interesting feature to send data from Spark to a REST API within the data pipeline context. It uses the Python requests library to execute the REST calls.
\n\n
It is possible to configure a few aspects of the writer, like if the payload should be sent via JSON body or via file, or configure additional JSON body parameters to add to the payload generated via Spark.
\n\n
In the current implementation of the writer, each row will generate a request to the API, so it is important that you prepare your dataframe accordingly (check example below).
\n\n
Silver Dummy Sales Write to REST API Example
\n\n
In this template we will cover the Dummy Sales write to a REST API. An ACON is used to read from bronze, apply silver transformations to prepare the REST API payload and write to the API through the following steps:
\n\n\n
Definition of how to read data (input data location, read type and data format);
\n
Transformation of the data so that we form a payload column for each row. Important note: in the current implementation of the writer, each row will generate a request to the API. create_payload is a custom transformer function that creates a JSON string with the payload to be sent to the API. The column name should be exactly \"payload\", so that the lakehouse engine further processes that column accordingly, in order to correctly write the data to the REST API.
\n
Definition of how to write to a REST API (url, authentication, payload format configuration, ...);
\n\n\n
For this, the ACON specs are:
\n\n
\n
input_specs (MANDATORY): specify how to read data;
\n
transform_specs (MANDATORY): specify how to transform data to prepare the payload;
\n
output_specs (MANDATORY): specify how to write data to the target.
\n
\n\n
\n
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit

from lakehouse_engine.engine import load_data


def create_payload(df: DataFrame) -> DataFrame:
    # Create a "payload" column with the JSON string to be sent to the REST API.
    payload_df = df.withColumn(
        "payload",
        lit('{"just a dummy key": "just a dummy value"}'),
    )

    return payload_df


acon = {
    "input_specs": [
        {
            "spec_id": "dummy_sales_bronze",
            "read_type": "streaming",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/bronze/dummy_sales",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "dummy_sales_transform",
            "input_id": "dummy_sales_bronze",
            "transformers": [
                {
                    "function": "custom_transformation",
                    "args": {
                        "custom_transformer": create_payload,
                    },
                }
            ],
        },
    ],
    "output_specs": [
        {
            "spec_id": "data_to_send_to_api",
            "input_id": "dummy_sales_transform",
            "data_format": "rest_api",
            "options": {
                "rest_api_url": "https://foo.bar.com",
                "rest_api_method": "post",
                "rest_api_basic_auth": {
                    "username": "...",
                    "password": "...",
                },
                # True if the payload is to be sent via JSON file instead of JSON body (application/json).
                "rest_api_is_file_payload": False,
                # Name of the file to be sent in cases where the payload uses file uploads rather than JSON body.
                "rest_api_file_payload_name": "custom_file",
                "rest_api_extra_json_payload": {"x": "y"},
            },
        }
    ],
}

load_data(acon=acon)
The Data Quality framework is based on Great Expectations (GX) and other custom-made \ndevelopments, providing a very light abstraction on top of the GX open source framework and the Spark framework.
\n\n
How to use Data Quality?
\n\n
Data Loader
\n\n
You can define data quality rules inside the DataLoader algorithm that you use to load data.
\n\n
\n\n
The DataLoader algorithm allows you to store the results of the data quality checks inside your custom location\nusing the result_sink options (e.g., a delta table on your data product). Using result sink unlocks the \ncapability to store DQ results having history over all the DQ executions, which can be used for debugging, \nto create DQ dashboards on top of the data, and much more.
\n\n
\n\n
Examples:\nIn these examples, dummy sales local data is used to cover a few example usages of the DQ Framework\n(based on Great Expectations).\nThe main difference between the sample acons is on the usage of dq_specs.
The DQValidator algorithm focuses on validating data (e.g., spark DataFrames, Files or Tables).\nIn contrast to the dq_specs inside the DataLoader algorithm, the DQValidator focuses on validating data at rest \n(post-mortem) instead of validating data in-transit (before it is loaded to the destination).
\n\n
\n\n
The DQValidator algorithm allows you to store the results of the data quality checks inside your custom location\nusing the result_sink options (e.g., a delta table on your data product). Using result sink unlocks the\ncapability to store DQ results having history over all the DQ executions, which can be used for debugging,\nto create DQ dashboards on top of the data, and much more.
Similarly to the Data Quality Validator algorithm, the Reconciliator algorithm focuses on \nvalidating data at rest (post-mortem). In contrast to the DQValidator algorithm, the Reconciliator always compares a \ntruth dataset (e.g., spark DataFrames, Files or Tables) with the current dataset (e.g., spark DataFrames, Files or \nTables), instead of executing DQ rules defined by the teams. \nHere you can find more information regarding reconciliator and examples.
\n\n
\n\n
Reconciliator does not use Great Expectations; therefore, Data Docs, Result Sink and other native features are not available.
\n\n
\n\n
Custom Expectations
\n\n
If your data has a data quality check that cannot be done with the expectations provided by Great Expectations you \ncan create a custom expectation to make this verification.
\n\n
\n\n
Before creating a custom expectation check if there is an expectation already created to address your needs, \nboth in Great Expectations and the Lakehouse Engine.\nAny Custom Expectation that is too specific (using hardcoded table/column names) will be rejected.\nExpectations should be generic by definition.
How to check the results of the Data Quality Process?
\n\n
1. Table/location analysis
\n\n
The possibility to configure a Result Sink allows you to store the history of executions of the DQ process. \nYou can query the table or the location to search through data and analyse history.
\n\n
2. Power BI Dashboard
\n\n
With the information expanded, interactive analysis can be built on top of the history of the DQ process.\nA dashboard can be created with the results that we have in dq_specs. To be able to have this information you \nneed to use arguments result_sink_db_table and/or result_sink_location.
\n\n
Through having a dashboard, the runs and expectations can be analysed, filtered by year, month, source and \nrun name, and you will have information about the number of runs, some statistics, status of expectations and more. \nAnalysis such as biggest failures per expectation type, biggest failures by columns, biggest failures per source, \nand others can be made, using the information in the result_sink_db_table/result_sink_location.
\n\n
\n\n
The recommendation is to use the same result sink table/location for all your dq_specs and \nin the dashboard you will get a preview of the status of all of them.
\n\n
\n\n
\n\n
3. Data Docs Website
\n\n
An auto-generated site that presents all the relevant information can also be used. If you choose to define the parameter data_docs_bucket you will be able to store the GX documentation in the defined bucket, and therefore make your data docs available in the DQ Web App (GX UI), visible to everyone. The data_docs_bucket property supersedes the bucket property only for data docs storage.
"""Expectation to check if column 'a' is lower or equal than column 'b'."""\n\nfromtypingimportAny,Dict,Optional\n\nfromgreat_expectations.coreimportExpectationConfiguration\nfromgreat_expectations.execution_engineimportExecutionEngine,SparkDFExecutionEngine\nfromgreat_expectations.expectations.expectationimportColumnPairMapExpectation\nfromgreat_expectations.expectations.metrics.map_metric_providerimport(\n ColumnPairMapMetricProvider,\n column_pair_condition_partial,\n)\n\nfromlakehouse_engine.utils.expectations_utilsimportvalidate_result\n\n\nclassColumnPairCustom(ColumnPairMapMetricProvider):\n"""Asserts that column 'A' is lower or equal than column 'B'.\n\n Additionally, the 'margin' parameter can be used to add a margin to the\n check between column 'A' and 'B': 'A' <= 'B' + 'margin'.\n """\n\n condition_metric_name="column_pair_values.a_smaller_or_equal_than_b"\n condition_domain_keys=(\n "batch_id",\n "table",\n "column_A",\n "column_B",\n "ignore_row_if",\n )\n condition_value_keys=("margin",)\n\n @column_pair_condition_partial(engine=SparkDFExecutionEngine)\n def_spark(\n self:ColumnPairMapMetricProvider,\n column_A:Any,\n column_B:Any,\n margin:Any,\n **kwargs:dict,\n )->Any:\n"""Implementation of the expectation's logic.\n\n Args:\n column_A: Value of the row of column_A.\n column_B: Value of the row of column_B.\n margin: margin value to be added to column_b.\n kwargs: dict with additional parameters.\n\n Returns:\n If the condition is met.\n """\n ifmarginisNone:\n approx=0\n elifnotisinstance(margin,(int,float,complex)):\n raiseTypeError(\n f"margin must be one of int, float, complex."\n f" Found: {margin} as {type(margin)}"\n )\n else:\n approx=margin# type: ignore\n\n returncolumn_A<=column_B+approx# type: ignore\n\n\nclassExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):\n"""Expect values in column A to be lower or equal than column B.\n\n Args:\n column_A: The first column name.\n column_B: The second column name.\n margin: additional approximation to column B value.\n\n Keyword Args:\n - allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n - ignore_row_if: "both_values_are_missing",\n "either_value_is_missing", "neither" (default).\n - result_format: Which output mode to use:\n `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.\n - include_config: If True (default), then include the expectation config\n as part of the result object.\n - catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. 
Default: False.\n - meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.\n\n Returns:\n An ExpectationSuiteValidationResult.\n """\n\n examples=[\n {\n "dataset_name":"Test Dataset",\n "data":[\n {\n "data":{\n "a":[11,22,50],\n "b":[10,21,100],\n "c":[9,21,30],\n },\n "schemas":{\n "spark":{\n "a":"IntegerType",\n "b":"IntegerType",\n "c":"IntegerType",\n }\n },\n }\n ],\n "tests":[\n {\n "title":"negative_test",\n "exact_match_out":False,\n "include_in_gallery":True,\n "in":{\n "column_A":"a",\n "column_B":"c",\n "result_format":{\n "result_format":"COMPLETE",\n "unexpected_index_column_names":["c"],\n },\n },\n "out":{\n "success":False,\n "unexpected_index_list":[\n {"c":9,"a":11},\n {"c":21,"a":22},\n {"c":30,"a":50},\n ],\n },\n },\n {\n "title":"positive_test",\n "exact_match_out":False,\n "include_in_gallery":True,\n "in":{\n "column_A":"a",\n "column_B":"b",\n "margin":1,\n "result_format":{\n "result_format":"COMPLETE",\n "unexpected_index_column_names":["a"],\n },\n },\n "out":{\n "success":True,\n "unexpected_index_list":[],\n },\n },\n ],\n },\n ]\n\n map_metric="column_pair_values.a_smaller_or_equal_than_b"\n success_keys=(\n "column_A",\n "column_B",\n "ignore_row_if",\n "margin",\n "mostly",\n )\n default_kwarg_values={\n "mostly":1.0,\n "ignore_row_if":"neither",\n "result_format":"BASIC",\n "include_config":True,\n "catch_exceptions":False,\n }\n\n def_validate(\n self,\n configuration:ExpectationConfiguration,\n metrics:Dict,\n runtime_configuration:Optional[dict]=None,\n execution_engine:Optional[ExecutionEngine]=None,\n )->Any:\n"""Custom implementation of the GE _validate method.\n\n This method is used on the tests to validate both the result\n of the tests themselves and if the unexpected index list\n is correctly generated.\n The GE test logic does not do this validation, and thus\n we need to make it manually.\n\n Args:\n configuration: Configuration used in the test.\n metrics: Test result metrics.\n runtime_configuration: Configuration used when running the expectation.\n execution_engine: Execution Engine where the expectation was run.\n\n Returns:\n Dictionary with the result of the validation.\n """\n returnvalidate_result(\n self,\n configuration,\n metrics,\n runtime_configuration,\n execution_engine,\n ColumnPairMapExpectation,\n )\n\n\n"""Mandatory block of code. If it is removed the expectation will not be available."""\nif__name__=="__main__":\n # test the custom expectation with the function `print_diagnostic_checklist()`\n ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()\n
\n
\n\n
Naming Conventions
\n\n
Your expectation's name should start with expect.
\n\n
The name of the file must be the name of the expectation written in snake case. Ex: expect_column_length_match_input_length
\n\n
The name of the class must be the name of the expectation written in camel case. Ex: ExpectColumnLengthMatchInputLength
\n\n
File Structure
\n\n
The file contains two main sections:
\n\n
\n
the definition of the metric that we are tracking (where we define the logic of the expectation);
\n
the definition of the expectation
\n
\n\n
Metric Definition
\n\n
In this section we define the logic of the expectation. This needs to follow a certain structure:
\n\n
Code Structure
\n\n
1) The class you define needs to extend one of the Metric Providers defined by Great Expectations that corresponds \nto your expectation's type. More info on the metric providers.
\n\n
2) You need to define the name of your metric. This name must be unique and must follow the following structure: type of expectation.name of metric. Ex.: column_pair_values.a_smaller_or_equal_than_b. Types of expectations: column_values, multicolumn_values, column_pair_values, table_rows, table_columns.
\n\n
3) Any GX default parameters that are necessary to calculate your metric must be defined as \"condition_domain_keys\".
\n\n
4) Any additional parameters that are necessary to calculate your metric must be defined as \"condition_value_keys\".
\n\n
5) The logic of your expectation must be defined for the SparkDFExecutionEngine in order to be run on the Lakehouse.
\n\n
\n
1) class ColumnPairCustom(ColumnPairMapMetricProvider):
       """Asserts that column 'A' is lower or equal than column 'B'."""

2)     condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"
3)     condition_domain_keys = (
           "batch_id",
           "table",
           "column_A",
           "column_B",
           "ignore_row_if",
       )
4)     condition_value_keys = ("margin",)

5)     @column_pair_condition_partial(engine=SparkDFExecutionEngine)
       def _spark(
           self: ColumnPairMapMetricProvider,
           column_A: Any,
           column_B: Any,
           margin: Any,
           **kwargs: dict,
       ) -> Any:
           """Implementation of the expectation's logic.

           Args:
               column_A: Value of the row of column_A.
               column_B: Value of the row of column_B.
               margin: margin value to be added to column_b.
               kwargs: dict with additional parameters.

           Returns:
               If the condition is met.
           """
           if margin is None:
               approx = 0
           elif not isinstance(margin, (int, float, complex)):
               raise TypeError(
                   f"margin must be one of int, float, complex."
                   f" Found: {margin} as {type(margin)}"
               )
           else:
               approx = margin  # type: ignore

           return column_A <= column_B + approx  # type: ignore
\n
\n\n
Expectation Definition
\n\n
In this section we define the expectation. This needs to follow a certain structure:
\n\n
Code Structure
\n\n
1) The class you define needs to extend one of the Expectations defined by Great Expectations that corresponds to your expectation's type.
\n\n
2) You must define an \"examples\" object where you define at least one success and one failure of your expectation to demonstrate its logic. The result format must be set to COMPLETE, and you must set the unexpected_index_column_names variable.
\n\n
\n\n
For any examples where you will have unexpected results you must define unexpected_index_list in your \"out\" element.\nThis will be validated during the testing phase.
\n\n
\n\n
3) The metric must be the same you defined in the metric definition.
\n\n
4) You must define all additional parameters that the user has to/should provide to the expectation.
\n\n
5) You should define any default values for your expectations parameters.
\n\n
6) You must define the _validate method as shown in the example. You must call the validate_result function inside your _validate method; this process adds a validation of the unexpected index list in the examples.
\n\n
\n\n
If your custom expectation requires any extra validations, or you require additional fields to be returned on the final dataframe, you can add them in this function. The validate_result method has two optional parameters (partial_success and partial_result) that can be used to pass the result of additional validations and to add more information to the result key of the returned dict, respectively.
\n\n
\n\n
\n
1)classExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):\n"""Expect values in column A to be lower or equal than column B.\n\n Args:\n column_A: The first column name.\n column_B: The second column name.\n margin: additional approximation to column B value.\n\n Keyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: "both_values_are_missing",\n "either_value_is_missing", "neither" (default).\n result_format: Which output mode to use:\n `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.\n include_config: If True (default), then include the expectation config\n as part of the result object.\n catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.\n meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.\n\n Returns:\n An ExpectationSuiteValidationResult.\n """\n 2)examples=[\n {\n "dataset_name":"Test Dataset",\n "data":{\n "a":[11,22,50],\n "b":[10,21,100],\n "c":[9,21,30],\n },\n "schemas":{\n "spark":{"a":"IntegerType","b":"IntegerType","c":"IntegerType"}\n },\n "tests":[\n {\n "title":"negative_test",\n "exact_match_out":False,\n "include_in_gallery":True,\n "in":{\n "column_A":"a",\n "column_B":"c",\n "result_format":{\n "result_format":"COMPLETE",\n "unexpected_index_column_names":["c"],\n "include_unexpected_rows":True,\n },\n },\n "out":{\n "success":False,\n "unexpected_index_list":[\n {"c":9,"a":11},\n {"c":21,"a":22},\n {"c":30,"a":50},\n ],\n },\n },\n {\n "title":"positive_test",\n "exact_match_out":False,\n "include_in_gallery":True,\n "in":{\n "column_A":"a",\n "column_B":"b",\n "margin":1,\n "result_format":{\n "result_format":"COMPLETE",\n "unexpected_index_column_names":["a"],\n },\n },\n "out":{"success":True},\n },\n ],\n },\n ]\n\n 3)map_metric="column_values.pattern_match"\n 4)success_keys=(\n "validation_regex",\n "mostly",\n )\n 5)default_kwarg_values={\n "ignore_row_if":"never",\n "result_format":"BASIC",\n "include_config":True,\n "catch_exceptions":False,\n "mostly":1,\n }\n\n 6)def_validate(\n self,\n configuration:ExpectationConfiguration,\n metrics:Dict,\n runtime_configuration:Optional[dict]=None,\n execution_engine:Optional[ExecutionEngine]=None,\n )->dict:\n"""Custom implementation of the GX _validate method.\n\n This method is used on the tests to validate both the result\n of the tests themselves and if the unexpected index list\n is correctly generated.\n The GX test logic does not do this validation, and thus\n we need to make it manually.\n\n Args:\n configuration: Configuration used in the test.\n metrics: Test result metrics.\n runtime_configuration: Configuration used when running the expectation.\n execution_engine: Execution Engine where the expectation was run.\n\n Returns:\n Dictionary with the result of the validation.\n """\n returnvalidate_result(self,configuration,metrics)\n
\n
\n\n
Printing the Expectation Diagnostics
\n\n
Your expectations must include the ability to call the Great Expectations diagnostic function in order to be validated.
\n\n
In order to do this, the following code must be present.
\n\n
\n
"""Mandatory block of code. If it is removed the expectation will not be available."""\nif__name__=="__main__":\n # test the custom expectation with the function `print_diagnostic_checklist()`\n ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()\n
\n
\n\n
Creation Process
\n\n
1) Create a branch from lakehouse engine.
\n\n
2) Create a custom expectation with your specific logic:
\n\n\n
All new expectations must be placed inside folder /lakehouse_engine/dq_processors/custom_expectations.
\n
The name of the expectation must be added to the file /lakehouse_engine/core/definitions.py, to the variable: CUSTOM_EXPECTATION_LIST.
\n
All new expectations must be tested on /tests/feature/custom_expectations/test_custom_expectations.py.\nIn order to create a new test for your custom expectation it is necessary to:
\n\n\n
\n
Copy one of the expectation folders in tests/resources/feature/custom_expectations renaming it to your custom expectation.
\n
Make any necessary changes on the data/schema file present.
\n
On /tests/feature/custom_expectations/test_custom_expectations.py add a scenario to test your expectation; all expectations must be tested in batch and streaming mode. The test is implemented to generate an ACON based on each scenario's data.
\n
Test your developments to check that everything is working as intended.
\n
\n\n
3) When the development is completed, create a pull request with your changes.
\n\n
4) Your expectation will be available with the next release of the lakehouse engine that happens after your pull request is approved. This means that you need to upgrade your version of the lakehouse engine in order to use it.
\n\n
Usage
\n\n
Custom Expectations are available to use like any other expectations provided by Great Expectations.
\n\n
Parameters
\n\n
Depending on the type of expectation you are defining some parameters are expected by default. \nEx: A ColumnMapExpectation has a default \"column\" parameter.
\n\n
Mostly
\n\n
Mostly is a standard parameter for a subset of expectations that is used to define a threshold for the failure of an expectation. Ex: a mostly value of 0.7 makes it so that the expectation only fails if less than 70% of the records pass (i.e., if more than 30% of the records have a negative result).
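For instance, a dq_function using mostly could look like the following sketch (the column name is illustrative):

dq_function = {"function": "expect_column_values_to_not_be_null", "args": {"column": "article", "mostly": 0.7}}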
\n\n
Result Format
\n\n
Great Expectations has several different types of result formats \nfor the expectations results. The lakehouse engine requires the result format to be set to \"COMPLETE\" in order to tag \nthe lines where the expectations failed.
\n\n
unexpected_index_column_names
\n\n
Inside this key you must define what columns are used as an index inside your data. If this is set and the result \nformat is set to \"COMPLETE\" a list with the indexes of the lines that failed the validation will be returned by \nGreat Expectations.\nThis information is used by the Lakehouse Engine to tag the lines in error after the fact. The additional tests \ninside the _validate method verify that the custom expectation is tagging these lines correctly.
The DQValidator algorithm allows DQ Validations isolated from the data load (only read and apply data quality validations). With this algorithm you have the capacity to apply the Lakehouse Engine Data Quality Process, using Great Expectations functions, directly to a specific dataset, also making use of all the InputSpecs available in the engine.
\n\n
Validating Data Quality with this algorithm is a matter of defining the data you want to read and the validations you want to apply to it, detailing the Great Expectations functions you want to apply to the data to assess its quality.
\n\n
\n\n
This algorithm also gives the possibility to restore a previous version of a delta table or delta files in case the DQ\nprocess raises any exception. Please use it carefully!! You may lose important commits and data. Moreover, this will\nhighly depend on the frequency that you run your Data Quality validations. If you run your data loads daily and Data\nQuality validations weekly, and you define the restore_prev_version to true, this means that the table will be restored\nto the previous version, but the error could have happened 4 or 5 versions before.
\n\n
\n\n
When to use?
\n\n
\n
Post-Load validation: check quality of data already loaded to a table/location
\n
Pre-Load validation: check quality of the data you want to load (check DQ by reading a set of files in a specific\nlocation...)
\n
Validation of a DataFrame computed in the notebook itself (e.g. check data quality after joining or filtering\ndatasets, using the computed DataFrame as input for the validation)
\n
\n\n
This algorithm also gives teams some freedom to:
\n\n
\n
Schedule isolated DQ Validations to run periodically, with the frequency they need;
\n
Define a DQ Validation process as an end-to-end test of the respective data product.
\n
\n\n
How to use?
\n\n
All of these configurations are passed via the ACON to instantiate\na DQValidatorSpec object. The DQValidator algorithm uses an\nACON to configure its execution. In DQValidatorSpec you can\nfind the meaning of each ACON property.
On this page you will also find the following examples of usage:
\n\n\n
Dataframe as input & Success on the DQ Validation
\n
Table as input & Failure on DQ Validation & Restore previous version
\n
Files as input & Failure on DQ Validation & Fail_on_error disabled
\n
Files as input & Failure on DQ Validation & Critical functions defined
\n
Files as input & Failure on DQ Validation & Max failure percentage defined
\n\n\n
Example 1 : Dataframe as input & Success on the DQ Validation
\n\n
This example focuses on using a dataframe, computed in this notebook, directly in the input spec. First, a new\nDataFrame is generated as a result of the join of data from two tables (dummy_deliveries and dummy_pd_article) and\nsome DQ Validations are applied on top of this dataframe.
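A possible sketch of this scenario is shown below. It assumes the DataFrame computed in the notebook is passed to the input spec through the df_name property and that the join column is article_id; check DQValidatorSpec/InputSpec for the exact property names before using it.

from lakehouse_engine.engine import execute_dq_validation

# DataFrame computed in the notebook (illustrative join between the two dummy tables).
joined_df = spark.table("my_database.dummy_deliveries").join(
    spark.table("my_database.dummy_pd_article"), "article_id"
)

acon = {
    "input_spec": {
        "spec_id": "deliveries_article_input",
        "read_type": "batch",
        "data_format": "dataframe",
        "df_name": joined_df,  # assumed property name for passing the DataFrame
    },
    "dq_spec": {
        "spec_id": "dq_deliveries_article",
        "input_id": "deliveries_article_input",
        "dq_type": "validator",
        "bucket": "my_data_product_bucket",
        "dq_functions": [
            {"function": "expect_column_to_exist", "args": {"column": "article_id"}},
        ],
    },
}

execute_dq_validation(acon=acon)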
Example 2: Table as input & Failure on DQ Validation & Restore previous version
\n\n
In this example we are using a table as input to validate the data that was loaded. Here, we are forcing the DQ Validations to fail in order to show the possibility of restoring the table to the previous version.
\n\n
\n\n
Be careful when using the feature of restoring a previous version of a delta table or delta files. You may\nlose important commits and data. Moreover, this will highly depend on the frequency that you run your Data Quality\nvalidations. If you run your data loads daily and Data Quality validations weekly, and you define the\nrestore_prev_version to true, this means that the table will be restored to the previous version, but the error\ncould have happened 4 or 5 versions before (because loads are daily, validations are weekly).
\n\n
\n\n
These are the steps followed in this example to show how the restore_prev_version feature works:
\n\n\n
Insert rows into the dummy_deliveries table to adjust the total numbers of rows and make the DQ process fail.
\n
Use the \"DESCRIBE HISTORY\" statement to check the number of versions available on the table and check the version\nnumber resulting from the insertion to the table.
\n
Execute the DQ Validation, using the configured acon (based on reading the dummy_deliveries table and setting the \nrestore_prev_version to true). Checking the logs of the process, you can see that the data did not pass all the \nexpectations defined and that the table version restore process was triggered.
\n
Re-run a \"DESCRIBE HISTORY\" statement to check that the previous version of the table was restored and thus, the row inserted in the beginning of the process is no longer present in the table.
\n\n\n
\n
from lakehouse_engine.engine import execute_dq_validation

# Force failure of data quality by adding a new row
spark.sql("""INSERT INTO my_database.dummy_deliveries VALUES (7, 1, 20180601, 71, "article1", "delivered")""")

# Check the history of the table
spark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")

acon = {
    "input_spec": {
        "spec_id": "deliveries_input",
        "read_type": "batch",
        "db_table": "my_database.dummy_deliveries",
    },
    "dq_spec": {
        "spec_id": "dq_deliveries",
        "input_id": "deliveries_input",
        "dq_type": "validator",
        "bucket": "my_data_product_bucket",
        "data_docs_bucket": "my_dq_data_docs_bucket",
        "data_docs_prefix": "dq/my_data_product/data_docs/site/",
        "tbl_to_derive_pk": "my_database.dummy_deliveries",
        "dq_functions": [
            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 19}},
        ],
    },
    "restore_prev_version": True,
}

execute_dq_validation(acon=acon)

# Check that the previous version of the table was restored
spark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")
\n
\n\n
Example 3: Files as input & Failure on DQ Validation & Fail_on_error disabled
\n\n
In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail, however disabling the \"fail_on_error\" configuration,\nso the algorithm warns about the expectations that failed but the process/the execution of the algorithm doesn't fail.
Example 4: Files as input & Failure on DQ Validation & Critical functions defined
\n\n
In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail by using the critical functions feature, which will throw an error\nif any of the functions fails.
Example 5: Files as input & Failure on DQ Validation & Max failure percentage defined
\n\n
In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail by using the max_percentage_failure,\nwhich will throw an error if the percentage of failures surpasses the defined maximum threshold.
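As a sketch of how these two last scenarios differ from the fail_on_error one, the dq_spec could include options like the following (values and column names are illustrative, and the surrounding ACON is assumed to be similar to the previous examples):

dq_spec = {
    "spec_id": "dq_deliveries",
    "input_id": "deliveries_input",
    "dq_type": "validator",
    "bucket": "my_data_product_bucket",
    # Example 4: stop the execution whenever a critical function fails.
    "critical_functions": [
        {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 19}},
    ],
    # Example 5: stop the execution when more than 20% of the dq_functions fail.
    "max_percentage_failure": 0.2,
    "dq_functions": [
        {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
    ],
}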
Unlike DataLoader, this new DQValidator algorithm only allows, for now, one input_spec (instead of a list of input_specs) and one dq_spec (instead of a list of dq_specs). There are plans and efforts already initiated to also support lists of input_specs and dq_specs. However, you can prepare a DataFrame which joins more than one source and use it as input, in case you need to assess the Data Quality of different sources at the same time. Alternatively, you can also show interest in any enhancement of this feature, as well as contribute yourself.
This scenario illustrates the minimal configuration that you can have to use dq_specs, using the required parameters spec_id, input_id, dq_type, bucket and dq_functions, plus the optional parameter data_docs_bucket. This parameter allows you to store the GX documentation in another bucket that can be used to make your data docs available, in the DQ Web App (GX UI), without giving users access to your bucket. The data_docs_bucket property supersedes the bucket property only for data docs storage.
\n\n
Regarding the dq_functions, it uses 3 functions (retrieved from the expectations supported by GX), which check the following (a minimal sketch follows the list below):
\n\n
\n
expect_column_to_exist - if a column exists in the data;
\n
expect_table_row_count_to_be_between - if the row count of the data is between the defined interval;
\n
expect_table_column_count_to_be_between - if the number of columns in the data is below the max value defined.
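A minimal sketch of such dq_specs inside a DataLoader ACON could look like this (paths, spec ids and argument values are illustrative):

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "dummy_sales_bronze",
            "read_type": "batch",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/bronze/dummy_sales",
        }
    ],
    "dq_specs": [
        {
            "spec_id": "dq_dummy_sales",
            "input_id": "dummy_sales_bronze",
            "dq_type": "validator",
            "bucket": "my_data_product_bucket",
            "data_docs_bucket": "my_dq_data_docs_bucket",
            "dq_functions": [
                {"function": "expect_column_to_exist", "args": {"column": "article"}},
                {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 1, "max_value": 100}},
                {"function": "expect_table_column_count_to_be_between", "args": {"max_value": 10}},
            ],
        }
    ],
    "output_specs": [
        {
            "spec_id": "dummy_sales_silver",
            "input_id": "dq_dummy_sales",
            "write_type": "overwrite",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/silver/dummy_sales",
        }
    ],
}

load_data(acon=acon)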
These scenarios store the results of the dq_specs into a result sink. For that, both scenarios include parameters defining the specific table and location (result_sink_db_table and result_sink_location) where the results are expected to be stored. With this configuration, people can later check the history of the DQ executions using the configured table/location, as shown below. You can configure saving the output of the results in the result sink following two approaches:
\n\n
\n
Denormalized/exploded Data Model (recommended) - the results are stored in a detailed format in which\npeople are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments.
\n
\n\n
\n\n
| ... | source | column | max_value | min_value | expectation_type | expectation_success | observed_value | run_time_year | ... |
|---|---|---|---|---|---|---|---|---|---|
| all columns from raw + more | deliveries | salesorder | null | null | expect_column_to_exist | TRUE | null | 2023 | ... |
| all columns from raw + more | deliveries | null | null | null | expect_table_row_count_to_be_between | TRUE | 23 | 2023 | ... |
| all columns from raw + more | deliveries | null | null | null | expect_table_column_count_to_be_between | TRUE | 6 | 2023 | ... |
\n\n
\n\n
\n
Raw Format Data Model (not recommended) - the results are stored in the raw format that Great\nExpectations outputs. This is not recommended as the data will be highly nested and in a\nstring format (to prevent problems with schema changes), which makes analysis and the creation of a dashboard on top way \nharder.
\n
\n\n
\n\n
| checkpoint_config | run_name | run_time | run_results | success | validation_result_identifier | spec_id | input_id |
|---|---|---|---|---|---|---|---|
| entire configuration | 20230323-...-dq_validation | 2023-03-23T15:11:32.225354+00:00 | results of the 3 expectations | true/false for the run | identifier | spec_id | input_id |
\n\n
\n\n
\n\n
\n
More configurations can be applied in the result sink, such as the file format and partitions.
\n
It is recommended to:\n
\n
Use the same result sink table/location for all dq_specs across different data loads, from different \nsources, in the same Data Product.
\n
Use the parameter source (only available with \"result_sink_explode\": True) in the dq_specs, as used in both scenarios, with the name of the data source, to make it easier to distinguish sources in the analysis. If not specified, the input_id of the dq_spec will be considered as the source.
\n
These recommendations will enable richer analyses/dashboards at the Data Product level, considering all the different sources and data loads that the Data Product has.
\n
\n
\n\n
\n\n
1. Result Sink Exploded (Recommended)
\n\n
This scenario stores the DQ Results (results produced by the execution of the dq_specs) in the Result Sink, in a detailed format, in which people are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments. This is the recommended approach since it makes the analysis on top of the result sink way easier and faster.
\n\n
For achieving the exploded data model, this scenario introduces the parameter result_sink_explode, which is a flag to determine if the output table/location should have the columns exploded (as True) or not (as False). Default: True, but it is still provided explicitly in this scenario for demo purposes. The table/location will include a schema which contains general columns, statistic columns, arguments of expectations, and others; thus, part of the schema will always be filled, while another part will depend on the expectations chosen.
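As a sketch, the relevant result sink parameters of a dq_spec could look like this (table/location names and the rest of the values are illustrative; the dq_spec would sit inside the dq_specs list of the ACON):

dq_spec = {
    "spec_id": "dq_dummy_deliveries",
    "input_id": "dummy_deliveries_bronze",
    "dq_type": "validator",
    "bucket": "my_data_product_bucket",
    "source": "deliveries",
    "result_sink_db_table": "my_database.dq_result_sink",
    "result_sink_location": "s3://my_data_product_bucket/dq/dq_result_sink/",
    "result_sink_explode": True,
    "dq_functions": [
        {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
    ],
}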
To check the history of the DQ results, you can run commands like:
\n\n
\n
the table: display(spark.table(\"my_database.dq_result_sink\"))
\n
the location: display(spark.read.format(\"delta\").load(\"my_dq_path/dq_result_sink/\"))
\n
\n\n
2. Raw Result Sink
\n\n
This scenario is very similar to the previous one, but it changes the parameter result_sink_explode to False so that\nit produces a raw result sink output containing only one row representing the full run of dq_specs (no\nmatter the amount of expectations/dq_functions defined there). Being a raw output, it is not a\nrecommended approach, as it will be more complicated to analyse and make queries on top of it.
Data quality is essential for any organisation that relies on data to make informed decisions. \nHigh-quality data provides accurate, reliable, and timely information that enables organisations to identify\nopportunities, mitigate risks, and optimize their operations. In contrast, low-quality data can lead to incorrect\nconclusions, faulty decisions, and wasted resources.
\n\n
There are several common issues that can compromise data quality, such as:
\n\n
\n
data entry errors;
\n
data duplication;
\n
incomplete / inconsistent data;
\n
changes where data is collected (e.g. sources);
\n
faulty data processing, such as inaccurate data cleansing or transformations.
\n
\n\n
Therefore, implementing data quality controls, such as data validation rules, and regularly monitoring data for \naccuracy and completeness is key for any organisation.
\n\n
One of these controls that can be applied is the DQ Row Tagging Strategy so that you not only apply validations on \nyour data to ensure Data Quality, but you also tag your data with the results of the Data Quality validations \nproviding advantages like:
\n\n
\n
Transparency for downstream and upstream consumers;
\n
Data Observability and Reliability;
\n
More trust over the data;
\n
Anomaly Detection;
\n
Easier and faster discovery of Data Quality problems, and, consequently faster resolution;
\n
Makes it easier to deal with integrations with other systems and migrations (you can have validations capturing that a column was changed or simply disappeared);
\n
\n\n
\n\n
When using the DQ Row Tagging approach data availability will take precedence over Data Quality, meaning \nthat all the data will be introduced into the final target (e.g. table or location) no matter what Data Quality\nissues it is having.
\n\n
\n\n
Different Types of Expectations:
\n\n
\n
Table Level
\n
Column Aggregated Level
\n
Query Level
\n
Column Values (row level)
\n
Column Pair Value (row level)
\n
Multicolumn Values (row level)
\n
\n\n
The expectations highlighted as row level are the ones enabling the tagging of failures on specific rows and adding the details about each failure (they affect the field run_row_result inside dq_validations). The expectations at other levels (not row level) influence the overall result of the Data Quality execution, but won't be used to tag specific rows (they affect the field run_success only, so you can even have situations in which run_success is False and run_row_success is True for all rows).
\n\n
How does the Strategy work?
\n\n
The strategy relies mostly on the six arguments below (illustrated in the sketch further down).
\n\n
\n\n
When you specify \"tag_source_data\": True the arguments fail_on_error, gx_result_format and \nresult_sink_explode are set to the expected values.
\n\n
\n\n
\n
unexpected_rows_pk - the list of columns composing the primary key of the source data, used to identify the rows failing the DQ validations.
\n
tbl_to_derive_pk - db.table from which to automatically derive the unexpected_rows_pk.
\n
gx_result_format - Great Expectations result format. Default: COMPLETE.
\n
tag_source_data - flag to enable the tagging strategy in the source data, adding the information of the DQ results in a column dq_validations. This column makes it possible to identify if the DQ run succeeded in general and, if not, it unlocks the insights to know what specific rows have made the DQ validations fail and why. Default: False.
\n
\n\n
\n\n
It only works if result_sink_explode is True, result_format is COMPLETE and fail_on_error is False.
\n\n
\n\n
\n
fail_on_error - whether to fail the algorithm if the validations of your data in the DQ process failed.
\n
result_sink_explode - flag to determine if the output table/location should have the columns exploded (as True)\nor not (as False). Default: True.
\n
\n\n
\n\n
It is mandatory to provide one of the arguments (unexpected_rows_pk or tbl_to_derive_pk) when using \ntag_source_data as True. \nWhen tag_source_data is False, this is not mandatory, but still recommended.
\n\n
\n\n
\n\n
\n\n
The tagging strategy only works when tag_source_data is True, which automatically\nassigns the expected values for the parameters result_sink_explode (True), fail_on_error (False)\nand gx_result_format (\"COMPLETE\").
\n\n
\n\n
\n\n
For the DQ Row Tagging to work, in addition to configuring the aforementioned arguments in the dq_specs, \nyou will also need to add the dq_validations field into your table (your DDL statements, recommended) or \nenable schema evolution.
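A sketch of a dq_spec using the tagging arguments described above could look like this (primary key columns and the remaining values are illustrative):

dq_spec = {
    "spec_id": "dq_dummy_deliveries",
    "input_id": "deliveries_input",
    "dq_type": "validator",
    "bucket": "my_data_product_bucket",
    "result_sink_db_table": "my_database.dq_result_sink",
    "result_sink_explode": True,
    # Enables the row tagging strategy (fail_on_error, gx_result_format and
    # result_sink_explode are then set to the expected values by the engine).
    "tag_source_data": True,
    # Either provide the primary key explicitly or use tbl_to_derive_pk instead.
    "unexpected_rows_pk": ["salesorder", "item", "date"],
    "dq_functions": [
        {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
    ],
}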
\n\n
\n\n
\n\n
The kwargs field is a string, because it can assume different schemas for different expectations and runs. It is useful to provide the complete picture of the row level failure and to allow filtering/joining with the result sink table, when there is one. Some examples of kwargs below:
\n\n
\n
{\"column\": \"country\", \"min_value\": 1, \"max_value\": 2, \"batch_id\": \"o723491yyr507ho4nf3\"} \u2192 example for \nexpectations starting with expect_column_values (they always make use of \"column\", the other arguments vary).
\n
{\"column_A: \"country\", \"column_B\": \"city\", \"batch_id\": \"o723491yyr507ho4nf3\"} \u2192 example for expectations \nstarting with expect_column_pair (they make use of \"column_A\" and \"column_B\", the other arguments vary).
\n
{\"column_list\": [\"col1\", \"col2\", \"col3\"], \"batch_id\": \"o723491yyr507ho4nf3\"} \u2192 example for expectations \nstarting with expect_multicolumn (they make use of \"column_list\", the other arguments vary).\nbatch_id is common to all expectations, and it is an identifier for the batch of data being validated by\nGreat Expectations.
\n
\n\n
\n\n
Example
\n\n
This scenario uses the row tagging strategy, which allows users to tag the rows that failed, making it easier to identify the problems in the validations.
Running the cell below shows the new column created, named dq_validations, with information about the DQ validations. display(spark.read.format(\"delta\").load(\"s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/\"))
\n\n
Performance and Limitations Trade-offs
\n\n
When using the DQ Row Tagging Strategy, by default we are using Great Expectations Result Format \"Complete\" with \nUnexpected Index Column Names (a primary key for the failures), meaning that for each failure, we are getting all \nthe distinct values for the primary key. After getting all the failures, we are applying some needed transformations \nand joining them with the source data, so that it can be tagged by filling the \"dq_validations\" column.
\n\n
Hence, this can definitely be a heavy and time-consuming operation on your data loads. To reduce this disadvantage \nyou can cache the dataframe by passing the \"cache_df\": True in your DQ Specs. In addition to this, always have in \nmind that each expectation (dq_function) that you add into your DQ Specs, is more time that you are adding into your \ndata loads, so always balance performance vs amount of validations that you need.
\n\n
Moreover, Great Expectations is currently relying on the driver node to capture the results of the execution and \nreturn/store them. Thus, in case you have huge amounts of rows failing (let's say 500k or more) Great Expectations \nmight raise exceptions.
\n\n
In these situations, the data load will still happen and the data will still be tagged with the Data Quality validations information, however you won't have the complete picture of the failures, and the raised_exceptions field is set to True, so that you can easily notice it and debug it.
\n\n
Most of the time, if you have such an amount of rows failing, it will probably mean that you did something wrong and want to fix it as soon as possible (you are not really caring about tagging specific rows, because you will not want your consumers to be consuming a million defective rows). However, if you still want to try to make it pass, you can try to increase your driver and play with some spark configurations like:
\n\n
\n
spark.driver.maxResultSize
\n
spark.task.maxFailures
\n
\n\n
For debugging purposes, you can also use a different Great Expectations Result Format like \"SUMMARY\" (adding in your DQ Spec\n\"gx_result_format\": \"SUMMARY\"), so that you get only a partial list of the failures, avoiding surpassing the driver\ncapacity.
\n\n
\n\n
When using a Result Format different from the default (\"COMPLETE\"), the flag \"tag_source_data\" will be \noverwritten to False, as the results of the tagging wouldn't be complete which could lead to erroneous \nconclusions from stakeholders (but you can always get the details about the result of the DQ execution in\nthe result_sink_location or result_sink_db_table that you have configured).
The scenarios presented on this page are similar, but their goal is to show what happens when a DQ expectation fails the validations.\nThe logs generated by the execution of the code will contain information regarding which expectation(s) have failed and why.
\n\n
1. Fail on Error
\n\n
In this scenario, two parameters are specified below:
\n\n
\n
\"fail_on_error\": False - this parameter is what controls what happens if a DQ expectation fails. In case\nthis is set to true (default), your job will fail/be aborted and an exception will be raised.\nIn case this is set to false, a log message will be printed about the error (as shown in this\nscenario) and the result status will also be available in result sink (if configured) and in the\n[data docs great expectation site](../data_quality.html#3-data-docs-website). On this scenario it is set tofalse` \nto avoid failing the execution of the notebook.
\n
the max_value of the function expect_table_column_count_to_be_between is defined with a specific value so that this expectation fails the validations.
If you run the command below, you will be able to see that the success column has the value false for the last execution. display(spark.table(RENDER_UTILS.render_content(\"my_database.dq_result_sink\")))
\n\n
2. Critical Functions
\n\n
In this scenario, alternative parameters to fail_on_error are used:
\n\n
\n
critical_functions - this parameter defaults to None if not defined.\nIt controls what DQ functions are considered a priority and as such, it stops the validation\nand throws an execution error whenever a function defined as critical doesn't pass the test.\nIf any other function that is not defined in this parameter fails, an error message is printed in the logs.\nThis parameter has priority over fail_on_error.\nIn this specific example, after defining the expect_table_column_count_to_be_between as critical,\nit is made sure that the execution is stopped whenever the conditions for the function are not met.
\n
\n\n
Additionally, other parameters can also be defined, like:
\n\n
\n
max_percentage_failure - this parameter defaults to None if not defined.\nIt controls what percentage of the total functions can fail without stopping the execution of the validation.\nIf the threshold is surpassed the execution stops and a failure error is thrown.\nThis parameter has priority over fail_on_error and critical_functions.
\n
\n\n
You can also pair critical_functions with max_percentage_failure by defining something like a 0.6 max percentage of failure and also defining some critical functions. In this case, even if the threshold is respected, the list defined in critical_functions is still checked.
Checking if data reconciles, using this algorithm, is a matter of reading the truth data and the current data.\nYou can use any input specification compatible with the lakehouse engine to read truth or current data. On top\nof that, you can pass a truth_preprocess_query and a current_preprocess_query so you can preprocess the data before\nit goes into the actual reconciliation process. The reconciliation process is focused on joining truth\nwith current by all provided columns except the ones passed as metrics.
\n\n
In the table below, we present what a simple reconciliation would look like:
\n\n
\n\n
| current_country | current_count | truth_country | truth_count | absolute_diff | perc_diff | yellow | red | recon_type |
|---|---|---|---|---|---|---|---|---|
| Sweden | 123 | Sweden | 120 | 3 | 0.025 | 0.1 | 0.2 | percentage |
| Germany | 2946 | Sweden | 2946 | 0 | 0 | 0.1 | 0.2 | percentage |
| France | 2901 | France | 2901 | 0 | 0 | 0.1 | 0.2 | percentage |
| Belgium | 426 | Belgium | 425 | 1 | 0.002 | 0.1 | 0.2 | percentage |
\n\n
\n\n
The Reconciliator algorithm uses an ACON to configure its execution. You can find the meaning of each ACON property\nin ReconciliatorSpec object.
\n\n
Below is an example of the usage of the reconciliator.
\n\n
\n
from lakehouse_engine.engine import execute_reconciliation

truth_query = """
    SELECT
        shipping_city,
        sum(sales_order_qty) as qty,
        order_date_header
    FROM (
        SELECT
            ROW_NUMBER() OVER (
                PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city
                ORDER BY changed_on desc
            ) as rank1,
            sales_order_header,
            sales_order_item,
            sales_order_qty,
            order_date_header,
            shipping_city
        FROM truth -- truth is a locally accessible temp view created by the lakehouse engine
        WHERE order_date_header = '2021-10-01'
    ) a
    WHERE a.rank1 = 1
    GROUP BY a.shipping_city, a.order_date_header
"""

current_query = """
    SELECT
        shipping_city,
        sum(sales_order_qty) as qty,
        order_date_header
    FROM (
        SELECT
            ROW_NUMBER() OVER (
                PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city
                ORDER BY changed_on desc
            ) as rank1,
            sales_order_header,
            sales_order_item,
            sales_order_qty,
            order_date_header,
            shipping_city
        FROM current -- current is a locally accessible temp view created by the lakehouse engine
        WHERE order_date_header = '2021-10-01'
    ) a
    WHERE a.rank1 = 1
    GROUP BY a.shipping_city, a.order_date_header
"""

acon = {
    "metrics": [{"metric": "qty", "type": "percentage", "aggregation": "avg", "yellow": 0.05, "red": 0.1}],
    "truth_input_spec": {
        "spec_id": "truth",
        "read_type": "batch",
        "data_format": "csv",
        "schema_path": "s3://my_data_product_bucket/artefacts/metadata/schemas/bronze/orders.json",
        "options": {
            "delimiter": "^",
            "dateFormat": "yyyyMMdd",
        },
        "location": "s3://my_data_product_bucket/bronze/orders",
    },
    "truth_preprocess_query": truth_query,
    "current_input_spec": {
        "spec_id": "current",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.orders",
    },
    "current_preprocess_query": current_query,
}

execute_reconciliation(acon=acon)
The lakehouse engine sensors are an abstraction to otherwise complex spark code that can be executed in very small\nsingle-node clusters to check if an upstream system or data product contains new data since the last execution of our\njob. With this feature, we can trigger a job to run in more frequent intervals and if the upstream does not contain new\ndata, then the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction,\nTransformation, and Loading).
\n\n
How do Sensor-based jobs work?
\n\n
\n\n
With the sensors capability, data products in the lakehouse can sense if another data product or an upstream system (source\nsystem) have new data since the last successful job. We accomplish this through the approach illustrated above, which\ncan be interpreted as follows:
\n\n\n
A Data Product can check if Kafka, JDBC or any other source supported by the Lakehouse Engine Sensors contains new data, using the respective sensors;
\n
The Sensor task may run in a very tiny single-node cluster to ensure cost\nefficiency (check sensor cost efficiency);
\n
If the sensor has recognised that there is new data in the upstream, then you can start a different ETL Job Cluster\nto process all the ETL tasks (data processing tasks).
\n
In the same way, a different Data Product can sense if an upstream Data Product has new data by using one of two options:
(Preferred) Sense the upstream Data Product sensor control delta table;
\n
Sense the upstream Data Product data files in s3 (files sensor) or any of their delta tables (delta table\nsensor);
\n
\n\n\n
The Structure and Relevance of the Data Product\u2019s Sensors Control Table
\n\n
The concept of a lakehouse engine sensor is based on a special delta table stored inside the data product that chooses\nto opt in for a sensor-based job. That table is used to control the status of the various sensors implemented by that\ndata product. You can refer to the below table to understand the sensor delta table structure:
\n\n
\n\n
| Column Name | Type | Description |
|---|---|---|
| sensor_id | STRING | A unique identifier of the sensor in a specific job. This unique identifier is really important because it is used by the engine to identify if there is new data in the upstream. Each sensor in each job should have a different sensor_id. If you attempt to create 2 sensors with the same sensor_id, the engine will fail. |
| assets | ARRAY<STRING> | A list of assets (e.g., tables or dataset folders) that are considered as available to consume downstream after the sensor has status PROCESSED_NEW_DATA. |
| status | STRING | Status of the sensor. Can either be: ACQUIRED_NEW_DATA – when the sensor in a job has recognised that there is new data from the upstream but the job where the sensor is located was still not successfully executed; PROCESSED_NEW_DATA – when the job where the sensor is located has processed all the tasks in that job. |
| status_change_timestamp | STRING | Timestamp when the status has changed for the last time. |
| checkpoint_location | STRING | Base location of the Spark streaming checkpoint location, when applicable (i.e., when the type of sensor uses Spark streaming checkpoints to identify if the upstream has new data). E.g., Spark streaming checkpoints are used for Kafka, Delta and File sensors. |
| upstream_key | STRING | Upstream key (e.g., used to store an attribute name from the upstream so that new data can be detected automatically). This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database). |
| upstream_value | STRING | Upstream value (e.g., used to store the max attribute value from the upstream so that new data can be detected automatically). This is the value for upstream_key. This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the value of a field in the JDBC upstream that contains the maximum value that was processed by the sensor, and is therefore useful for recognizing that there is new data in the upstream (e.g., the value of a timestamp attribute in the upstream that tells us when the record was loaded into the database). |
\n\n
\n\n
Note: to make use of the sensors you will need to add this table to your data product.
\n\n
How is it different from scheduled jobs?
\n\n
Sensor-based jobs are still scheduled, but they can be scheduled with higher frequency, as they are more cost-efficient\nthan ramping up a multi-node cluster supposed to do heavy ETL, only to figure out that the upstream does not have new\ndata.
\n\n
Are sensor-based jobs cost-efficient?
\n\n
For the same schedule (e.g., 4 times a day), sensor-based jobs are more cost-efficient than scheduling a regular job, because with sensor-based jobs you can start a very tiny single-node cluster, and only if there is new data in the upstream the bigger ETL cluster is spin up. For this reason, they are considered more cost-efficient.\nMoreover, if you have very hard SLAs to comply with, you can also play with alternative architectures where you can have several sensors in a continuous (always running) cluster, which then keeps triggering the respective data processing jobs, whenever there is new data.
\n\n
Sensor Steps
\n\n\n
Create your sensor task for the upstream source. Examples of available sources:
This shows how to create a Sensor to detect new data from a Delta Table.
\n\n
Configuration required to have a Sensor
\n\n
\n
sensor_id: A unique identifier of the sensor in a specific job.
\n
assets: List of assets considered for the sensor, which are considered available once the sensor detects new data and the status is ACQUIRED_NEW_DATA.
\n
control_db_table_name: Name of the sensor control table.
\n
input_spec: Input spec with the upstream source.
\n
preprocess_query: Query to filter data returned by the upstream.
\n
\n\n
\n\n
This parameter is only needed when the upstream data has to be filtered; in that case, a custom query should be created with the source table referenced as sensor_new_data.
Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.
Unlike the delta_table sensor, it makes use of generate_sensor_query to generate the preprocess_query.
\n\n
Data from another sensor delta table will be consumed in streaming mode. If there is any new data, it will trigger the condition to proceed to the next task.
\n\n
fail_on_empty_result as True (default and SUGGESTED)
Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.
Using these sensors and consuming the data in streaming mode, any new file added to the file location will automatically trigger the subsequent task.
\n\n
fail_on_empty_result as True (default and SUGGESTED)
\n\n
\n
from lakehouse_engine.engine import execute_sensor

acon = {
    "sensor_id": "MY_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "csv",  # You can use any of the data formats supported by the lakehouse engine, e.g.: "avro|json|parquet|csv|delta|cloudfiles"
        "location": "s3://my_data_product_bucket/path",
    },
    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
    "fail_on_empty_result": True,
}

execute_sensor(acon=acon)
\n
\n\n
fail_on_empty_result as False
\n\n
Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.
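A minimal sketch of this pattern, reusing the shape of the ACON above (asset, table and path names are placeholders; the orchestration around the boolean is up to you):

from lakehouse_engine.engine import execute_sensor

acon = {
    "sensor_id": "MY_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "delta",
        "db_table": "my_database.my_upstream_table",
    },
    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
    "fail_on_empty_result": False,  # return a boolean instead of raising
}

acquired_new_data = execute_sensor(acon=acon)

if acquired_new_data:
    # Placeholder: trigger the downstream ETL task through your own orchestration.
    print("New data acquired, proceeding to the next task.")
else:
    print("No new data in the upstream, skipping downstream tasks.")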
This shows how to create a Sensor to detect new data from a JDBC table.
\n\n
Configuration required to have a Sensor
\n\n
\n
jdbc_args: Arguments of the JDBC upstream.
\n
generate_sensor_query: Generates a sensor query to consume data from the upstream. This function can be used in the preprocess_query ACON option (see the sketch below).
\n
sensor_id: The unique identifier for the Sensor.
\n
filter_exp: Expression to filter incoming new data. The placeholders ?upstream_key and ?upstream_value can be used, e.g., ?upstream_key > ?upstream_value, so that they are replaced by the respective values from the sensor control_db_table_name for this specific sensor_id.
\n
control_db_table_name: Sensor control table name.
\n
upstream_key: The key of custom sensor information used to control how to identify new data from the upstream (e.g., a time column in the upstream).
upstream_value: The first upstream value used to identify new data from the upstream (e.g., the value of a time column present in the upstream). Note: this parameter only has effect in the first run, to detect whether the upstream has new data. If it's empty, the default value applied is -2147483647.
upstream_table_name: Table name from which to consume the upstream value. If it's empty, the default value applied is sensor_new_data.
\n
\n
\n\n
If you want to know more please visit the definition of the class here.
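A minimal sketch of a JDBC sensor ACON combining these parameters, assuming generate_sensor_query is exposed in lakehouse_engine.engine alongside execute_sensor; the connection details, table names and the upstream_key column are placeholders:

from lakehouse_engine.engine import execute_sensor, generate_sensor_query

acon = {
    "sensor_id": "MY_JDBC_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "batch",
        "data_format": "jdbc",
        "jdbc_args": {
            # Placeholder JDBC connection arguments for the upstream database.
            "url": "jdbc:postgresql://my_host:5432/my_db",
            "table": "my_upstream_schema.my_upstream_table",
            "properties": {"user": "my_user", "password": "my_password"},
        },
    },
    "preprocess_query": generate_sensor_query(
        sensor_id="MY_JDBC_SENSOR_ID",
        filter_exp="?upstream_key > ?upstream_value",
        control_db_table_name="my_database.lakehouse_engine_sensors",
        upstream_key="load_timestamp",  # hypothetical timestamp column in the upstream
    ),
    "fail_on_empty_result": True,
}

execute_sensor(acon=acon)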
\n\n
Scenarios
\n\n
This covers the following scenarios of using the Sensor:
Data from JDBC will be consumed in batch mode. If there is new data, based on the preprocess query over the source table, it will trigger the condition to proceed to the next task.
\n\n
fail_on_empty_result as True (default and SUGGESTED)
Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.
Data from Kafka will be consumed in streaming mode, so if there is any new data in the Kafka topic it will trigger the condition to proceed to the next task.
\n\n
fail_on_empty_result as True (default and SUGGESTED)
Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.
This shows how to create a Sensor to detect new data from a SAP LOGCHAIN table.
\n\n
Configuration required to have a Sensor
\n\n
\n
sensor_id: A unique identifier of the sensor in a specific job.
\n
assets: List of assets considered for the sensor, which are considered available once the sensor detects new data and the status is ACQUIRED_NEW_DATA.
\n
control_db_table_name: Name of the sensor control table.
\n
input_spec: Input spec with the upstream source.
\n
preprocess_query: Query to filter data returned by the upstream.
\n
\n\n
\n\n
This parameter is only needed when the upstream data has to be filtered; in that case, a custom query should be created with the source table referenced as sensor_new_data.
\n\n
\n\n
\n
base_checkpoint_location: Spark streaming checkpoints to identify if the upstream has new data.
\n
fail_on_empty_result: Flag indicating whether it should raise NoNewDataException when no new data is detected from the upstream.
\n
\n\n
Specific configuration is required to have a Sensor consuming a SAP BW/B4 upstream. The Lakehouse Engine provides two utility functions to make it easier to consume SAP as an upstream: generate_sensor_sap_logchain_query and generate_sensor_query (see the sketch after this section).
\n\n
\n
generate_sensor_sap_logchain_query: This function creates a temporary table with the timestamp from the SAP LOGCHAIN table, which is a process control table.

Note: this temporary table only lives during runtime; it is related to the SAP process control table but has no relationship with, or effect on, the sensor control table.
\n\n
\n
chain_id: SAP Chain ID process.
\n
dbtable: SAP LOGCHAIN db table name, default: my_database.RSPCLOGCHAIN.
\n
status: SAP Chain Status of your process, default: G.
\n
engine_table_name: Name of the temporary table created from the upstream data (default: sensor_new_data). This temporary table will be used as the source in the query option.
\n
\n
generate_sensor_query: Generates a Sensor query to consume data from the temporary table created in the prepareQuery.
\n\n
\n
sensor_id: The unique identifier for the Sensor.
\n
filter_exp: Expression to filter incoming new data. The placeholders ?upstream_key and ?upstream_value can be used, e.g., ?upstream_key > ?upstream_value, so that they are replaced by the respective values from the sensor control_db_table_name for this specific sensor_id.
\n
control_db_table_name: Sensor control table name.
\n
upstream_key: The key of custom sensor information used to control how to identify new data from the upstream (e.g., a time column in the upstream).
upstream_value: The first upstream value used to identify new data from the upstream (e.g., the value of a time column present in the upstream).
\n\n
\n\n
This parameter only has effect in the first run, to detect whether the upstream has new data. If it's empty, the default value applied is -2147483647.
\n\n
\n
upstream_table_name: Table name from which to consume the upstream value. If it's empty, the default value applied is sensor_new_data.
\n\n
\n\n
When using generate_sensor_sap_logchain_query, the default value for the temporary table is sensor_new_data, so if you pass a different value in engine_table_name, this parameter should have the same value.
\n\n
\n
\n
\n\n
If you want to know more please visit the definition of the class here.
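A minimal sketch of how these two helpers could be combined, assuming both are exposed in lakehouse_engine.engine like execute_sensor above; the chain id, table names and the upstream_key column are placeholders. The prepare_query would typically be supplied through the SAP JDBC options of the sensor's input_spec, while the preprocess_query goes into the ACON's preprocess_query option.

from lakehouse_engine.engine import generate_sensor_query, generate_sensor_sap_logchain_query

# Query over the SAP process control table (RSPCLOGCHAIN), exposed as a
# runtime-only temporary table named sensor_new_data.
prepare_query = generate_sensor_sap_logchain_query(
    chain_id="MY_PROCESS_CHAIN_ID",
    dbtable="my_database.RSPCLOGCHAIN",
    status="G",
    engine_table_name="sensor_new_data",
)

# Preprocess query filtering that temporary table for values newer than the
# upstream_value stored in the sensor control table for this sensor_id.
preprocess_query = generate_sensor_query(
    sensor_id="MY_SAP_SENSOR_ID",
    filter_exp="?upstream_key > ?upstream_value",
    control_db_table_name="my_database.lakehouse_engine_sensors",
    upstream_key="timestamp",  # hypothetical column produced by the logchain query
)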
\n\n
Scenarios
\n\n
This covers the following scenarios of using the Sensor:
Using fail_on_empty_result=False, the execute_sensor function returns a boolean indicating whether it has acquired new data. This value can be used to decide whether to execute the next steps.
Load data using an algorithm configuration (ACON represented as dict).
\n\n
This algorithm focuses on the cases where users specify all the algorithm steps and configurations through a dict-based configuration, which we name ACON in our framework.

Since an ACON is a dict, you can pass a custom transformer through a Python function; therefore, the DataLoader can also be used to load data with custom transformations not provided in our transformers package.

As the algorithm base class of the lakehouse-engine framework is based on the concept of ACON, this DataLoader algorithm simply inherits from Algorithm, without overriding anything. We designed the codebase like this to avoid instantiating the Algorithm class directly, which was always meant to be an abstraction for any specific algorithm included in the lakehouse-engine framework.
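A minimal sketch of such a dict-based ACON passed to load_data (paths, table names and the rename transformer arguments are placeholders; the transformer name mirrors the cols/escape_col_names parameters documented further below, so confirm it against your engine version):

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "sales_source",
            "read_type": "batch",
            "data_format": "csv",
            "options": {"header": True, "delimiter": ";"},
            "location": "s3://my_data_product_bucket/raw/sales/",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "sales_transformed",
            "input_id": "sales_source",
            "transformers": [
                {"function": "rename", "args": {"cols": {"OLD_COL": "new_col"}}}
            ],
        }
    ],
    "output_specs": [
        {
            "spec_id": "sales_bronze",
            "input_id": "sales_transformed",
            "write_type": "append",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/bronze/sales/",
        }
    ],
}

load_data(acon=acon)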
If there isn't a transformation specification, this step will be skipped and the original dataframes that were read will be returned. Transformations can depend on the result of another transformation; however, keep in mind that if we are using a streaming source and for some reason we need to enable micro batch processing, that result cannot be used as input to another transformation. Micro batch processing in PySpark streaming is only available in .write(), which means a transformation with micro batch processing needs to be at the end of the process.
\n\n
Arguments:
\n\n
\n
data: input dataframes in an ordered dict.
\n
\n\n
Returns:
\n\n
\n
Another ordered dict with the transformed dataframes, according to the\n transformation specification.
Process the data quality tasks for the data that was read and/or transformed.
\n\n
It supports multiple input dataframes, although just one is advisable.
\n\n
It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.
\n\n
Arguments:
\n\n
\n
data: dataframes from previous steps of the algorithm on which we wish to run the DQ process.
\n
\n\n
Returns:
\n\n
\n
Another ordered dict with the validated dataframes.
Write the data that was read and transformed (if applicable).
\n\n
It supports writing multiple datasets. However, we recommend writing only one dataframe, for easier debugging and reproducibility: mixing several datasets fueled by the same algorithm would create reproducibility issues, plus tight coupling and dependencies between datasets. Having said that, there may be cases where writing multiple datasets is desirable according to the use case requirements; use it accordingly.
\n\n
Arguments:
\n\n
\n
data: dataframes that were read and transformed (if applicable).
Validate data using an algorithm configuration (ACON represented as dict).
\n\n
This algorithm focuses on isolating Data Quality Validations from loading, applying a set of data quality functions to a specific input dataset without the need to define any output specification. You can use any input specification compatible with the lakehouse engine (dataframe, table, files, etc.).
A data quality validator needs the following specifications to work properly:
- input specification (mandatory): specify how and what data to read.
- data quality specification (mandatory): specify how to execute the data quality process.
- restore_prev_version (optional): specify if, having delta table/files as input, they should be restored to the previous version if the data quality process fails. Note: this is only considered if fail_on_error is kept as True.
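A minimal sketch of a DQ validator ACON built from these specifications. The table, bucket and column names are placeholders, and the execute_dq_validation entry point is an assumption to confirm against the engine's public API.

from lakehouse_engine.engine import execute_dq_validation  # entry point name assumed

acon = {
    "input_spec": {
        "spec_id": "sales_to_validate",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.my_sales_table",
    },
    "dq_spec": {
        "spec_id": "sales_dq",
        "input_id": "sales_to_validate",
        "dq_type": "validator",
        "store_backend": "s3",
        "bucket": "my_dq_bucket",  # placeholder bucket for DQ artefacts
        "dq_functions": [
            {"function": "expect_column_values_to_not_be_null", "args": {"column": "sales_id"}},
        ],
    },
    "restore_prev_version": True,
}

execute_dq_validation(acon=acon)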
Process the data quality tasks for the data that was read.
\n\n
It supports a single input dataframe.
\n\n
It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.
\n\n
Arguments:
\n\n
\n
data: input dataframe on which to run the DQ process.
Class to define the behavior of an algorithm that checks if data reconciles.
\n\n
Checking if data reconciles, using this algorithm, is a matter of reading the\n'truth' data and the 'current' data. You can use any input specification compatible\nwith the lakehouse engine to read 'truth' or 'current' data. On top of that, you\ncan pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can\npreprocess the data before it goes into the actual reconciliation process.\nMoreover, you can use the 'truth_preprocess_query_args' and\n'current_preprocess_query_args' to pass additional arguments to be used to apply\nadditional operations on top of the dataframe, resulting from the previous steps.\nWith these arguments you can apply additional operations like caching or persisting\nthe Dataframe. The way to pass the additional arguments for the operations is\nsimilar to the TransformSpec, but only a few operations are allowed. Those are\ndefined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.
\n\n
The reconciliation process is focused on joining 'truth' with 'current' by all\nprovided columns except the ones passed as 'metrics'. After that it calculates the\ndifferences in the metrics attributes (either percentage or absolute difference).\nFinally, it aggregates the differences, using the supplied aggregation function\n(e.g., sum, avg, min, max, etc).
\n\n
All of these configurations are passed via the ACON to instantiate a\nReconciliatorSpec object.
\n\n
\n\n
It is crucial that both the current and truth datasets have exactly the same\nstructure.
\n\n
\n\n
\n\n
You should not use 0 as yellow or red threshold, as the algorithm will verify\nif the difference between the truth and current values is bigger\nor equal than those thresholds.
\n\n
\n\n
\n\n
The reconciliation does not produce any negative values or percentages, as we\nuse the absolute value of the differences. This means that the recon result\nwill not indicate if it was the current values that were bigger or smaller\nthan the truth values, or vice versa.
Definitions for collection of Lakehouse Engine Stats.
\n\n
\n\n
Note: whenever the value comes from a key inside a Spark Config\nthat returns an array, it can be specified with a '#' so that it\nis adequately processed.
This is very aligned with the way the execution environment connects to the sources\n(e.g., spark sources).
\n\n
\n
spec_id: spec_id of the input specification.
read_type: type of read operation (ReadType).
\n
data_format: format of the input.
\n
sftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp\ndirectory.
\n
df_name: dataframe name.
\n
db_table: table name in the form of <db>.<table>.
\n
location: uri that identifies from where to read data in the specified format.
\n
enforce_schema_from_table: if we want to enforce the table schema or not, by\nproviding a table name in the form of <db>.<table>.
\n
query: sql query to execute and return the dataframe. Use it if you do not want to\nread from a file system nor from a table, but rather from a sql query instead.
\n
schema: dict representation of a schema of the input (e.g., Spark struct type\nschema).
\n
schema_path: path to a file with a representation of a schema of the input (e.g.,\nSpark struct type schema).
\n
disable_dbfs_retry: optional flag to disable file storage dbfs.
\n
with_filepath: if we want to include the path of the file that is being read. Only\nworks with the file reader (batch and streaming modes are supported).
\n
options: dict with other relevant options according to the execution\nenvironment (e.g., spark) possible sources.
\n
calculate_upper_bound: when to calculate upper bound to extract from SAP BW\nor not.
\n
calc_upper_bound_schema: specific schema for the calculated upper_bound.
\n
generate_predicates: when to generate predicates to extract from SAP BW or not.
\n
predicates_add_null: if we want to include is null on partition by predicates.
\n
temp_view: optional name of a view to point to the input dataframe to be used\nto create or replace a temp view on top of the dataframe.
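As a small illustration of these parameters, a sketch of an input specification that reads through a SQL query instead of a file or table (the spec_id and query are placeholders):

# An input spec entry for the input_specs list of an ACON.
input_spec = {
    "spec_id": "orders_filtered",
    "read_type": "batch",
    "data_format": "sql",
    "query": "SELECT order_id, amount FROM my_database.orders WHERE order_date >= '2024-01-01'",
}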
Transformer Specification, i.e., a single transformation amongst many.
\n\n
\n
function: name of the function (or callable function) to be executed.
\n
args: (not applicable if using a callable function) dict with the arguments\nto pass to the function <k,v> pairs with the name of the parameter of\nthe function and the respective value.
I.e., the specification that defines the many transformations to be done to the data\nthat was read.
\n\n
\n
spec_id: id of the transform specification.
input_id: id of the corresponding input specification.
\n
transformers: list of transformers to execute.
\n
force_streaming_foreach_batch_processing: sometimes, when using streaming, we want\nto force the transform to be executed in the foreachBatch function to ensure\nnon-supported streaming operations can be properly executed.
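A sketch of a transform specification using these parameters (the ids are placeholders; the repartition transformer name is inferred from the repartitioning parameters documented further below and should be confirmed against your engine version):

# A transform spec entry for the transform_specs list of an ACON.
transform_spec = {
    "spec_id": "orders_transformed",
    "input_id": "orders_filtered",
    "transformers": [
        {"function": "repartition", "args": {"num_partitions": 10}},
    ],
    "force_streaming_foreach_batch_processing": False,
}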
dq_type - type of DQ process to execute (e.g. validator).
\n
dq_functions - list of function specifications to execute.
\n
dq_db_table - name of table to derive the dq functions from.
\n
dq_table_table_filter - name of the table which rules are to be applied in the\nvalidations (Only used when deriving dq functions).
\n
dq_table_extra_filters - extra filters to be used when deriving dq functions.\nThis is a sql expression to be applied to the dq_db_table.
\n
execution_point - execution point of the dq functions. [at_rest, in_motion].\nThis is set during the load_data or dq_validator functions.
\n
unexpected_rows_pk - the list of columns composing the primary key of the\nsource data to identify the rows failing the DQ validations. Note: only one\nof tbl_to_derive_pk or unexpected_rows_pk arguments need to be provided. It\nis mandatory to provide one of these arguments when using tag_source_data\nas True. When tag_source_data is False, this is not mandatory, but still\nrecommended.
\n
tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from. Note: only one of the tbl_to_derive_pk or unexpected_rows_pk arguments needs to be provided. It is mandatory to provide one of these arguments when using tag_source_data as True. When tag_source_data is False, this is not mandatory, but still recommended.
\n
gx_result_format - great expectations result format. Default: \"COMPLETE\".
\n
tag_source_data - when set to true, this will ensure that the DQ process ends by\ntagging the source data with an additional column with information about the\nDQ results. This column makes it possible to identify if the DQ run was\nsucceeded in general and, if not, it unlocks the insights to know what\nspecific rows have made the DQ validations fail and why. Default: False.\nNote: it only works if result_sink_explode is True, gx_result_format is\nCOMPLETE, fail_on_error is False (which is done automatically when\nyou specify tag_source_data as True) and tbl_to_derive_pk or\nunexpected_rows_pk is configured.
\n
store_backend - which store_backend to use (e.g. s3 or file_system).
\n
local_fs_root_dir - path of the root directory. Note: only applicable for\nstore_backend file_system.
\n
data_docs_local_fs - the path for data docs only for store_backend\nfile_system.
\n
bucket - the bucket name to consider for the store_backend (store DQ artefacts).\nNote: only applicable for store_backend s3.
\n
data_docs_bucket - the bucket name for data docs only. When defined, it will\nsupersede bucket parameter. Note: only applicable for store_backend s3.
\n
expectations_store_prefix - prefix where to store expectations' data. Note: only\napplicable for store_backend s3.
\n
validations_store_prefix - prefix where to store validations' data. Note: only\napplicable for store_backend s3.
\n
data_docs_prefix - prefix where to store data_docs' data.
\n
checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only\napplicable for store_backend s3.
\n
data_asset_name - name of the data asset to consider when configuring the great\nexpectations' data source.
\n
expectation_suite_name - name to consider for great expectations' suite.
\n
result_sink_db_table - db.table_name indicating the database and table in which\nto save the results of the DQ process.
\n
result_sink_location - file system location in which to save the results of the\nDQ process.
\n
data_product_name - name of the data product.
\n
result_sink_partitions - the list of partitions to consider.
\n
result_sink_format - format of the result table (e.g. delta, parquet, kafka...).
\n
result_sink_options - extra spark options for configuring the result sink.\nE.g: can be used to configure a Kafka sink if result_sink_format is kafka.
\n
result_sink_explode - flag to determine if the output table/location should have\nthe columns exploded (as True) or not (as False). Default: True.
\n
result_sink_extra_columns - list of extra columns to be exploded (following\nthe pattern \".*\") or columns to be selected. It is only used when\nresult_sink_explode is set to True.
\n
source - name of the data source, to make it easier to identify in analysis. If not specified, it is set as default. This will only be used when result_sink_explode is set to True.
\n
fail_on_error - whether to fail the algorithm if the validations of your data in\nthe DQ process failed.
\n
cache_df - whether to cache the dataframe before running the DQ process or not.
\n
critical_functions - functions that should not fail. When this argument is\ndefined, fail_on_error is nullified.
\n
max_percentage_failure - percentage of failure that should be allowed.\nThis argument has priority over both fail_on_error and critical_functions.
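A sketch of a dq_specs entry combining a subset of the parameters above (bucket, table and column names are placeholders):

# A DQ spec entry for the dq_specs list of an ACON.
dq_spec = {
    "spec_id": "orders_dq",
    "input_id": "orders_transformed",
    "dq_type": "validator",
    "store_backend": "s3",
    "bucket": "my_dq_bucket",
    "result_sink_db_table": "my_database.dq_result_sink",
    "result_sink_explode": True,
    "fail_on_error": False,
    "tag_source_data": True,
    "unexpected_rows_pk": ["order_id"],
    "dq_functions": [
        {"function": "expect_column_values_to_not_be_null", "args": {"column": "order_id"}},
        {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 1}},
    ],
}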
merge_predicate: predicate to apply to the merge operation so that we can\ncheck if a new record corresponds to a record already included in the\nhistorical data.
\n
insert_only: indicates if the merge should only insert data (e.g., deduplicate\nscenarios).
\n
delete_predicate: predicate to apply to the delete operation.
\n
update_predicate: predicate to apply to the update operation.
\n
insert_predicate: predicate to apply to the insert operation.
\n
update_column_set: rules to apply to the update operation which allows to\nset the value for each column to be updated.\n(e.g. {\"data\": \"new.data\", \"count\": \"current.count + 1\"} )
\n
insert_column_set: rules to apply to the insert operation which allows to\nset the value for each column to be inserted.\n(e.g. {\"date\": \"updates.date\", \"count\": \"1\"} )
This is very aligned with the way the execution environment connects to the output\nsystems (e.g., spark outputs).
\n\n
\n
spec_id: id of the output specification.
\n
input_id: id of the corresponding input specification.
\n
write_type: type of write operation.
\n
data_format: format of the output. Defaults to DELTA.
\n
db_table: table name in the form of <db>.<table>.
\n
location: uri that identifies from where to write data in the specified format.
\n
partitions: list of partition input_col names.
\n
merge_opts: options to apply to the merge operation.
\n
streaming_micro_batch_transformers: transformers to invoke for each streaming micro batch, before writing (i.e., in Spark's foreachBatch structured streaming function). Note: the lakehouse engine manages this for you, so you don't have to (and we don't advise you to) manually specify streaming transformations through this parameter. Supply them as regular transformers in the transform_specs sections of an ACON.
\n
streaming_once: if the streaming query is to be executed just once, or not,\ngenerating just one micro batch.
\n
streaming_processing_time: if streaming query is to be kept alive, this indicates\nthe processing time of each micro batch.
\n
streaming_available_now: if set to True, set a trigger that processes all\navailable data in multiple batches then terminates the query.\nWhen using streaming, this is the default trigger that the lakehouse-engine will\nuse, unless you configure a different one.
\n
streaming_continuous: set a trigger that runs a continuous query with a given\ncheckpoint interval.
\n
streaming_await_termination: whether to wait (True) for the termination of the\nstreaming query (e.g. timeout or exception) or not (False). Default: True.
\n
streaming_await_termination_timeout: a timeout to set to the\nstreaming_await_termination. Default: None.
\n
with_batch_id: whether to include the streaming batch id in the final data,\nor not. It only takes effect in streaming mode.
\n
options: dict with other relevant options according to the execution environment\n(e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for\nstreaming, etc.
\n
streaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers\nbut for the DQ functions to be executed. Used internally by the lakehouse\nengine, so you don't have to supply DQ functions through this parameter. Use the\ndq_specs of the acon instead.
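A sketch of an output specification performing a Delta merge, combining the parameters above with the merge options documented earlier (names and predicates are placeholders):

# An output spec entry for the output_specs list of an ACON.
output_spec = {
    "spec_id": "orders_silver",
    "input_id": "orders_dq",
    "write_type": "merge",
    "data_format": "delta",
    "db_table": "my_database.orders_silver",
    "merge_opts": {
        "merge_predicate": "current.order_id = new.order_id",
        "insert_only": False,
        "update_column_set": {"amount": "new.amount", "count": "current.count + 1"},
    },
}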
metrics: list of metrics in the form of:
[{
    metric: name of the column present in both truth and current datasets,
    aggregation: sum, avg, max, min, ...,
    type: percentage or absolute,
    yellow: value,
    red: value
}].
\n
recon_type: reconciliation type (percentage or absolute). Percentage calculates\nthe difference between truth and current results as a percentage (x-y/x), and\nabsolute calculates the raw difference (x - y).
\n
truth_input_spec: input specification of the truth data.
\n
current_input_spec: input specification of the current results data
\n
truth_preprocess_query: additional query on top of the truth input data to\npreprocess the truth data before it gets fueled into the reconciliation process.\nImportant note: you need to assume that the data out of\nthe truth_input_spec is referencable by a table called 'truth'.
\n
truth_preprocess_query_args: optional dict having the functions/transformations to\napply on top of the truth_preprocess_query and respective arguments. Note: cache\nis being applied on the Dataframe, by default. For turning the default behavior\noff, pass \"truth_preprocess_query_args\": [].
\n
current_preprocess_query: additional query on top of the current results input\ndata to preprocess the current results data before it gets fueled into the\nreconciliation process. Important note: you need to assume that the data out of\nthe current_results_input_spec is referencable by a table called 'current'.
\n
current_preprocess_query_args: optional dict having the\nfunctions/transformations to apply on top of the current_preprocess_query\nand respective arguments. Note: cache is being applied on the Dataframe,\nby default. For turning the default behavior off, pass\n\"current_preprocess_query_args\": [].
\n
ignore_empty_df: optional boolean to ignore the recon process if both the source and target dataframes are empty; in that case the recon exits with a success code (passed).
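A sketch of a reconciliation ACON built from these parameters (table names, metrics and thresholds are placeholders); it would then be handed to the engine's reconciliation entry point (execute_reconciliation in recent versions - confirm against your engine's public API):

acon = {
    "metrics": [
        {"metric": "amount", "aggregation": "sum", "type": "percentage", "yellow": 0.05, "red": 0.1}
    ],
    "recon_type": "percentage",
    "truth_input_spec": {
        "spec_id": "truth",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.orders_truth",
    },
    "current_input_spec": {
        "spec_id": "current",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.orders_silver",
    },
    # The truth/current data is referencable as 'truth' and 'current' in these queries.
    "truth_preprocess_query": "SELECT order_id, amount FROM truth",
    "current_preprocess_query": "SELECT order_id, amount FROM current",
}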
input_spec: input specification of the data to be checked/validated.
\n
dq_spec: data quality specification.
\n
restore_prev_version: specify if, having\ndelta table/files as input, they should be restored to the\nprevious version if the data quality process fails. Note: this\nis only considered if fail_on_error is kept as True.
assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
\n
control_db_table_name: db.table to store sensor metadata.
\n
input_spec: input specification of the source to be checked for new data.
\n
preprocess_query: SQL query to transform/filter the result from the\nupstream. Consider that we should refer to 'new_data' whenever\nwe are referring to the input of the sensor. E.g.:\n \"SELECT dummy_col FROM new_data WHERE ...\"
\n
checkpoint_location: optional location to store checkpoints to resume\nfrom. These checkpoints use the same as Spark checkpoint strategy.\nFor Spark readers that do not support checkpoints, use the\npreprocess_query parameter to form a SQL query to filter the result\nfrom the upstream accordingly.
\n
fail_on_empty_result: if the sensor should throw an error if there is no new\ndata in the upstream. Default: True.
query_label_filter: query use-case label to execute.\nqueue_filter: queue to execute the job.\ncadence_filter: selected cadences to build the asset.\ntarget_database: target database to write.\ncurr_date: current date.\nstart_date: period start date.\nend_date: period end date.\nrerun_flag: rerun flag.\ntarget_table: target table to write.\nsource_database: source database.\ngab_base_path: base path to read the use cases.\nlookup_table: gab configuration table.\ncalendar_table: gab calendar table.
Based on the use case configuration return the values to override in the SQL file.\nThis enum aims to exhaustively map each combination of cadence, reconciliation,\n week_start and snap_flag return the corresponding values join_select,\n project_start and project_end to replace this values in the stages SQL file.
\n\n
Return corresponding configuration (join_select, project_start, project_end) for\n each combination (cadence x recon x week_start x snap_flag).
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.GABCombinedConfiguration.COMBINED_CONFIGURATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "GABCombinedConfiguration.COMBINED_CONFIGURATION", "kind": "variable", "doc": "\n", "default_value": "<GABCombinedConfiguration.COMBINED_CONFIGURATION: {1: {'cadence': 'DAY', 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('${cad}',${date_column}))"}, 2: {'cadence': 'WEEK', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\\n select distinct case\\n when '${config_week_start}' = 'Monday' then weekstart_mon\\n when '${config_week_start}' = 'Sunday' then weekstart_sun\\n end as cadence_start_date,\\n calendar_date as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 3: {'cadence': 'WEEK', 'recon': {'YEAR', 'QUARTER', 'MONTH', 'DAY'}, 'week_start': 'M', 'snap_flag': {'Y', 'N'}, 'join_select': "\\n select distinct case\\n when '${config_week_start}' = 'Monday' then weekstart_mon\\n when '${config_week_start}' = 'Sunday' then weekstart_sun\\n end as cadence_start_date,\\n case\\n when '${config_week_start}' = 'Monday' then weekend_mon\\n when '${config_week_start}' = 'Sunday' then weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 4: {'cadence': 'MONTH', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct month_start as cadence_start_date,\\n calendar_date as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 5: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct month_start as cadence_start_date,\\n case\\n when date(\\n date_trunc('MONTH',add_months(calendar_date, 1))\\n )-1 < weekend_mon\\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 6: {'cadence': 'MONTH', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct month_start as cadence_start_date,\\n case\\n when date(\\n date_trunc('MONTH',add_months(calendar_date, 1))\\n )-1 < weekend_sun\\n then date(date_trunc('MONTH',add_months(calendar_date, 1)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 7: {'cadence': 'MONTH', 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "date(date_trunc('MONTH',add_months(${date_column}, 1)))-1"}, 8: {'cadence': 'QUARTER', 'recon': 'DAY', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct quarter_start as cadence_start_date,\\n calendar_date as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 9: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct quarter_start as cadence_start_date,\\n case\\n when weekend_mon > date(\\n 
date_trunc('QUARTER',add_months(calendar_date, 3))\\n )-1\\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 10: {'cadence': 'QUARTER', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct quarter_start as cadence_start_date,\\n case\\n when weekend_sun > date(\\n date_trunc('QUARTER',add_months(calendar_date, 3))\\n )-1\\n then date(date_trunc('QUARTER',add_months(calendar_date, 3)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 11: {'cadence': 'QUARTER', 'recon': 'MONTH', 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': '\\n select distinct quarter_start as cadence_start_date,\\n month_end as cadence_end_date\\n ', 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 12: {'cadence': 'QUARTER', 'recon': 'YEAR', 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 3)\\n )\\n )-1\\n "}, 13: {'cadence': 'QUARTER', 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months( date(date_trunc('${cad}',${date_column})), 3)\\n )\\n )-1\\n "}, 14: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'M', 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when weekend_mon > date(\\n date_trunc('YEAR',add_months(calendar_date, 12))\\n )-1\\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\\n else weekend_mon\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 15: {'cadence': 'YEAR', 'recon': 'WEEK', 'week_start': 'S', 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when weekend_sun > date(\\n date_trunc('YEAR',add_months(calendar_date, 12))\\n )-1\\n then date(date_trunc('YEAR',add_months(calendar_date, 12)))-1\\n else weekend_sun\\n end as cadence_end_date", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 16: {'cadence': 'YEAR', 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': 'N', 'inverse_flag': 'Y', 'join_select': '', 'project_start': "date(date_trunc('${cad}',${date_column}))", 'project_end': "\\n date(\\n date_trunc(\\n '${cad}',add_months(date(date_trunc('${cad}',${date_column})), 12)\\n )\\n )-1\\n "}, 17: {'cadence': 'YEAR', 'recon': {'QUARTER', 'MONTH', 'DAY'}, 'week_start': {'M', 'S'}, 'snap_flag': 'Y', 'join_select': "\\n select distinct year_start as cadence_start_date,\\n case\\n when '${rec_cadence}' = 'DAY' then calendar_date\\n when '${rec_cadence}' = 'MONTH' then month_end\\n when '${rec_cadence}' = 'QUARTER' then quarter_end\\n end as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}, 18: {'cadence': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'recon': {'QUARTER', 'WEEK', 'DAY', 'YEAR', 'MONTH'}, 'week_start': {'M', 'S'}, 'snap_flag': {'Y', 'N'}, 'join_select': "\\n select 
distinct\\n case\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\\n then weekstart_mon\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\\n then weekstart_sun\\n else\\n date(date_trunc('${cad}',calendar_date))\\n end as cadence_start_date,\\n case\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Monday'\\n then weekend_mon\\n when '${cad}' = 'WEEK' and '${config_week_start}' = 'Sunday'\\n then weekend_sun\\n when '${cad}' = 'DAY'\\n then date(date_trunc('${cad}',calendar_date))\\n when '${cad}' = 'MONTH'\\n then date(\\n date_trunc(\\n 'MONTH',\\n add_months(date(date_trunc('${cad}',calendar_date)), 1)\\n )\\n )-1\\n when '${cad}' = 'QUARTER'\\n then date(\\n date_trunc(\\n 'QUARTER',\\n add_months(date(date_trunc('${cad}',calendar_date)) , 3)\\n )\\n )-1\\n when '${cad}' = 'YEAR'\\n then date(\\n date_trunc(\\n 'YEAR',\\n add_months(date(date_trunc('${cad}',calendar_date)), 12)\\n )\\n )-1\\n end as cadence_end_date\\n ", 'project_start': 'df_cal.cadence_start_date', 'project_end': 'df_cal.cadence_end_date'}}>"}, {"fullname": "lakehouse_engine.core.exec_env", "modulename": "lakehouse_engine.core.exec_env", "kind": "module", "doc": "
Module to take care of creating a singleton of the execution environment class.
Generate the new set of extended start and end dates based on the cadence.
\n\n
The week cadence is run again to extend to the correct week start and end dates when a recon window for the week cadence is present. For example, for end_date 2022-12-31, if a quarter recon window is present for the week cadence, the start and end dates are recalculated to 2022-10-01 and 2022-12-31. But these are not the start and end dates of a week; hence, to correct this, the new dates are passed again to get the correct dates.
\n\n
Arguments:
\n\n
\n
cadence: cadence to process.
\n
derived_cadence: cadence reconciliation to process.
\n
start_date: start date of the period to process.
\n
end_date: end date of the period to process.
\n
query_type: use case query type.
\n
current_date: current date to be used in the end date, in case the end date\nis greater than current date so the end date should be the current date.
Read data from delta table containing sensor status info.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id. If this parameter is defined search occurs\nonly considering this parameter. Otherwise, it considers sensor\nassets and checkpoint location.
\n
control_db_table_name: db.table to control sensor runs.
\n
assets: list of assets that are fueled by the pipeline\nwhere this sensor is.
\n
\n\n
Return:
\n\n
\n
Row containing the data for the provided sensor_id.
Generates a sensor preprocess query based on timestamp logic.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id.
\n
filter_exp: expression to filter incoming new data.\nYou can use the placeholder ?upstream_value so that\nit can be replaced by the upstream_value in the\ncontrol_db_table_name for this specific sensor_id.
\n
control_db_table_name: db.table to retrieve the last status change\ntimestamp. This is only relevant for the jdbc sensor.
\n
upstream_key: the key of custom sensor information\nto control how to identify new data from the\nupstream (e.g., a time column in the upstream).
\n
upstream_value: value for custom sensor\nto identify new data from the upstream\n(e.g., the value of a time present in the upstream)\nIf none we will set the default value.\nNote: This parameter is used just to override the\ndefault value -2147483647.
\n
upstream_table_name: value for custom sensor\nto query new data from the upstream.\nIf none we will set the default value,\nour sensor_new_data view.
Expect values in column A to be not equal to column B.
\n\n
Arguments:
\n\n
\n
column_A: The first column name.
\n
column_B: The second column name.
\n
\n\n
Keyword Args:
\n\n
\n
\n
allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
Expect values in column A to be lower or equal than column B.
\n\n
Arguments:
\n\n
\n
column_A: The first column name.
\n
column_B: The second column name.
\n
margin: additional approximation to column B value.
\n
\n\n
Keyword Args:
\n\n
\n
\n
allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
Expect value in column to be date that is not older than a given time.
\n\n
Since timedelta can only define an interval up to weeks, a month is defined\nas 4 weeks and a year is defined as 52 weeks.
\n\n
Arguments:
\n\n
\n
column: Name of column to validate
\n
Note: Column must be of type Date, Timestamp or String (with Timestamp format).\nFormat: yyyy-MM-ddTHH:mm:ss
\n
timeframe: dict with the definition of the timeframe.
\n
kwargs: dict with additional parameters.
\n
\n\n
Keyword Args:
\n\n
\n
\n
allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.
This expectation asserts that column 'a' must be equal to column 'b' or column 'c'. In addition, it is possible to validate that column 'b' or 'c' matches a regex.
Run the specified data quality process on a dataframe.
\n\n
Based on the dq_specs we apply the defined expectations on top of the dataframe\nin order to apply the necessary validations and then output the result of\nthe data quality process.
\n\n
Arguments:
\n\n
\n
dq_spec: data quality specification.
\n
data: input dataframe to run the dq process on.
\n
\n\n
Returns:
\n\n
\n
The DataFrame containing the results of the DQ process.
This function does a full build of data docs based on all the great expectations\ncheckpoints in the specified location, getting all history of run/validations\nexecuted and results.
\n\n
Arguments:
\n\n
\n
store_backend: which store_backend to use (e.g. s3 or file_system).
\n
local_fs_root_dir: path of the root directory. Note: only applicable\nfor store_backend file_system
\n
data_docs_local_fs: path of the root directory. Note: only applicable\nfor store_backend file_system.
\n
data_docs_prefix: prefix where to store data_docs' data.
\n
bucket: the bucket name to consider for the store_backend\n(store DQ artefacts). Note: only applicable for store_backend s3.
\n
data_docs_bucket: the bucket name for data docs only. When defined,\nit will supersede bucket parameter.\nNote: only applicable for store_backend s3.
\n
expectations_store_prefix: prefix where to store expectations' data.\nNote: only applicable for store_backend s3.
\n
validations_store_prefix: prefix where to store validations' data.\nNote: only applicable for store_backend s3.
\n
checkpoint_store_prefix: prefix where to store checkpoints' data.\nNote: only applicable for store_backend s3.
We use getattr to dynamically execute any expectation available.\ngetattr(validator, function) is similar to validator.function(). With this\napproach, we can execute any expectation supported.
\n\n
Arguments:
\n\n
\n
context: the BaseDataContext containing the configurations for the data\nsource and store backend.
\n
batch_request: run time batch request to be able to query underlying data.
\n
expectation_suite_name: name of the expectation suite.
\n
dq_functions: a list of DQFunctionSpec to consider in the expectation suite.
\n
critical_functions: list of critical expectations in the expectation suite.
Update the sensor status in the control table. It should be used to tell the system that the sensor has processed all new data that was previously identified, hence updating the sensor status. It is usually used to move from SensorStatus.ACQUIRED_NEW_DATA to SensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still to identify - where we can update the sensor status from/to different statuses.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id.
\n
control_db_table_name:db.table to store sensor checkpoints.
\n
status: status of the sensor.
\n
assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
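A minimal sketch of marking a sensor as processed at the end of a job, assuming update_sensor_status is exposed in lakehouse_engine.engine like execute_sensor; the sensor id, table and asset names are placeholders:

from lakehouse_engine.engine import update_sensor_status

update_sensor_status(
    sensor_id="MY_SENSOR_ID",
    control_db_table_name="my_database.lakehouse_engine_sensors",
    status="PROCESSED_NEW_DATA",
    assets=["MY_SENSOR_ASSETS"],
)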
Generates a preprocess query to be used in a sensor configuration.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id.
\n
filter_exp: expression to filter incoming new data.\nYou can use the placeholder ?default_upstream_key and\n?default_upstream_value, so that it can be replaced by the\nrespective values in the control_db_table_name for this specific\nsensor_id.
\n
control_db_table_name:db.table to retrieve the last status change\ntimestamp. This is only relevant for the jdbc sensor.
\n
upstream_key: the key of custom sensor information to control how to\nidentify new data from the upstream (e.g., a time column in the\nupstream).
\n
upstream_value: the upstream value\nto identify new data from the upstream (e.g., the value of a time\npresent in the upstream).
\n
upstream_table_name: value for custom sensor\nto query new data from the upstream\nIf none we will set the default value,\nour sensor_new_data view.
This function does a full build of data docs based on all the great expectations\ncheckpoints in the specified location, getting all history of run/validations\nexecuted and results.
\n\n
Arguments:
\n\n
\n
store_backend: which store_backend to use (e.g. s3 or file_system).
\n
local_fs_root_dir: path of the root directory. Note: only applicable\nfor store_backend file_system
\n
data_docs_local_fs: path of the root directory. Note: only applicable\nfor store_backend file_system.
\n
data_docs_prefix: prefix where to store data_docs' data.
\n
bucket: the bucket name to consider for the store_backend\n(store DQ artefacts). Note: only applicable for store_backend s3.
\n
data_docs_bucket: the bucket name for data docs only. When defined,\nit will supersede bucket parameter.\nNote: only applicable for store_backend s3.
\n
expectations_store_prefix: prefix where to store expectations' data.\nNote: only applicable for store_backend s3.
\n
validations_store_prefix: prefix where to store validations' data.\nNote: only applicable for store_backend s3.
\n
checkpoint_store_prefix: prefix where to store checkpoints' data.\nNote: only applicable for store_backend s3.
Exception for when the input of an incremental filter is not found.
\n\n
This may occur when tables are being loaded incrementally, taking the increment definition out of a specific table, but that table does not exist yet, most likely because it was never loaded for the first time.
Define how to write a streaming micro batch after transforming it.
\n\n
This function must define an inner function that manipulates a streaming batch,\nand then return that function. Look for concrete implementations of this\nfunction for more clarity.
\n\n
Arguments:
\n\n
\n
kwargs: any keyword arguments.
\n
\n\n
Returns:
\n\n
\n
A function to be executed in the foreachBatch spark write method.
After the write operation we repair the table (e.g., update partitions). However, there is a caveat: this repair operation is not reachable when running in long-running streaming mode. Therefore, we recommend not using the TableWriter with formats other than delta lake in those scenarios (as delta lake does not need msck repair). So, you can: 1) use the delta lake format for the table; or 2) use the FileWriter and run the repair with a certain frequency in a separate task of your pipeline.
Optimize a dataset based on a set of pre-conceived optimizations.
\n\n
Most of the time the dataset is a table, but it can be a file-based one only.
\n\n
Arguments:
\n\n
\n
db_table:database_name.table_name.
\n
location: dataset/table filesystem location.
\n
compute_table_stats: to compute table statistics or not.
\n
vacuum: (delta lake tables only) whether to vacuum the delta lake\ntable or not.
\n
vacuum_hours: (delta lake tables only) number of hours to consider\nin vacuum operation.
\n
optimize: (delta lake tables only) whether to optimize the table or\nnot. Custom optimize parameters can be supplied through ExecEnv (Spark)\nconfigs
\n
optimize_where: expression to use in the optimize function.
\n
optimize_zorder_col_list: (delta lake tables only) list of\ncolumns to consider in the zorder optimization process. Custom optimize\nparameters can be supplied through ExecEnv (Spark) configs.
\n
debug: flag indicating if we are just debugging this for local\ntests and therefore pass through all the exceptions to perform some\nassertions in local tests.
Update the sensor status in the control table. It should be used to tell the system that the sensor has processed all new data that was previously identified, hence updating the sensor status. It is usually used to move from SensorStatus.ACQUIRED_NEW_DATA to SensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still to identify - where we can update the sensor status from/to different statuses.
\n\n
Arguments:
\n\n
\n
sensor_id: sensor id.
\n
control_db_table_name:db.table to store sensor checkpoints.
\n
status: status of the sensor.
\n
assets: a list of assets that are considered as available to\nconsume downstream after this sensor has status\nPROCESSED_NEW_DATA.
Explode columns with types like ArrayType and MapType.
\n\n
Afterwards, the flatten_schema transformation can be applied, if we want, for example, to explode the map (as we explode a StructType) or to explode a StructType inside the array. We recommend always specifying the columns you want to explode, instead of exploding all columns.
\n\n
Arguments:
\n\n
\n
explode_arrays: whether you want to explode array columns (True)\nor not (False). Default: False.
\n
array_cols_to_explode: array columns which you want to explode.\nIf you don't specify it will get all array columns and explode them.\nDefault: None.
\n
explode_maps: whether you want to explode map columns (True)\nor not (False). Default: False.
\n
map_cols_to_explode: map columns which you want to explode.\nIf you don't specify it will get all map columns and explode them.\nDefault: None.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
cols: dict with columns and respective target names.
\n
escape_col_names: whether to escape column names (e.g. /BIC/COL1) or not.\nIf True it creates a column with the new name and drop the old one.\nIf False, uses the native withColumnRenamed Spark function.\nDefault: True.
\n
\n\n
Returns:
\n\n
\n
Function to be called in .transform() spark function.
Convert a json string into a json column (struct).
\n\n
The new json column can be added to the existing columns (default) or it can\nreplace all the others, being the only one to output. The new column gets the\nsame name as the original one suffixed with '_json'.
\n\n
Arguments:
\n\n
\n
input_col: dict with columns and respective target names.
\n
schema_path: path to the StructType schema (spark schema).
\n
schema: dict with the StructType schema (spark schema).
\n
json_options: options to parse the json value.
\n
drop_all_cols: whether to drop all the input columns or not.\nDefaults to False.
\n
disable_dbfs_retry: optional flag to disable file storage dbfs.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
in_cols: name(s) of the input column(s). Example values: "*" - all columns; "my_col" - one column named "my_col"; "my_col1, my_col2" - two columns.
\n
out_col: name of the output column.
\n
json_options: options to parse the json value.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
Condense Change Data Capture (CDC) based on record_mode strategy.
\n\n
This CDC data is particularly seen in some CDC enabled systems. Other systems\nmay have different CDC strategies.
\n\n
Arguments:
\n\n
\n
business_key: The business key (logical primary key) of the data.
\n
ranking_key_desc: In this type of CDC condensation the data needs to be\nin descending order in a certain way, using columns specified in this\nparameter.
\n
ranking_key_asc: In this type of CDC condensation the data needs to be\nin ascending order in a certain way, using columns specified in\nthis parameter.
\n
record_mode_col: Name of the record mode input_col.
\n
valid_record_modes: Depending on the context, not all record modes may be\nconsidered for condensation. Use this parameter to skip those.
\n
\n\n
Returns:
\n\n
\n
A function to be executed in the .transform() spark function.
Execute a custom transformation provided by the user.
\n\n
This transformer can be very useful whenever the user cannot use our provided\ntransformers, or they want to write complex logic in the transform step of the\nalgorithm.
\n\n
\n\n
Attention!
\n\n
Please bear in mind that the custom_transformer function provided\nas argument needs to receive a DataFrame and return a DataFrame,\nbecause it is how Spark's .transform method is able to chain the\ntransformations.
\n\n
\n\n
Example:
\n\n
\n
from pyspark.sql import DataFrame

def my_custom_logic(df: DataFrame) -> DataFrame:
    # Placeholder body: apply any required pyspark logic and return a DataFrame.
    return df
\n
\n\n
Arguments:
\n\n
\n
custom_transformer: custom transformer function. A python function with all\nrequired pyspark logic provided by the user.
\n
\n\n
Returns:
\n\n
\n
Callable: the same function provided as parameter, in order to be called later in the TransformerFactory.
Execute a SQL transformation provided by the user.
\n\n
This transformer can be very useful whenever the user wants to perform\nSQL-based transformations that are not natively supported by the\nlakehouse engine transformers.
\n\n
Arguments:
\n\n
\n
sql: the SQL query to be executed. This can read from any table or\nview from the catalog, or any dataframe registered as a temp\nview.
\n
\n\n
Returns:
\n\n
\n
Callable: A function to be called in .transform() spark function.
Create day/month/week/quarter/year hierarchy for the provided date columns.
\n\n
Uses Spark's extract function.
\n\n
Arguments:
\n\n
\n
cols: list of names of the date columns to create the hierarchy.
\n
formats: dict with the correspondence between the hierarchy and the format\nto apply. Check here.\nExample: {\n \"year\": \"year\",\n \"month\": \"month\",\n \"day\": \"day\",\n \"week\": \"week\",\n \"quarter\": \"quarter\"\n}
\n
\n\n
Returns:
\n\n
\n
A function to be executed in the .transform() spark function.
Incrementally Filter a certain dataframe given an increment logic.
\n\n
This logic can either be an increment value or an increment dataframe from which to get the latest value. By default, the operator for the filtering process is greater or equal, to cover cases where we receive late-arriving data not covered in a previous load. You can change greater_or_equal to false to use greater, when you trust that the source will never output more data with the increment after you have loaded the data (e.g., you will never load data while the source is still dumping data, which may cause you to get an incomplete picture of the last arrived data).
\n\n
Arguments:
\n\n
\n
input_col: input column name
\n
increment_value: value by which to filter the data, considering the provided input_col.
\n
increment_df: a dataframe to get the increment value from.\nyou either specify this or the increment_value (this takes precedence).\nThis is a good approach to get the latest value from a given dataframe\nthat was read and apply that value as filter here. In this way you can\nperform incremental loads based on the last value of a given dataframe\n(e.g., table or file based). Can be used together with the\nget_max_value transformer to accomplish these incremental based loads.\nSee our append load feature tests to see how to provide an acon for\nincremental loads, taking advantage of the scenario explained here.
\n
increment_col: name of the column from which to get the increment\nvalue from (when using increment_df approach). This assumes there's\nonly one row in the increment_df, reason why is a good idea to use\ntogether with the get_max_value transformer. Defaults to \"latest\"\nbecause that's the default output column name provided by the\nget_max_value transformer.
\n
greater_or_equal: if filtering should be done by also including the\nincrement value or not (useful for scenarios where you are performing\nincrement loads but still want to include data considering the increment\nvalue, and not only values greater than that increment... examples may\ninclude scenarios where you already loaded data including those values,\nbut the source produced more data containing those values).\nDefaults to false.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
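To make the chaining with get_max_value concrete, below is a minimal, hypothetical transform_specs sketch (spec ids, table and column names are illustrative; the engine is assumed to resolve the increment_df reference by spec id, as in the append load examples):

# hypothetical transform_specs chaining get_max_value and incremental_filter
"transform_specs": [
    {
        "spec_id": "max_order_date_bronze",
        "input_id": "orders_bronze",
        "transformers": [{"function": "get_max_value", "args": {"input_col": "order_date"}}],
    },
    {
        "spec_id": "orders_filtered",
        "input_id": "orders_source",
        "transformers": [
            {
                "function": "incremental_filter",
                "args": {
                    "input_col": "order_date",
                    "increment_df": "max_order_date_bronze",
                    "greater_or_equal": False,
                },
            },
        ],
    },
],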
Drop duplicate rows using spark function dropDuplicates().
\n\n
This transformer can be used with or without arguments. The provided argument needs to be a list of columns. For example: ["Name", "VAT"] will drop duplicate records within the "Name" and "VAT" columns. If the transformer is used without providing any columns list, or providing an empty list such as [], the result will be the same as using the distinct() pyspark function. If the watermarker dict is present, it will ensure that the drop operation only applies to rows within the watermark time window.

Arguments:

cols: column names.

watermarker: properties to apply the watermarker to the transformer.

Returns:

A function to be called in the .transform() spark function.
Join two dataframes based on specified type and columns.
\n\n
Some stream to stream joins are only possible if you apply a watermark, so this method also provides a parameter to enable the watermarking specification.
\n\n
Arguments:
\n\n
\n
left_df_alias: alias of the first dataframe.
\n
join_with: right dataframe.
\n
right_df_alias: alias of the second dataframe.
\n
join_condition: condition to join dataframes.
\n
join_type: type of join. Defaults to inner. Available values: inner, cross, outer, full, full outer, left, left outer, right, right outer, semi, left semi, anti, and left anti.
\n
broadcast_join: whether to perform a broadcast join or not.
\n
select_cols: list of columns to select at the end.
\n
watermarker: properties to apply watermarking.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
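As an illustration only, a hypothetical transformers entry for this join could look like the sketch below (aliases, the join_with reference, condition and selected columns are assumptions; confirm the exact argument wiring against the transformers documentation):

{
    "function": "join",
    "args": {
        "join_with": "prices_input",
        "left_df_alias": "a",
        "right_df_alias": "b",
        "join_condition": "a.article_id = b.article_id",
        "join_type": "left outer",
        "broadcast_join": False,
        "select_cols": ["a.*", "b.price"],
    },
}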
If num_partitions is provided, repartitioning happens based on the provided number; otherwise it happens based on the values of the provided cols (columns).
\n\n
Arguments:
\n\n
\n
num_partitions: num of partitions to repartition.
\n
cols: list of columns to use for repartitioning.
\n
\n\n
Returns:
\n\n
\n
A function to be called in .transform() spark function.
num_partitions: number of Spark partitions to split the extraction.

lower_bound: lower bound to decide the partition stride.

upper_bound: upper bound to decide the partition stride. If calculate_upper_bound is True, then upperBound will be derived by our upper bound optimizer, using the partition column.

default_upper_bound: the value to use as default upper bound in case the result of the upper bound calculation is None. Default: "1".

fetch_size: how many rows to fetch per round trip. Default: "100000".

custom_schema: specify custom_schema for particular columns of the returned dataframe in the init/delta extraction of the source table.

min_timestamp: min timestamp to consider to filter the changelog data. Default: None and automatically derived from the location provided. In case this one is provided it has precedence and the calculation is not done.

max_timestamp: max timestamp to consider to filter the changelog data. Default: None and automatically derived from the table having information about the extraction requests, their timestamps and their status. In case this one is provided it has precedence and the calculation is not done.

generate_predicates: whether to generate predicates automatically or not. Default: False.

predicates: list containing all values to partition (if generate_predicates is used, the manual values provided are ignored). Default: None.

predicates_add_null: whether to consider null on the predicates list. Default: True.

extraction_timestamp: the timestamp of the extraction. Default: current time following the format "%Y%m%d%H%M%S".

max_timestamp_custom_schema: custom schema used on the max_timestamp derivation from the table holding the extraction requests information.
Helper to get additional Spark Options initially passed.
\n\n
If people provide additional Spark options not covered by the util function arguments (get_spark_jdbc_options), we need to consider them. Thus, we update the options retrieved by the utils by checking if there is any Spark option initially provided that is not yet considered in the retrieved options or function arguments and whose value is not None. If these conditions are fulfilled, we add the option and return the complete dict.

Arguments:

input_spec: the input specification.

options: dict with Spark options.

ignore_options: list of options to be ignored by the process. Spark read has two different approaches to parallelize the reading process: one of them uses upper/lower bound, the other uses predicates. These approaches can't be used at the same time, so you must choose one of them. When choosing predicates you can't pass lower and upper bound, nor the number of partitions and partition column, otherwise spark will interpret the execution as partitioned by upper and lower bound and will expect all those variables to be filled. To avoid hardcoding all the predicates in the acon, there is a feature that automatically generates all predicates for an init or delta load based on the input partition column; but at the end of the process the partition column can't be passed to the options, because we are choosing predicates execution. That is why, to generate predicates, we need to pass some options to ignore.

Returns:

a dict with all the options passed as argument, plus the options that were initially provided but were not used in the util (get_spark_jdbc_options).
Configurations available for an Extraction from SAP B4.
\n\n
It inherits from the JDBCExtraction configurations, so it can use and/or overwrite those configurations.

These configurations cover:

latest_timestamp_input_col: the column containing the request timestamps in the dataset in latest_timestamp_data_location. Default: REQTSN.

request_status_tbl: the name of the SAP B4 table having information about the extraction requests. Composed of database.table. Default: SAPHANADB.RSPMREQUEST.

request_col_name: name of the column having the request timestamp to join with the request status table. Default: REQUEST_TSN.

data_target: the data target to extract from. Used in the join operation with the request status table.

act_req_join_condition: the join condition into the activation table can be changed using this property. Default: 'tbl.reqtsn = req.request_col_name'.

include_changelog_tech_cols: whether to include the technical columns (usually coming from the changelog table) or not.

extra_cols_req_status_tbl: columns to be added from the request status table. It needs to contain the prefix "req.". E.g. "req.col1 as column_one, req.col2 as column_two".

request_status_tbl_filter: filter to use for filtering the request status table, influencing the calculation of the max timestamps and the delta extractions.

adso_type: the type of ADSO that you are extracting from. Can be "AQ" or "CL".

max_timestamp_custom_schema: the custom schema to apply on the calculation of the max timestamp to consider for the delta extractions. Default: timestamp DECIMAL(23,0).

default_max_timestamp: the timestamp to use as default, when it is not possible to derive one.

custom_schema: specify custom_schema for particular columns of the returned dataframe in the init/delta extraction of the source table.
Get the data_target from the data_target option or derive it.
\n\n
By definition data_target is the same for the table and the changelog table: it is the same string ignoring everything before / and the first and last character after /. E.g. for a dbtable /BIC/abtable12, the data_target would be btable1, as sketched below.
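A minimal sketch of that derivation (assuming the rule refers to the last '/'; the helper name is hypothetical, not the engine's actual implementation):

def derive_data_target(dbtable: str) -> str:
    """Derive the data_target from a dbtable, e.g. '/BIC/abtable12' -> 'btable1'."""
    # keep only the part after the last '/', then drop its first and last character
    last_part = dbtable.rsplit("/", 1)[-1]
    return last_part[1:-1]

# derive_data_target("/BIC/abtable12") == "btable1"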
Configurations available for an Extraction from SAP BW.
\n\n
It inherits from the JDBCExtraction configurations, so it can use and/or overwrite those configurations.
\n\n
These configurations cover:
\n\n
\n
latest_timestamp_input_col: the column containing the actrequest timestamp in the dataset in latest_timestamp_data_location. Default: "actrequest_timestamp".

act_request_table: the name of the SAP BW activation requests table. Composed of database.table. Default: SAPPHA.RSODSACTREQ.

request_col_name: name of the column having the request to join with the activation request table. Default: actrequest.

act_req_join_condition: the join condition into the activation table can be changed using this property. Default: 'changelog_tbl.request = act_req.request_col_name'.

odsobject: name of the BW Object, used for joining with the activation request table to get the max actrequest_timestamp to consider while filtering the changelog table.

include_changelog_tech_cols: whether to include the technical columns (usually coming from the changelog table) or not. Default: True.

extra_cols_act_request: list of columns to be added from the act request table. It needs to contain the prefix "act_req.". E.g. "act_req.col1 as column_one, act_req.col2 as column_two".

get_timestamp_from_act_request: whether to get the init timestamp from the act request table or assume the current/given timestamp.

sap_bw_schema: sap bw schema. Default: SAPPHA.

max_timestamp_custom_schema: the custom schema to apply on the calculation of the max timestamp to consider for the delta extractions. Default: timestamp DECIMAL(23,0).

default_max_timestamp: the timestamp to use as default, when it is not possible to derive one.

With the table name we may also get the db name, so we need to split. Moreover, there might be the need for people to specify the odsobject if it is different from the dbtable.
date_time_gt(str): Filter the files greater than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS".

date_time_lt(str): Filter the files lower than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS".

earliest_file(bool): Filter the earliest dated file in the directory.

file_name_contains(str): Filter files that match the pattern.

latest_file(bool): Filter the most recent dated file in the directory.

sub_dir(bool): When true, the engine will search for files in subdirectories of the remote_path. It will consider one level below the remote_path. When sub_dir is used with the latest_file/earliest_file argument, the engine will retrieve the latest_file/earliest_file for each subdirectory.
\n
\n\n
Arguments:
\n\n
\n
sftp: the SFTP client object.
\n
remote_path: path of files to be filtered.
\n
options_args: options from the acon.
\n
\n\n
Returns:
\n\n
\n
A list containing the file names to be passed to Spark.

"gss_deleg_creds" - optional - Delegate GSS-API client credentials or not.

"gss_host" - optional - The target's name in the kerberos database.

"gss_trust_dns" - optional - Indicates whether or not the DNS is trusted to securely canonicalize the name of the host being connected to (default True).

"banner_timeout" - an optional timeout (in seconds) to wait for the SSH banner to be presented.

"auth_timeout" - an optional timeout (in seconds) to wait for an authentication response.

"disabled_algorithms" - an optional dict passed directly to Transport and its keyword argument of the same name.

"transport_factory" - an optional callable which is handed a subset of the constructor arguments (primarily those related to the socket, GSS functionality, and algorithm selection) and generates a Transport instance to be used by this client. Defaults to Transport.__init__.

The parameter to specify the private key is expected to be in RSA format. Attempting a connection with a blank host key is not allowed unless the argument "add_auto_policy" is explicitly set to True.

Returns:

sftp -> a new SFTPClient session object.
transport -> the Transport for this connection.
A dictionary that corresponds to the conclusion of a cadence.
\n\n
For any end date inputted by the user, we check whether this end date is actually the end of a cadence (YEAR, QUARTER, MONTH, WEEK). If the user input is 2024-03-31, this is both a month end and a quarter end, which means any use cases configured as month or quarter need to be calculated, as sketched below.
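A minimal, illustrative sketch of such a check (not the engine's actual helper; the cadence names are the ones listed above and the week-closing day is an assumption):

import calendar
from datetime import date


def closed_cadences(end_date: date) -> list:
    """Return which cadences the given end date closes (illustrative only)."""
    cadences = []
    if end_date.weekday() == 6:  # assumption: weeks close on Sunday
        cadences.append("WEEK")
    if end_date.day == calendar.monthrange(end_date.year, end_date.month)[1]:
        cadences.append("MONTH")
        if end_date.month in (3, 6, 9, 12):
            cadences.append("QUARTER")
        if end_date.month == 12:
            cadences.append("YEAR")
    return cadences


# closed_cadences(date(2024, 3, 31)) -> ["WEEK", "MONTH", "QUARTER"]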
Hide sensitive information from being shown in the logs.
\n\n
Based on the configured regex and replace strings, the content of the log records is replaced and then all the records are allowed to be logged (return True).
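As an illustration only (a sketch using the standard logging module, not the engine's actual filter class), that behaviour can be expressed like this:

import logging
import re


class SensitiveDataFilter(logging.Filter):
    """Illustrative filter: masks configured patterns and lets every record through."""

    def __init__(self, patterns: dict):
        super().__init__()
        # patterns maps a regex to its replacement, e.g. {r"password=\S+": "password=***"}
        self.patterns = {re.compile(regex): repl for regex, repl in patterns.items()}

    def filter(self, record: logging.LogRecord) -> bool:
        msg = record.getMessage()
        for regex, repl in self.patterns.items():
            msg = regex.sub(repl, msg)
        record.msg, record.args = msg, None
        return True  # always allow the (now masked) record to be logged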
This covers scenarios where the schema is provided as part of the input specification of the algorithm. The schema can come from the table specified in the input specification (enforce_schema_from_table) or from the dict with the spark schema provided there as well.
How to configure a DataLoader algorithm in the lakehouse-engine by using an ACON file?
\n\n
An algorithm (e.g., data load) in the lakehouse-engine is configured using an ACON. The lakehouse-engine is a configuration-driven framework, so people don't have to write code to execute a Spark algorithm. Instead, the algorithm is written in pyspark and accepts configurations through a JSON file (an ACON - algorithm configuration). The ACON is the configuration providing the behaviour of a lakehouse engine algorithm. You can check the algorithm code, and how it interprets the ACON, here. In this page we will go through the structure of an ACON file and which ACON files are most suitable for common data engineering scenarios. Check the underneath pages to find several ACON examples that cover many data extraction, transformation and loading scenarios.
\n\n
Overview of the Structure of the ACON file for DataLoads
\n\n
An ACON-based algorithm needs several specifications to work properly, but some of them might be optional. The available specifications are:
\n\n
\n
Input specifications (input_specs): specify how to read data. This is a mandatory keyword.
\n
Transform specifications (transform_specs): specify how to transform data.
\n
Data quality specifications (dq_specs): specify how to execute the data quality process.
\n
Output specifications (output_specs): specify how to write data to the target. This is a mandatory keyword.
\n
Terminate specifications (terminate_specs): specify what to do after writing into the target (e.g., optimising target table, vacuum, compute stats, expose change data feed to external location, etc.).
\n
Execution environment (exec_env): custom Spark session configurations to be provided for your algorithm (configurations can also be provided from your job/cluster configuration, which we highly advise you to do instead of passing performance related configs here for example).
\n
\n\n
Below is an example of a complete ACON file that reads from an s3 folder with CSVs and incrementally loads that data (using a merge) into a delta lake table.
\n\n
\n\n
spec_id is one of the main concepts to ensure you can chain the steps of the algorithm, so, for example, you can specify the transformations (in transform_specs) of a DataFrame that was read in the input_specs. Check ACON below to see how the spec_id of the input_specs is used as input_id in one transform specification.
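The ACON sketched below is illustrative only (spec ids, paths, transformer and merge predicate are assumptions, not the original example): it reads CSVs from an s3 folder, applies one transformer, and merges the result into a delta lake table, with the input spec_id being reused as input_id downstream.

acon = {
    "input_specs": [
        {
            "spec_id": "orders_source",
            "read_type": "batch",
            "data_format": "csv",
            "options": {"header": True, "delimiter": ","},
            "location": "s3://my_bucket/my_source_folder/",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "orders_transformed",
            "input_id": "orders_source",
            "transformers": [{"function": "with_row_id"}],
        }
    ],
    "output_specs": [
        {
            "spec_id": "orders_bronze",
            "input_id": "orders_transformed",
            "write_type": "merge",
            "data_format": "delta",
            "location": "s3://my_bucket/bronze/orders/",
            "merge_opts": {"merge_predicate": "new.order_id = current.order_id"},
        }
    ],
}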
You specify how to read the data by providing a list of Input Specifications. Usually there's just one element in that list, as, in the lakehouse, you are generally focused on reading data from one layer (e.g., source, bronze, silver, gold) and putting it on the next layer. However, there may be scenarios where you would like to combine two datasets (e.g., joins or incremental filtering on one dataset based on the values of another one), therefore you can use one or more elements. More information about InputSpecs.
\n\n
Relevant notes
\n\n
\n
A spec id is fundamental, so you can use the input data later on in any step of the algorithm (transform, write, dq process, terminate).
\n
You don't have to specify db_table and location at the same time. Depending on the data_format sometimes you read from a table (e.g., jdbc or deltalake table) sometimes you read from a location (e.g., files like deltalake, parquet, json, avro... or kafka topic).
\n
\n\n
Transform Specifications
\n\n
In the lakehouse engine, you transform data by providing a transform specification, which contains a list of transform functions (transformers). So the transform specification acts upon one input, and it can execute multiple lakehouse engine transformation functions (transformers) upon that input.

If you look into the example above, we ask the lakehouse engine to execute two functions on the orders_bronze input data: with_row_id and with_regex_value. Those functions can of course receive arguments. You can see a list of all available transformation functions (transformers) here: lakehouse_engine.transformers. Then, you just invoke them in your ACON as demonstrated above, following exactly the same function names and parameter names as described in the code documentation. More information about TransformSpec.
\n\n
Relevant notes
\n\n
\n
This stage is fully optional, you can omit it from the ACON.
\n
There is one relevant option, force_streaming_foreach_batch_processing, that can be used to force the transform to be executed in the foreachBatch function to ensure non-supported streaming operations can be properly executed. You don't have to worry about this if you are using regular lakehouse engine transformers. But if you are providing your custom logic in pyspark code via our lakehouse engine custom_transformation (lakehouse_engine.transformers.custom_transformers), then sometimes your logic may contain Spark functions that are not compatible with Spark Streaming, and therefore this flag can enable all of your computation to be streaming-compatible by pushing down all the logic into the foreachBatch() function.
\n
\n\n
Data Quality Specifications
\n\n
One of the most relevant features of the lakehouse engine is that you can have data quality guardrails that prevent you from loading bad data into your target layer (e.g., bronze, silver or gold). The lakehouse engine data quality process includes one main feature at the moment:
\n\n
\n
Validator: The capability to perform data quality checks on that data (e.g., is the max value of a column bigger than x?) and even tag your data with the results of the DQ checks.
\n
\n\n
The output of the data quality process can be written into a Result Sink target (e.g. table or files) and is integrated with a Data Docs website, which can be a company-wide available website for people to check the quality of their data and share with others.
\n\n
To achieve all of this functionality the lakehouse engine uses Great Expectations internally. To hide the Great Expectations internals from our user base and provide friendlier abstractions using the ACON, we have developed the concept of DQSpec that can contain many DQFunctionSpec objects, which is very similar to the relationship between the TransformSpec and TransformerSpec, which means you can have multiple Great Expectations functions executed inside a single data quality specification (as in the ACON above).
You can write the outputs of the DQ process to a sink through the result_sink* parameters of the DQSpec (see the sketch after this list). result_sink_options takes any Spark options for a DataFrame writer, which means you can specify the options according to your sink format (e.g., delta, parquet, json, etc.). We usually recommend using "delta" as the format.

You can use the results of the DQ checks to tag the data that you are validating. When configured, these details will appear as a new column (like any other), as part of the tables of your Data Product.

To be able to analyse the data in the result_sink*, we have an approach in which you set result_sink_explode to true (which is the default) and then some columns are expanded. Those are:

General columns: columns that hold the basic information regarding the dq_specs, which will always have values and do not depend on the expectation types chosen. Columns: checkpoint_config, run_name, run_time, run_results, success, validation_result_identifier, spec_id, input_id, validation_results, run_time_year, run_time_month, run_time_day.

Statistics columns: columns that hold information about the runs of the expectations, with values for the whole run and not for each expectation. These columns come from run_results.validation_result.statistics.*.

Expectations columns: columns that hold information about the expectation executed. Columns: expectation_type, batch_id, expectation_success, exception_info. These columns are exploded from run_results.validation_result.results inside expectation_config.expectation_type, expectation_config.kwargs.batch_id, success as expectation_success, and exception_info. Moreover, we also include unexpected_index_list, observed_value and kwargs.

Arguments of Expectations columns: columns that depend on the expectation_type selected. These columns are exploded from run_results.validation_result.results inside expectation_config.kwargs.*. We can have, for example: column, column_A, column_B, max_value, min_value, value, value_pairs_set, value_set, and others.

More columns desired? Those can be added using result_sink_extra_columns, in which you can select columns like <name> and/or explode columns like <name>.*.

Use the parameter "source" to identify the data used, for an easier analysis.

By default, Great Expectations will also provide a site presenting the history of the DQ validations that you have performed on your data.

You can make an analysis of all your expectations and create a dashboard aggregating all that information.

This stage is fully optional, you can omit it from the ACON.
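A hypothetical dq_specs entry using a result sink could look like the sketch below (bucket names, prefixes, table and parameter values are assumptions; check the DQSpec documentation for the exact parameter list):

"dq_specs": [
    {
        "spec_id": "orders_quality",
        "input_id": "orders_transformed",
        "dq_type": "validator",
        "bucket": "my_dq_bucket",
        "result_sink_db_table": "my_database.dq_result_sink",
        "result_sink_location": "s3://my_dq_bucket/dq/result_sink/",
        "result_sink_format": "delta",
        "result_sink_explode": True,
        "source": "orders",
        "dq_functions": [
            {"function": "expect_column_values_to_not_be_null", "args": {"column": "order_id"}},
        ],
    }
],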
\n
\n\n
Output Specifications
\n\n
The output_specs section of an ACON is relatively similar to the input_specs section, but of course focusing on how to write the results of the algorithm, instead of specifying the input for the algorithm, hence the name output_specs (output specifications). More information about OutputSpec.
\n\n
Relevant notes
\n\n
\n
Respect the supported write types and output formats.
\n
One of the most relevant options to specify in the options parameter is the checkpoint_location when in streaming read mode, because that location will be responsible for storing which data you already read and transformed from the source, when the source is a Spark Streaming compatible source (e.g., Kafka or S3 files).
\n
\n\n
Terminate Specifications
\n\n
The terminate_specs section of the ACON is responsible for some "wrapping up" activities like optimising a table, vacuuming old files in a delta table, etc. With time the list of available terminators will likely increase (e.g., reconciliation processes), but for now we have the following terminators. This stage is fully optional, you can omit it from the ACON. The most relevant now in the context of the lakehouse initiative are the following:
In the exec_env section of the ACON you can pass any Spark Session configuration that you want to define for the execution of your algorithm. This is basically just a JSON structure that takes in any Spark Session property, so no custom lakehouse engine logic. This stage is fully optional, you can omit it from the ACON.
\n\n
\n\n
Please be aware that Spark Session configurations that are not allowed to be changed when the Spark cluster is already running need to be passed in the configuration of the job/cluster that runs this algorithm, not here in this section. This section only accepts Spark Session configs that can be changed in runtime. Whenever you introduce an option make sure that it takes effect during runtime, as to the best of our knowledge there's no list of allowed Spark properties to be changed after the cluster is already running. Moreover, typically Spark algorithms fail if you try to modify a config that can only be set up before the cluster is running.
The ReadMode is PERMISSIVE in this scenario, which is the default in Spark, hence we don't need to specify it. Permissive means don't enforce any schema on the input data.
\n
From a JDBC source the ReadType always needs to be "batch", as "streaming" is not available for a JDBC source.
\n
In this scenario we do an append load by getting the max date (transformer_spec \"get_max_value\") on bronze and use that date to filter the source to only get data with a date greater than that max date on bronze (transformer_spec \"incremental_filter\"). That is the standard way we do incremental batch loads in the lakehouse engine. For streaming incremental loads we rely on Spark Streaming checkpoint feature (check a streaming append load ACON example).
This scenario is an append load enforcing the schema (using the schema of the target table to enforce the schema of the source, i.e., the schema of the source needs to exactly match the schema of the target table) and failing fast (FAILFAST) if the schema of the input data does not match the one we specified.
The ReadMode is FAILFAST in this scenario, i.e., the algorithm fails if the schema of the input data does not match the one we specified via the schema_path, read_schema_from_table or schema input_specs variables.
\n
In this scenario we do an append load by getting the max date (transformer_spec \"get_max_value\") on bronze and use that date to filter the source to only get data with a date greater than that max date on bronze (transformer_spec \"incremental_filter\"). That is the standard way we do incremental batch loads in the lakehouse engine. For streaming incremental loads we rely on Spark Streaming checkpoint feature (check a streaming append load ACON example).
Batch Delta Load Init, Delta and Backfill with Merge
\n\n
This scenario illustrates the process of implementing a delta load algorithm by first using an ACON to perform an initial load, then another one to perform the regular deltas that will be triggered on a recurrent basis, and finally an ACON for backfilling specific parcels if ever needed.
We can see that even though this is an init load we still have chosen to condense the records through our \"condense_record_mode_cdc\" transformer. This is a condensation step capable of handling SAP BW style changelogs based on actrequest_timestamps, datapakid, record_mode, etc...
\n
In the init load we actually did a merge in this case because we wanted to test locally whether a merge with an empty target table works, but you don't have to do it, as an init load can usually just be a full load. Whether merging init data into an empty table has any performance implications compared to a regular insert remains to be tested, but we don't have any reason to recommend a merge over an insert for an init load. As said, this was done solely for local testing purposes; you can just use write_type: "overwrite".
The merge predicate and the insert, delete or update predicates should reflect the reality of your data, and it's up to each data product to figure out which predicates better match their reality:
\n\n
\n
The merge predicate usually involves making sure that the \"primary key\" for your data matches.\n
\n
Performance Tip!!! Ideally, in order to get a performance boost in your merges, you should also place a filter in your merge predicate (e.g., certain technical or business date in the target table >= x days ago), based on the assumption that the rows in that specified interval will never change in the future. This can drastically decrease the merge times of big tables.
\n\n\n
\n\n
\n\n
The insert, delete and update predicates will always depend on the structure of your changelog, and also on how you expect your updates to arrive (e.g., in certain data products you know that you will never get out of order data or late arriving data, while in others you can never ensure that). These predicates should reflect that, in order to prevent you from making unwanted changes to the target delta lake table.
\n
For example, in this scenario, we delete rows that have the R, D or X record_mode values, because we know that if, after condensing the rows, that is the latest status of a row in the changelog, it should be deleted; and we never insert rows with those statuses (note: we use this guardrail in the insert to prevent out of order changes, which is likely not the case in SAP BW).
\n
Because the insert_predicate is fully optional, in your scenario you may not require that.
\n
\n\n
In this scenario, we don't pass an update_predicate in the ACON, because both insert_predicate and update_predicate are fully optional, i.e., if you don't pass them the algorithm will update any data that matches the merge_predicate and insert any data that does not match it. The predicates in these cases just make sure the algorithm does not insert or update any data that you don't want. For instance, in a late arriving changes scenario a deleted row may arrive from the changelog before the corresponding update row; to prevent your target table from having inconsistent data for a certain period of time (it will eventually get consistent once you receive the latest correct status from the changelog), you can have this guardrail in the insert or update predicates. Again, for most sources this will not happen, but sources like Kafka, for example, cannot 100% ensure order.
\n
In order to understand how we can cover different scenarios (e.g., late arriving changes, out of order changes, etc.), please go here.
The backfilling process depicted here is fairly similar to the init load, but it is relevant to highlight that it uses a static value (that can be modified according to the backfilling needs) in the incremental_filter function.
\n
Other relevant functions for backfilling may include the expression_filter function, where you can use a custom SQL filter to filter the input data.
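As an illustration, a hypothetical transformers entry for such a backfill could look like the sketch below (column names, values and the expression_filter parameter name are assumptions):

"transformers": [
    # backfill a fixed parcel by providing a static increment value
    {
        "function": "incremental_filter",
        "args": {"input_col": "actrequest_timestamp", "increment_value": "20240101000000"},
    },
    # or, alternatively, a custom SQL filter on the input data
    {
        "function": "expression_filter",
        "args": {"exp": "order_date >= '2024-01-01' and order_date < '2024-02-01'"},
    },
],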
There may be scenarios where the data product dev team faces the need to perform complex data transformations that are either not yet available in the lakehouse engine or whose logic is just too complex to chain in an ACON file. In the context of the lakehouse, the only layers that usually impose that complexity are silver+ and gold. This page targets exactly those cases.
\n\n
Below you'll find a notebook where you can pass your own PySpark or Spark SQL logic into the ACON, by dynamically injecting a python function into the ACON dictionary. The lakehouse engine will take care of executing those transformations in the transformation step of the data loader algorithm. Please read the notebook's comments carefully to understand how it works, or simply open it in your notebook environment, which will make the notebook's code and comments more readable.
\n\n
\n\n
Force Streaming Micro Batch Processing.
\n\n
When you use streaming mode with a custom transformer, it's highly advisable that you set the force_streaming_foreach_batch_processing flag to True in the transform specification, as explained above!
\n\n
\n\n
What is a custom transformer in the Lakehouse Engine and how you can use it to write your own pyspark logic?
\n\n
We highly promote the Lakehouse Engine for creating Data Products aligned with the data source (bronze/silver layer), pumping data into silver so our Data Scientists and Analysts can leverage the value of the data in silver, as close as it comes from the source.\nThe low-code and configuration-driven nature of the lakehouse engine makes it a compelling framework to use in such cases, where the transformations that are done from bronze to silver are not that many, as we want to keep the data close to the source.
\n\n
However, when it comes to Data Products enriched in some way or for insights (silver+, gold), they are typically heavy on transformations (they are the T of the overall ELT process), so the nature of the lakehouse engine might get in the way of adequately building them. Considering this, and considering our user base that prefers an ACON-based approach and all the nice off-the-shelf features of the lakehouse engine, we have developed a feature that allows us to pass custom transformers where you put your entire pyspark logic and pass it as an argument in the ACON (the configuration file that configures every lakehouse engine algorithm).
\n\n
Motivation:
\n\n
Doing that, you let the ACON guide your read, data quality, write and terminate processes, and you just focus on transforming data :)
\n\n
Custom transformation Function
\n\n
The function below is the one that encapsulates all your defined pyspark logic and sends it as a python function to the lakehouse engine. This function will then be invoked internally in the lakehouse engine via a df.transform() function. If you are interested in checking the internals of the lakehouse engine, our codebase is openly available here: https://github.com/adidas/lakehouse-engine
\n\n
\n\n
Attention!!!
\n\n
For this process to work, your function defined below needs to receive a DataFrame and return a DataFrame. Attempting any other method signature (e.g., defining more parameters) will not work, unless you use something like python partials, for example.
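For instance, a minimal sketch of the python partials idea mentioned above (the transformer and its extra parameter are hypothetical):

from functools import partial

from pyspark.sql import DataFrame


def filter_by_country(df: DataFrame, country: str) -> DataFrame:
    """Hypothetical custom transformer that needs an extra parameter."""
    return df.filter(df.country == country)


# bind the extra parameter so the resulting callable takes only a DataFrame again
my_custom_transformer = partial(filter_by_country, country="PT")
# my_custom_transformer can now be passed as the custom_transformer argument in the ACON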
\n\n
\n\n
\n
from lakehouse_engine.core.exec_env import ExecEnv
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, sum, when


def get_new_data(df: DataFrame) -> DataFrame:
    """Get the new data from the lakehouse engine reader and prepare it."""
    return (
        df.withColumn("amount", when(col("_change_type") == "delete", lit(0)).otherwise(col("amount")))
        .select("article_id", "order_date", "amount")
        .groupBy("article_id", "order_date")
        .agg(sum("amount").alias("amount"))
    )


def get_joined_data(new_data_df: DataFrame, current_data_df: DataFrame) -> DataFrame:
    """Join the new data with the current data already existing in the target dataset."""
    return (
        new_data_df.alias("new_data")
        .join(
            current_data_df.alias("current_data"),
            [
                new_data_df.article_id == current_data_df.article_id,
                new_data_df.order_date == current_data_df.order_date,
            ],
            "left_outer",
        )
        .withColumn(
            "current_amount", when(col("current_data.amount").isNull(), lit(0)).otherwise(col("current_data.amount"))
        )
        .withColumn("final_amount", col("current_amount") + col("new_data.amount"))
        .select(col("new_data.article_id"), col("new_data.order_date"), col("final_amount").alias("amount"))
    )


def calculate_kpi(df: DataFrame) -> DataFrame:
    """Calculate KPI through a custom transformer that will be provided in the ACON.

    Args:
        df: DataFrame passed as input.

    Returns:
        DataFrame: the transformed DataFrame.
    """
    new_data_df = get_new_data(df)

    # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because it is the internal object the
    # lakehouse engine uses to refer to the spark session. But if you use 'spark' it should also be fine.
    current_data_df = ExecEnv.SESSION.table("my_database.my_table")

    transformed_df = get_joined_data(new_data_df, current_data_df)

    return transformed_df
\n
\n\n
Don't like pyspark API? Write SQL
\n\n
You don't have to comply with the pyspark API if you prefer SQL. Inside the function above (or any of the auxiliary functions you decide to develop) you can write something like:
\n\n
\n
from lakehouse_engine.core.exec_env import ExecEnv
from pyspark.sql import DataFrame


def calculate_kpi(df: DataFrame) -> DataFrame:
    df.createOrReplaceTempView("new_data")

    # we prefer if you use 'ExecEnv.SESSION' instead of 'spark', because it is the internal object the
    # lakehouse engine uses to refer to the spark session. But if you use 'spark' it should also be fine.
    ExecEnv.SESSION.sql(
        """
        CREATE OR REPLACE TEMP VIEW my_kpi AS
        SELECT ... FROM new_data ...
        """
    )

    return ExecEnv.SESSION.table("my_kpi")
\n
\n\n
Just your regular ACON
\n\n
If you look at the ACON below, everything is the same as you would do in a Data Product, but the transform_specs section of the ACON has a difference: a function called "custom_transformation", to which we supply as argument the function defined above with the pyspark code.
\n\n
\n\n
Attention!!!
\n\n
Do not pass the function as calculate_kpi(), but as calculate_kpi, otherwise you are telling python to invoke the function right away, as opposed to passing it as an argument to be invoked later by the lakehouse engine.
\n\n
\n\n
\n
from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "sales",
            "read_type": "streaming",
            "data_format": "delta",
            "db_table": "my_database.dummy_sales",
            "options": {"readChangeFeed": "true"},
        }
    ],
    "transform_specs": [
        {
            "spec_id": "transformed_sales_kpi",
            "input_id": "sales",
            # because we are using streaming, this allows us to make sure that
            # all the computation in our custom transformer gets pushed to
            # Spark's foreachBatch method in a stream, which allows us to
            # run all Spark functions in a micro batch DataFrame, as there
            # are some Spark functions that are not supported in streaming.
            "force_streaming_foreach_batch_processing": True,
            "transformers": [
                {
                    "function": "custom_transformation",
                    "args": {"custom_transformer": calculate_kpi},
                },
            ],
        }
    ],
    "dq_specs": [
        {
            "spec_id": "my_table_quality",
            "input_id": "transformed_sales_kpi",
            "dq_type": "validator",
            "bucket": "my_dq_bucket",
            "data_docs_bucket": "my_data_product_bucket",
            "data_docs_prefix": "dq/my_data_product/data_docs/site/",
            "expectations_store_prefix": "dq/expectations/",
            "validations_store_prefix": "dq/validations/",
            "checkpoint_store_prefix": "dq/checkpoints/",
            "tbl_to_derive_pk": "my_table",
            "dq_functions": [
                {"function": "expect_column_values_to_not_be_null", "args": {"column": "article_id"}},
            ],
        },
    ],
    "output_specs": [
        {
            "spec_id": "sales_kpi",
            "input_id": "transformed_sales_kpi",
            "write_type": "merge",
            "data_format": "delta",
            "db_table": "my_database.my_table",
            "options": {
                "checkpointLocation": "s3://my_data_product_bucket/gold/my_table",
            },
            "merge_opts": {
                "merge_predicate": "new.article_id = current.article_id AND new.order_date = current.order_date"
            },
        }
    ],
}

load_data(acon=acon)
The SQL Custom Transformer executes a SQL transformation provided by the user. This transformer can be very useful whenever the user wants to perform SQL-based transformations that are not natively supported by the lakehouse engine transformers.
\n\n
The transformer receives the SQL query to be executed. This can read from any table or view from the catalog, or any dataframe registered as a temp view.
\n\n
\n
To register a dataframe as a temp view you can use the \"temp_view\" config in the input_specs, as shown below.
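A hypothetical sketch of that ACON fragment is shown below (spec ids, table names and the SQL itself are assumptions, and the transformer is assumed to be exposed as sql_transformation, matching the docstring earlier in this page):

"input_specs": [
    {
        "spec_id": "sales_source",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.dummy_sales",
        "temp_view": "sales",
    }
],
"transform_specs": [
    {
        "spec_id": "sales_aggregated",
        "input_id": "sales_source",
        "transformers": [
            {
                "function": "sql_transformation",
                "args": {"sql": "SELECT article_id, SUM(amount) AS amount FROM sales GROUP BY article_id"},
            }
        ],
    }
],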
A custom sap_b4 reader and a few utils are offered in the lakehouse-engine framework so that the consumption of data from SAP B4 DSOs can be easily created. The framework abstracts all the logic behind the init/delta extractions (AQ vs CL, active table, changelog table, requests status table, how to identify the next delta timestamp...), only requiring a few parameters that are explained and exemplified in the template scenarios that we have created.
\n\n
\n\n
This custom reader is very similar to and uses most features from the sap_bw reader, so if you were using specific filters/parameters with the sap_bw reader, there is a high chance you can keep using them in a very similar way with the sap_b4 reader. The main concepts apply to both readers, such as the strategies on how to parallelize the extractions, for example.
There are no limits imposed by the Lakehouse-Engine framework, but you need to consider that there might be differences imposed by the source.
\n\n
E.g. each user might be restricted to using about 100GB of memory at a time on the source.
\n\n
Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.
\n\n
\n\n
\n\n
In case you want to perform further filtering in the REQTSN field, please be aware that it is not being pushed down to SAP B4 by default (meaning it will have bad performance).
\n\n
In that case, you will need to use customSchema option while reading, so that you are able to enable filter push down for those.
\n\n
\n\n
You can check the code documentation of the reader below:
For extractions using the SAP B4 reader, you can use the arguments listed in the SAP B4 arguments, but also the ones listed in the JDBC extractions, as those are inherited as well.
\n\n
\n\n
Extraction from SAP B4 ADSOs Template
\n\n
This template covers the following scenarios of extractions from the SAP B4Hana ADSOs:
\n\n
\n
1 - The Simplest Scenario (Not parallel - Not Recommended)
\n
2 - Parallel extraction\n
\n
2.1 - Simplest Scenario
\n
2.2 - Provide upperBound (Recommended)
\n
2.3 - Automatic upperBound (Recommended)
\n
2.4 - Provide predicates (Recommended)
\n
2.5 - Generate predicates (Recommended)
\n
\n
\n\n
\n\n
Note: the template will cover two ADSO Types:
\n\n
\n
AQ: ADSO which is of append type and for which a single ADSO/table holds all the information, like an event table. For this type, the same ADSO is used for reading data both for the inits and deltas. Usually, these ADSOs end with the digit "6".

CL: ADSO which is split into two ADSOs, one holding the change log events, the other having the active data (current version of the truth for a particular source). For this type, the ADSO having the active data is used for the first extraction (init) and the change log ADSO is used for the subsequent extractions (deltas). Usually, these ADSOs are split into an active table ending with the digit "2" and a changelog table ending with the digit "3".
\n
\n\n
\n\n
For each of these ADSO types, the lakehouse-engine abstracts the logic to get the delta extractions. This logic basically consists of joining the db_table (for AQ) or the changelog_table (for CL) with the table having the requests status (my_database.requests_status_table). One of the fields used for this join is the data_target, which has a relationship with the ADSO (db_table/changelog_table), being basically the same identifier without considering parts of it.
\n\n
Based on the previous insights, the queries that the lakehouse-engine generates under the hood translate to (this is a simplified version, for more details please refer to the lakehouse-engine code documentation):

AQ Init Extraction:

SELECT t.*, CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM my_database.my_table t

AQ Delta Extraction:

SELECT tbl.*, CAST({self._B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM my_database.my_table AS tbl
JOIN my_database.requests_status_table AS req
WHERE STORAGE = 'AQ' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')
AND REQUEST_STATUS IN ('GG', 'GR') AND UPPER(DATATARGET) = UPPER('my_identifier')
AND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table

CL Init Extraction:

SELECT t.*,
    {self._SAP_B4_EXTRACTION.extraction_timestamp}000000000 AS reqtsn,
    '0' AS datapakid,
    0 AS record,
    CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM my_database.my_table_2 t

CL Delta Extraction:

SELECT tbl.*,
    CAST({self._SAP_B4_EXTRACTION.extraction_timestamp} AS DECIMAL(15,0)) AS extraction_start_timestamp
FROM my_database.my_table_3 AS tbl
JOIN my_database.requests_status_table AS req
WHERE STORAGE = 'AT' AND REQUEST_IS_IN_PROCESS = 'N' AND LAST_OPERATION_TYPE IN ('C', 'U')
AND REQUEST_STATUS IN ('GG') AND UPPER(DATATARGET) = UPPER('my_data_target')
AND req.REQUEST_TSN > max_timestamp_in_bronze AND req.REQUEST_TSN <= max_timestamp_in_requests_status_table
\n\n
\n\n
Introductory Notes: If you want to have a better understanding about JDBC Spark optimizations, here you have a few useful links:
1 - The Simplest Scenario (Not parallel - Not Recommended)
\n\n
This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source. It should only be used in case the ADSO you want to extract from SAP B4Hana is a small one, with no big requirements in terms of performance to fulfill. When extracting from the source ADSO, there are two options:

Delta Init - full extraction of the source ADSO. You should use it the first time you extract from the ADSO or any time you want to re-extract completely. Similar to a so-called full load.

Delta - extracts the portion of the data that is new or has changed in the source since the last extraction (using the max_timestamp value in the location of the data already extracted, latest_timestamp_data_location).
\n
\n\n
The example below is composed of two cells.

The first cell is only responsible for defining the variables extraction_type and write_type, depending on the extraction type: Delta Init (load_type = "init") or Delta (load_type = "delta"). The variables in this cell will also be referenced by other acons/examples in this notebook, similar to what you would do in your pipelines/jobs, defining this centrally and then re-using it.

The second cell is where the acon to be used is defined (using the two variables extraction_type and write_type) and the load_data algorithm is executed to perform the extraction.

There may be cases where you might want to always extract fully from the source ADSO. In these cases, you only need to use a Delta Init every time, meaning you would use "extraction_type": "init" and "write_type": "overwrite" as shown below. The explanation about what a Delta Init/Delta is applies to all the scenarios presented in this notebook.
In this section, 5 possible scenarios for parallel extractions from SAP B4Hana ADSOs are presented.
\n\n
2.1 - Parallel Extraction, Simplest Scenario
\n\n
This scenario provides the simplest example you can have for a parallel extraction from SAP B4Hana, only using the property numPartitions. The goal of the scenario is to cover the case in which people do not have much knowledge about how to optimize the extraction from JDBC sources or cannot identify a column that can be used to split the extraction into several tasks. This scenario can also be used if the use case does not have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of the extraction to its maximum potential.

In the example below, "numPartitions": 10 is specified, meaning that Spark will open 10 parallel connections to the source ADSO and automatically decide how to parallelize the extraction upon that requirement. This is the only change compared to the example provided in scenario 1.
2.2 - Parallel Extraction, Provide upper_bound (Recommended)
\n\n
This scenario performs the extraction from the SAP B4 ADSO in parallel, but is more concerned with trying to optimize and have more control (compared to the 2.1 example) on how the extraction is split and performed, using the following options:
\n\n
\n
numPartitions - number of Spark partitions to split the extraction.
\n
partitionColumn - column used to split the extraction. It must be a numeric, date, or timestamp. It should be a column that is able to split the extraction evenly in several tasks. An auto-increment column is usually a very good candidate.
\n
lowerBound - lower bound to decide the partition stride.
\n
upperBound - upper bound to decide the partition stride.
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the partitionColumn. If you compare it with the previous example, you'll notice that now numPartitions and three additional options are provided to fine tune the extraction (partitionColumn, lowerBound, upperBound).
\n\n
When these 4 properties are used, Spark will use them to build several queries to split the extraction.
\n\n
Example: for "numPartitions": 10, "partitionColumn": "record", "lowerBound": 1, "upperBound": 100, Spark will generate 10 queries like this:
\n\n
\n
SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL
\n
SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20
\n
SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30
This scenario is very similar to 2.2, the only difference being that upperBound is not provided. Instead, the property calculate_upper_bound set to true is used to benefit from the automatic calculation of the upperBound (derived from the partitionColumn) offered by the lakehouse-engine framework, which is useful, as in most cases you will probably not be aware of the max value of the column. The only thing you need to consider is that if you use this automatic calculation of the upperBound you will be doing an initial query to the SAP B4 ADSO to retrieve the max value for the partitionColumn, before doing the actual query to perform the extraction.
2.4 - Parallel Extraction, Provide Predicates (Recommended)
\n\n
This scenario performs the extraction from the SAP B4 ADSO in parallel, which is useful in contexts in which there is no numeric, date or timestamp column to parallelize the extraction (e.g. when extracting from an ADSO of type CL, the active table does not have the RECORD column, which is usually a good option for scenarios 2.2 and 2.3):
\n\n
\n
partitionColumn - column used to split the extraction. It can be of any type.
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the partitionColumn, especially if those columns do not comply with scenario 2.2 or 2.3.
\n\n
When this property is used all predicates need to be provided to Spark, otherwise it will leave data behind.
\n\n
Below, the lakehouse function to automatically generate the predicates list is presented.
\n\n
This function needs to be used carefully, especially regarding the predicates_query and predicates_add_null variables.
\n\n
predicates_query: In the sample below the whole table is being considered (select distinct(x) from table), but it is possible to filter the predicates list here, especially if you are applying filters in the transformation specs and you know the entire table won't be necessary. In that case you can change it to something like: select distinct(x) from table where x > y.

predicates_add_null: You can decide whether you want to consider null in the predicates list or not; by default this property is True.
\n\n
Example: for \"partition_column\": \"CALMONTH\"
\n\n
\n
from lakehouse_engine.engine import load_data

LOAD_TYPE = "INIT" or "DELTA"

if LOAD_TYPE == "INIT":
    extraction_type = "init"
    write_type = "overwrite"
else:
    extraction_type = "delta"
    write_type = "append"

# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers
# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
    JDBCExtraction,
    JDBCExtractionUtils,
)

ExecEnv.get_or_create()

partition_column = "CALMONTH"
dbtable = "my_database.my_table_3"

predicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""
user = "my_user"
password = "my_b4_hana_pwd"
url = "my_sap_b4_url"
predicates_add_null = True

jdbc_util = JDBCExtractionUtils(
    JDBCExtraction(
        user=user,
        password=password,
        url=url,
        predicates_add_null=predicates_add_null,
        partition_column=partition_column,
        dbtable=dbtable,
    )
)

predicates = jdbc_util.get_predicates(predicates_query)

acon = {
    "input_specs": [
        {
            "spec_id": "my_identifier_2_source",
            "read_type": "batch",
            "data_format": "sap_b4",
            "options": {
                "url": "my_sap_b4_url",
                "user": "my_user",
                "password": "my_b4_hana_pwd",
                "driver": "com.sap.db.jdbc.Driver",
                "dbtable": "my_database.my_table_2",
                "changelog_table": "my_database.my_table_3",
                "extraction_type": extraction_type,
                "latest_timestamp_data_location": "s3://my_path/my_identifier_2_prov_predicates/",
                "adso_type": "CL",
                "predicates": predicates,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_identifier_2_bronze",
            "input_id": "my_identifier_2_source",
            "write_type": write_type,
            "data_format": "delta",
            "partitions": ["REQTSN"],
            "location": "s3://my_path/my_identifier_2_prov_predicates/",
        }
    ],
    "exec_env": {
        "spark.databricks.delta.schema.autoMerge.enabled": True,
        "spark.databricks.delta.optimizeWrite.enabled": True,
        "spark.databricks.delta.autoCompact.enabled": True,
    },
}

load_data(acon=acon)
\n
\n\n
2.5 - Parallel Extraction, Generate Predicates
\n\n
This scenario is very similar to the scenario 2.4, with the only difference that it automatically\ngenerates the predicates (\"generate_predicates\": True).
\n\n
This is an adequate example for you to follow if you have/know a column in the ADSO that is good to be used as the partitionColumn, especially if those columns do not comply with scenarios 2.2 and 2.3 (otherwise those would probably be recommended).

When this property is used, the lakehouse engine will generate the predicates to be used to extract data from the source. What the lakehouse engine does is to check, for the init/delta portion of the data, what are the distinct values of the partitionColumn serving that data. Then, these values will be used by Spark to generate several queries to extract from the source in a parallel fashion. Each distinct value of the partitionColumn will be a query, meaning that you will not have control over the number of partitions used for the extraction. For example, if you face a scenario in which you are using a partitionColumn LOAD_DATE and, for today's delta, all the data (let's suppose 2 million rows) is served by a single LOAD_DATE = 20200101, that would mean Spark would use a single partition to extract everything. In this extreme case you would probably need to change your partitionColumn. Note: these extreme cases are harder to happen when you use the strategy of scenarios 2.2/2.3.

Example: for "partitionColumn": "record", generate predicates:
\n\n
\n
SELECT DISTINCT(RECORD) as RECORD FROM dummy_table
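The result of that query is then turned into one predicate per distinct value, roughly with the shape sketched below (illustrative only; the actual list is produced by the engine's predicate generation):

predicates = [
    "RECORD = 1",
    "RECORD = 2",
    "RECORD = 3",
    # ... one entry per distinct value returned by the query ...
    "RECORD IS NULL",  # added when predicates_add_null is True
]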
Parallel extractions can bring a jdbc source down if a lot of stress is put on the system. Be careful choosing the number of partitions. Spark is a distributed system and can lead to many connections.
\n\n
\n\n
A custom sap_bw reader and a few utils are offered in the lakehouse-engine framework so that the consumption of data from SAP BW DSOs can be easily created. The framework abstracts all the logic behind the init/delta extractions (active table, changelog table, activation requests table, how to identify the next delta timestamp...), only requiring a few parameters that are explained and exemplified in the template scenarios that we have created.

For extractions using the SAP BW reader, you can use the arguments listed in the SAP BW arguments, but also the ones listed in the JDBC extractions, as those are inherited as well.
\n\n
\n\n
Extraction from SAP-BW template
\n\n
This template covers the following scenarios of extractions from the SAP BW DSOs:
\n\n
\n
1 - The Simplest Scenario (Not parallel - Not Recommended)
\n
2 - Parallel extraction\n
\n
2.1 - Simplest Scenario
\n
2.2 - Provide upperBound (Recommended)
\n
2.3 - Automatic upperBound (Recommended)
\n
2.4 - Backfilling
\n
2.5 - Provide predicates (Recommended)
\n
2.6 - Generate predicates (Recommended)
\n
\n
3 - Extraction from Write Optimized DSO\n
\n
3.1 - Get initial actrequest_timestamp from Activation Requests Table
\n
\n
\n\n
\n\n
Introductory Notes: If you want to have a better understanding about JDBC Spark optimizations, here you have a few useful links:
1 - The Simplest Scenario (Not parallel - Not Recommended)
\n\n
This scenario is the simplest one, not taking any advantage of Spark JDBC optimisation techniques \nand using a single connection to retrieve all the data from the source. It should only be used in case the DSO \nyou want to extract from SAP BW is a small one, with no big requirements in terms of performance to fulfill.\nWhen extracting from the source DSO, there are two options:
\n\n
\n
Delta Init - full extraction of the source DSO. You should use it the first time you extract from the DSO or any time you want to re-extract completely. Similar to a so-called full load.
\n
Delta - extracts the portion of the data that is new or has changed in the source, since the last\nextraction (using the max actrequest_timestamp value in the location of the data already extracted,\nby default).
\n
\n\n
The example below is composed of two cells.
\n\n
\n
The first cell is only responsible for defining the variables extraction_type and write_type, depending on whether the extraction type is a Delta Init (LOAD_TYPE = INIT) or a Delta (LOAD_TYPE = DELTA). The variables in this cell will also be referenced by other acons/examples in this notebook, similar to what you would do in your pipelines/jobs, defining this centrally and then re-using it.
\n
The second cell is where the acon to be used is defined (which uses the two variables extraction_type and\nwrite_type defined) and the load_data algorithm is executed to perform the extraction.
\n
\n\n
\n\n
There may be cases where you might want to always extract fully from the source DSO. In these cases, you only need to use a Delta Init every time, meaning you would use "extraction_type": "init" and "write_type": "overwrite" as shown below. The explanation of what a Delta Init/Delta is applies to all the scenarios presented in this notebook.
\n\n
\n\n
\n
from lakehouse_engine.engine import load_data

LOAD_TYPE = "INIT" or "DELTA"

if LOAD_TYPE == "INIT":
    extraction_type = "init"
    write_type = "overwrite"
else:
    extraction_type = "delta"
    write_type = "append"

acon = {
    "input_specs": [
        {
            "spec_id": "my_identifier_source",
            "read_type": "batch",
            # You should use this custom reader to benefit from the lakehouse-engine utils for extractions from SAP BW
            "data_format": "sap_bw",
            "options": {
                "user": "my_user",
                "password": "my_hana_pwd",
                "url": "my_sap_bw_url",
                "dbtable": "my_database.my_table",
                "odsobject": "my_ods_object",
                "changelog_table": "my_database.my_changelog_table",
                "latest_timestamp_data_location": "s3://my_path/my_identifier/",
                "extraction_type": extraction_type,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_identifier_bronze",
            "input_id": "my_identifier_source",
            "write_type": write_type,
            "data_format": "delta",
            "partitions": ["actrequest_timestamp"],
            "location": "s3://my_path/my_identifier/",
        }
    ],
    "exec_env": {
        "spark.databricks.delta.schema.autoMerge.enabled": True,
        "spark.databricks.delta.optimizeWrite.enabled": True,
        "spark.databricks.delta.autoCompact.enabled": True,
    },
}

load_data(acon=acon)
\n
\n\n
2 - Parallel extraction
\n\n
In this section, 6 possible scenarios for parallel extractions from SAP BW DSOs are presented.
\n\n
2.1 - Parallel Extraction, Simplest Scenario
\n\n
This scenario provides the simplest example you can have for a parallel extraction from SAP BW, only using the property numPartitions. The goal of the scenario is to cover the case in which people do not have much knowledge about how to optimize the extraction from JDBC sources or cannot identify a column that can be used to split the extraction in several tasks. This scenario can also be used if the use case does not have big performance requirements/concerns, meaning you do not feel the need to optimize the performance of the extraction to its maximum potential. In the example below, "numPartitions": 10 is specified, meaning that Spark will open 10 parallel connections to the source DSO and automatically decide how to parallelize the extraction upon that requirement. This is the only change compared to the example provided in scenario 1.
2.2 - Parallel Extraction, Provide upper_bound (Recommended)
\n\n
This scenario performs the extraction from the SAP BW DSO in parallel, but is more concerned with trying to\noptimize and have more control (compared to 2.1 example) on how the extraction is split and performed, using\nthe following options:
\n\n
\n
numPartitions - number of Spark partitions to split the extraction.
\n
partitionColumn - column used to split the extraction. It must be a numeric, date, or timestamp.\nIt should be a column that is able to split the extraction evenly in several tasks. An auto-increment\ncolumn is usually a very good candidate.
\n
lowerBound - lower bound to decide the partition stride.
\n
upperBound - upper bound to decide the partition stride. It can either be provided (as it is done in\nthis example) or derived automatically by our upperBound optimizer (example 2.3).
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the partitionColumn. If you compare with the previous example, you'll notice that now numPartitions and three additional options are provided to fine-tune the extraction (partitionColumn, lowerBound, upperBound).
\n\n
When these 4 properties are used, Spark will use them to build several queries to split the extraction.
\n\n
Example: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound: 1\", \"upperBound: 100\",\nSpark will generate 10 queries like this:
\n\n
\n
SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL
\n
SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20
\n
SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30
2.3 - Parallel Extraction, Automatic upper_bound (Recommended)

This scenario is very similar to 2.2, the only difference being that upper_bound is not provided. Instead, the property calculate_upper_bound set to true is used to benefit from the automatic calculation of the upperBound (derived from the partitionColumn) offered by the lakehouse-engine framework, which is useful, as in most cases you will probably not be aware of the max value for the column. The only thing you need to consider is that if you use this automatic calculation of the upperBound you will be doing an initial query to the SAP BW DSO to retrieve the max value for the partitionColumn, before doing the actual query to perform the extraction.
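As a rough sketch (option keys are illustrative and should be checked against the full template), the input spec options for this scenario would look like the previous one, but with calculate_upper_bound replacing the explicit upperBound:

# Hypothetical input spec options for scenario 2.3 (automatic upperBound).
input_spec_options = {
    "user": "my_user",
    "password": "my_hana_pwd",
    "url": "my_sap_bw_url",
    "dbtable": "my_database.my_table",
    "odsobject": "my_ods_object",
    "latest_timestamp_data_location": "s3://my_path/my_identifier/",
    "extraction_type": "delta",
    "numPartitions": 10,
    "partitionColumn": "RECORD",
    "lowerBound": 1,
    # instead of providing "upperBound", let the engine derive it from the
    # max value of the partitionColumn (at the cost of one extra query):
    "calculate_upper_bound": True,
}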
2.4 - Parallel Extraction, Backfilling

This scenario covers the case in which you might want to backfill the data extracted from a SAP BW DSO and made available in the bronze layer. By default, the delta extraction considers the max value of the column actrequest_timestamp on the data already extracted. However, there might be cases in which you want to extract a delta from a particular timestamp onwards or for a particular interval of time. For this, you can use the properties min_timestamp and max_timestamp.
\n\n
Below, a very similar example to the previous one is provided, the only differences being that the properties "min_timestamp": "20210910000000" and "max_timestamp": "20210913235959" are provided, meaning it will extract the data from the changelog table using the filter "20210910000000" < actrequest_timestamp <= "20210913235959", ignoring whether some of the data is already available in the destination or not. Moreover, note that the property latest_timestamp_data_location does not need to be provided, as the timestamps to be considered are being directly provided (if both the timestamps and the latest_timestamp_data_location are provided, the latter will have no effect). Additionally, "extraction_type": "delta" and "write_type": "append" are forced, instead of using the variables as in the other examples, because the backfilling scenario only makes sense for delta extractions.
\n\n
\n\n
Note: be aware that the backfilling example being shown has no mechanism to enforce that\nyou don't generate duplicated data in bronze. For your scenarios, you can either use this example and solve\nany duplication in the silver layer or extract the delta with a merge strategy while writing to bronze,\ninstead of appending.
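A minimal sketch of what changes in the acon for this backfilling scenario (the remaining structure is the same as in the previous examples; option keys are illustrative):

# Hypothetical input spec options for a backfilling extraction (scenario 2.4).
backfill_options = {
    "user": "my_user",
    "password": "my_hana_pwd",
    "url": "my_sap_bw_url",
    "dbtable": "my_database.my_table",
    "odsobject": "my_ods_object",
    # backfill a specific actrequest_timestamp interval instead of relying
    # on latest_timestamp_data_location:
    "min_timestamp": "20210910000000",
    "max_timestamp": "20210913235959",
    # backfilling only makes sense for delta extractions:
    "extraction_type": "delta",
}
# In the output spec, "write_type": "append" is used (or a merge strategy,
# if you want to avoid duplicated data in bronze).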
2.5 - Parallel Extraction, Provide Predicates (Recommended)
\n\n
This scenario performs the extraction from SAP BW DSO in parallel, useful in contexts in which there is no\nnumeric, date or timestamp column to parallelize the extraction:
\n\n
\n
partitionColumn - column used to split the extraction. It can be of any type.
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as\nthe partitionColumn, specially if these columns are not complying with the scenarios 2.2 and 2.3 (otherwise\nthose would probably be recommended).
\n\n
When this property is used, all predicates need to be provided to Spark, otherwise data will be left behind.
\n\n
Below, the lakehouse-engine function to automatically generate the predicate list is presented.
\n\n
This function needs to be used carefully, especially regarding the predicates_query and predicates_add_null variables.
\n\n
predicates_query: in the sample below the whole table is considered (select distinct(x) from table), but it is possible to filter the predicates list here, especially if you are applying filters in the transformations spec and you know the entire table won't be necessary. In that case, you can change it to something like: select distinct(x) from table where x > y.
\n\n
predicates_add_null: you can decide whether to include null in the predicates list or not; by default, this property is True.
\n\n
\n
from lakehouse_engine.engine import load_data

LOAD_TYPE = "INIT" or "DELTA"

if LOAD_TYPE == "INIT":
    extraction_type = "init"
    write_type = "overwrite"
else:
    extraction_type = "delta"
    write_type = "append"

# import the lakehouse_engine ExecEnv class, so that you can use the functions it offers
# import the lakehouse_engine extraction utils, so that you can use the JDBCExtractionUtils offered functions
from lakehouse_engine.core.exec_env import ExecEnv
from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
    JDBCExtraction,
    JDBCExtractionUtils,
)

ExecEnv.get_or_create()

partition_column = "my_partition_column"
dbtable = "my_database.my_table"

predicates_query = f"""(SELECT DISTINCT({partition_column}) FROM {dbtable})"""
column_for_predicates = partition_column
user = "my_user"
password = "my_hana_pwd"
url = "my_bw_url"
predicates_add_null = True

jdbc_util = JDBCExtractionUtils(
    JDBCExtraction(
        user=user,
        password=password,
        url=url,
        dbtable=dbtable,
        partition_column=partition_column,
    )
)

predicates = jdbc_util.get_predicates(predicates_query)

acon = {
    "input_specs": [
        {
            "spec_id": "my_identifier_source",
            "read_type": "batch",
            "data_format": "sap_bw",
            "options": {
                "user": "my_user",
                "password": "my_hana_pwd",
                "url": "my_sap_bw_url",
                "dbtable": "my_database.my_table",
                "odsobject": "my_ods_object",
                "latest_timestamp_data_location": "s3://my_path/my_identifier/",
                "extraction_type": extraction_type,
                "predicates": predicates,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_identifier_bronze",
            "input_id": "my_identifier_source",
            "write_type": write_type,
            "data_format": "delta",
            "partitions": ["actrequest_timestamp"],
            "location": "s3://my_path/my_identifier/",
        }
    ],
    "exec_env": {
        "spark.databricks.delta.schema.autoMerge.enabled": True,
        "spark.databricks.delta.optimizeWrite.enabled": True,
        "spark.databricks.delta.autoCompact.enabled": True,
    },
}

load_data(acon=acon)
2.6 - Parallel Extraction, Generate Predicates (Recommended)

This scenario performs the extraction from the SAP BW DSO in parallel and is useful in contexts in which there is no numeric, date or timestamp column to parallelize the extraction:
\n\n
\n
partitionColumn - column used to split the extraction. It can be of any type.
\n
\n\n
This is an adequate example for you to follow if you have/know a column in the DSO that is good to be used as the partitionColumn, especially if that column does not comply with scenarios 2.2 and 2.3 (otherwise those would probably be recommended).
\n\n
When this property is used, the lakehouse engine will generate the predicates to be used to extract data from the source. What the lakehouse engine does is check, for the init/delta portion of the data, the distinct values of the partitionColumn serving that data. Then, these values will be used by Spark to generate several queries to extract from the source in a parallel fashion. Each distinct value of the partitionColumn will be a query, meaning that you will not have control over the number of partitions used for the extraction. For example, if you face a scenario in which you are using a partitionColumn LOAD_DATE and, for today's delta, all the data (let's suppose 2 million rows) is served by a single LOAD_DATE = 20200101, that would mean Spark would use a single partition to extract everything. In this extreme case you would probably need to change your partitionColumn. Note: these extreme cases are less likely to happen when you use the strategy of scenarios 2.2/2.3.
\n\n
Example: for \"partitionColumn\": \"record\"\nGenerate predicates:
\n\n
\n
SELECT DISTINCT(RECORD) as RECORD FROM dummy_table
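A minimal sketch of the input spec options for this scenario (option keys are illustrative; compared to scenario 2.5, you no longer compute the predicates yourself):

# Hypothetical input spec options for scenario 2.6 (generate predicates).
generate_predicates_options = {
    "user": "my_user",
    "password": "my_hana_pwd",
    "url": "my_sap_bw_url",
    "dbtable": "my_database.my_table",
    "odsobject": "my_ods_object",
    "latest_timestamp_data_location": "s3://my_path/my_identifier/",
    "extraction_type": "delta",
    # column whose distinct values will drive the generated predicates:
    "partitionColumn": "RECORD",
    "generate_predicates": True,
}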
3 - Extraction from Write Optimized DSOs

This scenario is based on the best practices of scenario 2.2, but it is ready to extract data from Write Optimized DSOs, which have the changelog embedded in the active table, instead of having a separate changelog table. For this reason, you need to specify that the changelog_table parameter value is equal to the dbtable parameter value. Moreover, these tables usually already include the changelog technical columns like RECORD and DATAPAKID, for example, which the framework adds by default. Thus, you need to specify "include_changelog_tech_cols": False to change this behaviour. Finally, you also need to specify the name of the column in the table that can be used to join with the activation requests table to get the timestamp of the several requests/deltas, which is "actrequest" by default ("request_col_name": 'request').
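A minimal sketch of the options that change for a Write Optimized DSO, on top of the parallel extraction options of scenario 2.2 (option keys are illustrative):

# Hypothetical input spec options for scenario 3 (Write Optimized DSO).
wodso_options = {
    "user": "my_user",
    "password": "my_hana_pwd",
    "url": "my_sap_bw_url",
    # the changelog is embedded in the active table, so both point to it:
    "dbtable": "my_database.my_wodso_table",
    "changelog_table": "my_database.my_wodso_table",
    "odsobject": "my_ods_object",
    # the table already has RECORD, DATAPAKID, ... so don't add them again:
    "include_changelog_tech_cols": False,
    # column used to join with the activation requests table:
    "request_col_name": "request",
    "extraction_type": "delta",
}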
3.1 - Extraction from Write Optimized DSOs, Get ACTREQUEST_TIMESTAMP from Activation Requests Table
\n\n
By default, the act_request_timestamp has been hardcoded (it either assumes a given extraction_timestamp or the current timestamp) in the init extraction; however, this may cause problems when merging changes in silver for write optimised DSOs. So, the possibility to retrieve this timestamp from the act_req_table was added.
\n\n
This scenario performs the data extraction from Write Optimized DSOs, forcing the actrequest_timestamp to\nassume the value from the activation requests table (timestamp column).
\n\n
This feature is only available for WODSOs and to use it you need to specify \"get_timestamp_from_actrequest\": True.
One of the most important parameters to optimise the extraction is the partitionColumn, as you can see in the template. Thus, this section helps you figure out if a column is a good candidate or not.
\n\n
Basically the partition column needs to be a column which is able to adequately split the processing, which means we can use it to \"create\" different queries with intervals/filters, so that the Spark tasks process similar amounts of rows/volume. Usually a good candidate is an integer auto-increment technical column.
\n\n
\n\n
Although RECORD is usually a good candidate, it is usually available on the changelog table only, meaning that you would need to use a different strategy for the init. In case you don't have good candidates for partitionColumn, you can use the sample acon provided in scenario 2.1 in the template above. It might make sense to use scenario 2.1 for the init and then scenario 2.2 or 2.3 for the subsequent deltas.
\n\n
\n\n
When there is no int, date or timestamp good candidate for partitionColumn:
\n\n
In this case you can opt for the scenario 2.6 - Generate Predicates, which supports any kind of column to be defined as partitionColumn.
\n\n
However, you should still analyse whether the column you are thinking about is a good candidate or not. In this scenario, Spark will create one query per distinct value of the partitionColumn, so you can perform some analysis on the cardinality and distribution of that column.
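As a sketch of that analysis (assuming a Spark session named spark, e.g., on Databricks, and hypothetical table/column names):

# How many parallel queries would Generate Predicates create?
spark.sql(
    "SELECT COUNT(DISTINCT my_candidate_column) AS distinct_values "
    "FROM my_database.my_table"
).show()

# How evenly would the rows be distributed across those queries?
spark.sql(
    "SELECT my_candidate_column, COUNT(*) AS row_count "
    "FROM my_database.my_table "
    "GROUP BY my_candidate_column "
    "ORDER BY row_count DESC"
).show()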
Secure File Transfer Protocol (SFTP) is a network protocol for securely transferring files over SSH.
\n\n
This feature is available in the Lakehouse Engine with the purpose of having a mechanism to read data directly from SFTP directories without moving those files manually/physically to an S3 bucket.
\n\n
The engine uses Pandas to read the files and converts them into a Spark dataframe, which makes the usual acon resources available, such as dq_specs, output_specs, terminator_specs and transform_specs.
\n\n
Furthermore, this feature provides several filters on the directories that make it easier to control the extractions.
\n\n
Introductory Notes:
\n\n
There are important parameters that must be added to input specs in order to make the SFTP extraction work properly:
\n\n
\n\n
read_type - The engine supports only batch mode for this feature.
\n\n
\n\n
sftp_files_format - File format that will be used to read data from SFTP. The engine supports: CSV, FWF, JSON and XML.
\n\n
location - The SFTP directory to be extracted. If it is necessary to filter a specific file, it can be made using the file_name_contains option.
\n\n
options - Arguments used to set the Paramiko SSH client connection (hostname, username, password, port...), set the filter to retrieve files and set the file parameters (separators, headers, cols...). For more information about the file parameters, please go to the Pandas link in the useful links section.
\n\n
The options allowed are:
\n\n
\n\n
\n
| Property type | Detail | Example | Comment |
|---|---|---|---|
| Connection | add_auto_policy (str) | true or false | Indicates whether to allow an SFTP connection using no host key. When a connection attempt is made using no host key and add_auto_policy is false, the engine will throw an exception. The purpose of this flag is to make the user consciously choose a less secure connection. |
| Connection | key_type (str) | "Ed25519" or "RSA" | Indicates the key type to be used for the connection (e.g., Ed25519, RSA). |
| Connection | key_filename (str) | "/path/to/private_key/private_key.ppk" | The filename, or list of filenames, of optional private key(s) and/or certs to try for authentication. It must be used with a pkey in order to add a policy. If a pkey is not provided, then use add_auto_policy. |
| Connection | pkey (str) | host key value | Value to use for the host key when connecting to the remote SFTP server. |
| Filter | date_time_gt (str) | "1900-01-01" or "1900-01-01 08:59:59" | Filter the files greater than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS". |
| Filter | date_time_lt (str) | "3999-12-31" or "3999-12-31 20:59:59" | Filter the files lower than the string datetime formatted as "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS". |
| Filter | earliest_file (bool) | true or false | Filter the earliest dated file in the directory. |
| Filter | file_name_contains (str) | "part_of_filename" | Filter files that match the pattern. |
| Filter | latest_file (bool) | true or false | Filter the most recent dated file in the directory. |
| Read data from subdirectories | sub_dir (bool) | true or false | The engine will search for files in subdirectories of the location. It will consider one level below the root location given. When sub_dir is used with the latest_file/earliest_file argument, the engine will retrieve the latest/earliest file for each subdirectory. |
| Add metadata info | file_metadata (bool) | true or false | When this option is set to True, the dataframe retrieves the filename with location and the modification_time from the original files in SFTP. It attaches these two columns, adding the information to the respective records. |
The scenario below shows the extraction of a CSV file using most of the available filter options. Also, as an example, the column "created_on" is created in the transform_specs in order to store the processing date for every record. As a result, the output table will have both the original file date (provided by the option file_metadata) and the processing date from the engine.
\n\n
For an incremental load approach, it is advised to use the \"modification_time\" column created by the option file_metadata. Since it has the original file date of modification, this date can be used in the logic to control what is new and has been changed recently.
\n\n
\n\n
The scenario below uses "add_auto_policy": true, which is not recommended.
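A minimal sketch of such an acon (connection values are placeholders; pandas-related parsing options are omitted, and the exact placement of sftp_files_format should be checked against the SFTP reader documentation):

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "my_sftp_source",
            "read_type": "batch",
            "data_format": "sftp",
            "sftp_files_format": "csv",
            "location": "my_sftp_directory/",
            "options": {
                "hostname": "my_sftp_host",
                "username": "my_user",
                "password": "my_password",
                "port": 22,
                "add_auto_policy": True,  # not recommended, see note above
                "file_name_contains": "part_of_filename",
                "date_time_gt": "1900-01-01",
                "file_metadata": True,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_sftp_bronze",
            "input_id": "my_sftp_source",
            "write_type": "append",
            "data_format": "delta",
            "location": "s3://my_path/my_identifier/",
        }
    ],
}

load_data(acon=acon)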
The following scenario shows the extraction of a JSON file using an RSA pkey authentication instead of add_auto_policy. The engine supports Ed25519Key and RSA for pkeys.
\n\n
For the pkey file location, it is important to have the file in a location accessible by the cluster. This can be achieved either by mounting the location or with volumes.
\n\n
\n\n
This scenario uses a more secure authentication, thus it is the recommended option over the previous scenario.
Parallel extractions can bring a JDBC source down if too much stress is put on the system. Be careful when choosing the number of partitions: Spark is a distributed system and can open many connections to the source.
\n\n
\n\n
Introduction
\n\n
Many databases allow a JDBC connection to extract data. Our engine has one reader where you can configure all the necessary definitions to connect to a database using JDBC.
\n\n
In the next section you will find several examples about how to do it.
\n\n
The Simplest Scenario using sqlite
\n\n
\n\n
Not parallel - Recommended for smaller datasets only, or when stressing the source system is a high concern
\n\n
\n\n
This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source.
\n\n
Here we use a sqlite database where any connection is allowed. Due to that, we do not specify any username or password.
\n\n
As with Spark, we provide two different ways to run the JDBC reader.
\n\n
1 - We can use the jdbc() function, passing all the arguments Spark needs inside it, and we can even combine this with additional options passed through .options().
\n\n
2 - The other way is using .format("jdbc") and passing all necessary arguments through .options(). It's important to note that when choosing jdbc() we can also add options() to the execution.
\n\n
You can find and run the following code in our local test for the engine.
\n\n
jdbc() function
\n\n
As we can see in the next cell, all the arguments necessary to establish the jdbc connection are passed inside the jdbc_args object. Here we find the url, the table, and the driver. Besides that, we can add options, such as the partition number. The partition number will impact the queries' parallelism.
\n\n
The code below is an example of how to use the jdbc() function in our ACON. As for other cases, the acon configuration should be executed with load_data using:
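A minimal sketch of such an acon (a sqlite database from the local tests is assumed, so no user/password; paths and table names are placeholders, and jdbc_args mirrors Spark's DataFrameReader.jdbc arguments):

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "my_jdbc_source",
            "read_type": "batch",
            "data_format": "jdbc",
            "jdbc_args": {
                "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/my_db.db",
                "table": "my_table",
                "properties": {"driver": "org.sqlite.JDBC"},
            },
            # options can still be combined with jdbc():
            "options": {"numPartitions": 1},
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_jdbc_bronze",
            "input_id": "my_jdbc_source",
            "write_type": "overwrite",
            "data_format": "delta",
            "location": "file:///app/tests/lakehouse/out/feature/my_identifier/data",
        }
    ],
}

load_data(acon=acon)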
.format("jdbc")

In this example we do not use the jdbc_args object. All the jdbc connection parameters are inside the options dictionary. As for other cases, the acon configuration should be executed with load_data using:
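A minimal sketch of the same extraction passing everything through options (values are placeholders):

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "my_jdbc_source",
            "read_type": "batch",
            "data_format": "jdbc",
            "options": {
                "url": "jdbc:sqlite:/app/tests/lakehouse/in/feature/my_db.db",
                "dbtable": "my_table",
                "driver": "org.sqlite.JDBC",
                "numPartitions": 1,
            },
        }
    ],
    "output_specs": [
        {
            "spec_id": "my_jdbc_bronze",
            "input_id": "my_jdbc_source",
            "write_type": "overwrite",
            "data_format": "delta",
            "location": "file:///app/tests/lakehouse/out/feature/my_identifier/data",
        }
    ],
}

load_data(acon=acon)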
In this template we will use SAP as the basis for a more complete and runnable example. These definitions can be used with several databases that allow a JDBC connection.
\n\n
The following scenarios of extractions are covered:
\n\n
\n
1 - The Simplest Scenario (Not parallel - Recommended for smaller datasets only,\nor when stressing the source system is a high concern)
\n
2 - Parallel extraction \n
\n
2.1 - Simplest Scenario
\n
2.2 - Provide upperBound (Recommended)
\n
2.3 - Provide predicates (Recommended)
\n
\n
\n\n
\n\n
Disclaimer: this template only uses SAP as a demonstration example for a JDBC connection. This isn't a SAP template! If you are looking to extract data from SAP, please use our sap_b4 reader or the sap_bw reader.
\n\n
\n\n
The JDBC connection has 2 main sections to be filled, the jdbc_args and options:
\n\n
\n
jdbc_args - Here you need to fill everything related to jdbc connection itself, like table/query, url, user,\n..., password.
\n
options - This section is more flexible, and you can provide additional options like "fetchSize", "batchSize", "numPartitions", ..., "upperBound" and "lowerBound".
\n
\n\n
If you want to know more regarding jdbc spark options you can follow the link below:
1 - The Simplest Scenario (Not parallel - Recommended for smaller datasets, or for not stressing the source)
\n\n
This scenario is the simplest one we can have, not taking any advantage of Spark JDBC optimisation techniques and using a single connection to retrieve all the data from the source. It should only be used if the dataset you want to extract is small and there are no demanding performance requirements to fulfill. When extracting from the source, we can have two options:
\n\n
\n
Delta Init - full extraction of the source. You should use it the first time you extract from the source or any time you want to re-extract completely. Similar to a so-called full load.
\n
Delta - extracts the portion of the data that is new or has changed in the source, since the last\nextraction (for that, the logic at the transformation step needs to be applied). On the examples below,\nthe logic using REQTSN column is applied, which means that the maximum value on bronze is filtered\nand its value is used to filter incoming data from the data source.
2 - Parallel extraction

In this section we present 3 possible scenarios for parallel extractions from JDBC sources.
\n\n
\n\n
Disclaimer for parallel extraction: Parallel extractions can bring a jdbc source down if a lot of stress\nis put on the system. Be careful when choosing the number of partitions. \nSpark is a distributed system and can lead to many connections.
\n\n
\n\n
2.1 - Parallel Extraction, Simplest Scenario
\n\n
This scenario provides the simplest example you can have for a parallel extraction from JDBC sources, only using\nthe property numPartitions. The goal of the scenario is to cover the case in which people do not have\nmuch experience around how to optimize the extraction from JDBC sources or cannot identify a column that can\nbe used to split the extraction in several tasks. This scenario can also be used if the use case does not\nhave big performance requirements/concerns, meaning you do not feel the need to optimize the performance of\nthe extraction to its maximum potential.
\n\n
In the example below, "numPartitions": 10 is specified, meaning that Spark will open 10 parallel connections to the source and automatically decide how to parallelize the extraction upon that requirement. This is the only change compared to the example provided in scenario 1.
2.2 - Parallel Extraction, Provide upper_bound (Recommended)
\n\n
This scenario performs the extraction from the JDBC source in parallel, but is more concerned with trying to optimize and have more control (compared to the 2.1 example) on how the extraction is split and performed, using the following options:
\n\n
\n
numPartitions - number of Spark partitions to split the extraction.
\n
partitionColumn - column used to split the extraction. It must be a numeric, date, or timestamp.\nIt should be a column that is able to split the extraction evenly in several tasks. An auto-increment\ncolumn is usually a very good candidate.
\n
lowerBound - lower bound to decide the partition stride.
\n
upperBound - upper bound to decide the partition stride.
\n
\n\n
This is an adequate example to be followed if there is a column in the data source that is good to\nbe used as the partitionColumn. Comparing with the previous example,\nthe numPartitions and three additional options to fine tune the extraction (partitionColumn, lowerBound,\nupperBound) are provided.
\n\n
When these 4 properties are used, Spark will use them to build several queries to split the extraction.\nExample: for \"numPartitions\": 10, \"partitionColumn\": \"record\", \"lowerBound: 1\", \"upperBound: 100\",\nSpark will generate 10 queries like:
\n\n
\n
SELECT * FROM dummy_table WHERE RECORD < 10 OR RECORD IS NULL
\n
SELECT * FROM dummy_table WHERE RECORD >= 10 AND RECORD < 20
\n
SELECT * FROM dummy_table WHERE RECORD >= 20 AND RECORD < 30
2.3 - Parallel Extraction with Predicates (Recommended)
\n\n
This scenario performs the extraction from JDBC source in parallel, useful in contexts where there aren't\nnumeric, date or timestamp columns to parallelize the extraction:
\n\n
\n
partitionColumn - column used to split the extraction (can be of any type).
\n
\n\n
\n
This is an adequate example to be followed if there is a column in the data source that is good to be used as the partitionColumn, especially if that column does not comply with scenario 2.2.
\n
\n\n
When this property is used, all predicates need to be provided to Spark, otherwise data will be left behind.
\n\n
Below, a lakehouse-engine function to automatically generate the predicate list is presented.
\n\n
When using this function, one needs to be careful, especially with the predicates_query and predicates_add_null variables.
\n\n
predicates_query: in the sample below the whole table (select distinct(x) from table) is considered, but it is possible to filter the predicates list here, especially if you are applying filters in the transformations spec and you know the entire table won't be necessary. In that case, you can change it to something like: select distinct(x) from table where x > y.
\n\n
predicates_add_null: one can decide whether to include null in the predicates list or not. By default, this property is True.
Example: for "partitionColumn": "record"
This scenario is very similar to the full load, but it filters the data coming from the source, instead of doing a complete full load.\nAs for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"csv",\n"options":{\n"header":true,\n"delimiter":"|",\n"inferSchema":true\n},\n"location":"file:///app/tests/lakehouse/in/feature/full_load/with_filter/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"filtered_sales",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"expression_filter",\n"args":{\n"exp":"date like '2016%'"\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"filtered_sales",\n"write_type":"overwrite",\n"data_format":"parquet",\n"location":"file:///app/tests/lakehouse/out/feature/full_load/with_filter/data"\n}\n]\n}\n
\n
\n\n
Relevant notes:
\n\n
\n
As seen in the ACON, the filtering capabilities are provided by a transformer called expression_filter, where you can provide a custom Spark SQL filter.
This scenario is very similar to the Filtered Full Load, but we only replace a subset of the partitions, leaving the other ones untouched, so we don't replace the entire table. This capability is very useful for backfilling scenarios.\nAs for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"csv",\n"options":{\n"header":true,\n"delimiter":"|",\n"inferSchema":true\n},\n"location":"file:///app/tests/lakehouse/in/feature/full_load/with_filter_partition_overwrite/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"filtered_sales",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"expression_filter",\n"args":{\n"exp":"date like '2016%'"\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"filtered_sales",\n"write_type":"overwrite",\n"data_format":"delta",\n"partitions":[\n"date",\n"customer"\n],\n"location":"file:///app/tests/lakehouse/out/feature/full_load/with_filter_partition_overwrite/data",\n"options":{\n"replaceWhere":"date like '2016%'"\n}\n}\n]\n}\n
\n
\n\n
Relevant notes:
\n\n
\n
The key option for this scenario in the ACON is the replaceWhere, which we use to only overwrite a specific period of time, that realistically can match a subset of all the partitions of the table. Therefore, this capability is very useful for backfilling scenarios.
Related with the schema, we can make two kinds of operations:
\n\n
\n
Flatten Schema: transformer named "flatten_schema", used to flatten the schema of the dataframe.
\n\n
\n
Parameters to be defined:\n
\n
max_level: 2 => this sets the level down to which you want to flatten the schema.
\n
shorten_names: True => this flag is used when you want to shorten the names of the prefixes of the fields.
\n
alias: True => this flag is used when you want to define a prefix for the column to be flattened.
\n
num_chars: 7 => this sets the number of characters to consider when shortening the names of the fields.
\n
ignore_cols => this list value should be set to specify the columns you don't want to flatten.
\n
\n
\n
Explode Columns: transformer named \"explode_columns\" used to explode columns with types ArrayType and MapType.
\n\n
\n
Parameters to be defined:\n
\n
explode_arrays: True => this flag should be set to true to explode all array columns present in the dataframe.
\n
array_cols_to_explode: ["sample_col"] => this list value should be set to specify the array columns you want to explode.
\n
explode_maps: True => this flag should be set to true to explode all map columns present in the dataframe.
\n
map_cols_to_explode: ["map_col"] => this list value should be set to specify the map columns you want to explode.
\n
\n
Recommendation: use array_cols_to_explode and map_cols_to_explode to specify the columns you want to explode, instead of exploding all of them.
\n
\n
\n\n
The scenario below uses flatten_schema to transform one or more columns, splitting the nested content into more columns, as desired. We define the number of nested levels we want to flatten in the schema. In this case, we are just setting a max_level of 2. As for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"json",\n"schema_path":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/source_schema.json",\n"location":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_schema/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"sales_source",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"rename",\n"args":{\n"cols":{\n"date":"date2",\n"customer":"customer2"\n}\n}\n},\n{\n"function":"with_expressions",\n"args":{\n"cols_and_exprs":{\n"constant":"'just a constant'",\n"length_customer2":"length(customer2)"\n}\n}\n},\n{\n"function":"from_json",\n"args":{\n"input_col":"sample",\n"schema":{\n"type":"struct",\n"fields":[\n{\n"name":"field1",\n"type":"string",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field2",\n"type":"string",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field3",\n"type":"double",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field4",\n"type":{\n"type":"struct",\n"fields":[\n{\n"name":"field1",\n"type":"string",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field2",\n"type":"string",\n"nullable":true,\n"metadata":{}\n}\n]\n},\n"nullable":true,\n"metadata":{}\n}\n]\n}\n}\n},\n{\n"function":"to_json",\n"args":{\n"in_cols":[\n"item",\n"amount"\n],\n"out_col":"item_amount_json"\n}\n},\n{\n"function":"flatten_schema",\n"args":{\n"max_level":2\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"sales_source",\n"write_type":"append",\n"data_format":"delta",\n"location":"file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_schema/batch/data"\n}\n]\n}\n
\n
\n\n
The explode_arrays scenario transforms the array columns into one or more rows, depending on the number of elements, i.e., it replicates the row for each array value. In this case we are exploding all array columns, setting explode_arrays to true. As for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"json",\n"schema_path":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/source_schema.json",\n"location":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/explode_arrays/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"sales_source",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"rename",\n"args":{\n"cols":{\n"date":"date2",\n"customer":"customer2"\n}\n}\n},\n{\n"function":"with_expressions",\n"args":{\n"cols_and_exprs":{\n"constant":"'just a constant'",\n"length_customer2":"length(customer2)"\n}\n}\n},\n{\n"function":"to_json",\n"args":{\n"in_cols":[\n"item",\n"amount"\n],\n"out_col":"item_amount_json"\n}\n},\n{\n"function":"explode_columns",\n"args":{\n"explode_arrays":true\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"sales_source",\n"write_type":"append",\n"data_format":"delta",\n"location":"file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/explode_arrays/batch/data"\n}\n]\n}\n
\n
\n\n
The flatten_and_explode_arrays_and_maps scenario uses flatten_schema and explode_columns to produce the desired output. In this case, the desired output is to flatten the whole schema and explode maps and arrays, even when an array is inside a struct. Steps:
\n\n
1. In this case, we have an array column inside a struct column, so first we need to use the `flatten_schema` transformer to extract the columns inside that struct;
2. Then, we are able to explode the desired array and map columns, using the `explode_columns` transformer;
3. To split the map column into 2 columns, we use the `flatten_schema` transformer again.
\n\n
As for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_source",\n"read_type":"batch",\n"data_format":"json",\n"schema_path":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/source_schema.json",\n"location":"file:///app/tests/lakehouse/in/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"sales_source",\n"input_id":"sales_source",\n"transformers":[\n{\n"function":"rename",\n"args":{\n"cols":{\n"date":"date2",\n"customer":"customer2"\n}\n}\n},\n{\n"function":"with_expressions",\n"args":{\n"cols_and_exprs":{\n"constant":"'just a constant'",\n"length_customer2":"length(customer2)"\n}\n}\n},\n{\n"function":"from_json",\n"args":{\n"input_col":"agg_fields",\n"schema":{\n"type":"struct",\n"fields":[\n{\n"name":"field1",\n"nullable":true,\n"metadata":{},\n"type":{\n"containsNull":true,\n"elementType":"string",\n"type":"array"\n}\n},\n{\n"name":"field2",\n"type":{\n"type":"struct",\n"fields":[\n{\n"name":"field1",\n"type":"string",\n"nullable":true,\n"metadata":{}\n},\n{\n"name":"field2",\n"type":"string",\n"nullable":true,\n"metadata":{}\n}\n]\n},\n"nullable":true,\n"metadata":{}\n}\n]\n}\n}\n},\n{\n"function":"to_json",\n"args":{\n"in_cols":[\n"item",\n"amount"\n],\n"out_col":"item_amount_json"\n}\n},\n{\n"function":"flatten_schema",\n"args":{\n"max_level":2\n}\n},\n{\n"function":"explode_columns",\n"args":{\n"explode_arrays":true,\n"map_cols_to_explode":[\n"sample"\n]\n}\n},\n{\n"function":"flatten_schema",\n"args":{\n"max_level":2\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_bronze",\n"input_id":"sales_source",\n"write_type":"append",\n"data_format":"delta",\n"location":"file:///app/tests/lakehouse/out/feature/transformations/column_reshapers/flatten_and_explode_arrays_and_maps/batch/data"\n}\n]\n}\n
This scenario reads CSV data from a path and writes in full to another path with delta lake files.
\n\n
Relevant notes
\n\n
\n
This ACON infers the schema automatically through the option inferSchema (we use it for local tests only). This is usually not a best practice when using CSV files, and you should provide a schema through the InputSpec variables schema_path, read_schema_from_table or schema.
\n
The transform_specs in this case are purely optional, and we basically use the repartition transformer to create one partition per combination of date and customer. This does not mean you have to use this in your algorithm.
\n
A full load is also adequate for an init load (initial load).
\n
\n\n
As for other cases, the acon configuration should be executed with load_data using:
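A minimal sketch of such an acon (paths follow the local test layout and are placeholders; the repartition transformer arguments are illustrative):

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "sales_source",
            "read_type": "batch",
            "data_format": "csv",
            "options": {"header": True, "delimiter": "|", "inferSchema": True},
            "location": "file:///app/tests/lakehouse/in/feature/full_load/full_overwrite/data",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "repartitioned_sales",
            "input_id": "sales_source",
            "transformers": [
                # optional: one partition per combination of date and customer
                {"function": "repartition", "args": {"cols": ["date", "customer"]}}
            ],
        }
    ],
    "output_specs": [
        {
            "spec_id": "sales_bronze",
            "input_id": "repartitioned_sales",
            "write_type": "overwrite",
            "data_format": "delta",
            "partitions": ["date", "customer"],
            "location": "file:///app/tests/lakehouse/out/feature/full_load/full_overwrite/data",
        }
    ],
}

load_data(acon=acon)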
Don't use this feature if the Lakehouse Engine already has a supported data format for your use case, as in that case it is preferred to use the dedicated data formats which are more extensively tested and predictable. Check the supported data formats here.
\n\n
\n\n
Reading from a Spark DataFrame is very simple using our framework. You just need to define the input_specs as follows:
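A minimal sketch of such an input spec (assuming a Spark session named spark, e.g., on Databricks; the parameter name df_name is an assumption and should be checked against the InputSpec documentation):

# Any pre-computed Spark DataFrame can be fed into the engine.
my_df = spark.read.table("my_database.my_table")

acon = {
    "input_specs": [
        {
            "spec_id": "my_dataframe_source",
            "read_type": "batch",
            "data_format": "dataframe",
            "df_name": my_df,  # assumption: the field holding the DataFrame
        }
    ],
    # ... transform_specs / dq_specs / output_specs as usual ...
}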
This scenario illustrates an append load done via streaming instead of batch, providing an efficient way of picking up new files from an S3 folder, instead of relying on the incremental filtering from the source needed in a batch-based process (see append loads in batch from a JDBC source to understand the differences between streaming and batch append loads). However, not all sources (e.g., JDBC) allow streaming. As for other cases, the acon configuration should be executed with load_data using:
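A minimal sketch of such an acon (schema, paths and options are placeholders that match the relevant notes below):

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "sales_source",
            "read_type": "streaming",
            "data_format": "csv",
            # schema provided directly in the input spec (no separate JSON file)
            "schema": {
                "type": "struct",
                "fields": [
                    {"name": "salesorder", "type": "string", "nullable": True, "metadata": {}},
                    {"name": "amount", "type": "double", "nullable": True, "metadata": {}},
                ],
            },
            "options": {"mode": "DROPMALFORMED", "header": True, "delimiter": "|"},
            "location": "file:///app/tests/lakehouse/in/feature/append_load/streaming/data",
        }
    ],
    "output_specs": [
        {
            "spec_id": "sales_bronze",
            "input_id": "sales_source",
            "write_type": "append",
            "data_format": "delta",
            "options": {
                "checkpointLocation": "file:///app/tests/lakehouse/out/feature/append_load/streaming/checkpoint"
            },
            "location": "file:///app/tests/lakehouse/out/feature/append_load/streaming/data",
        }
    ],
}

load_data(acon=acon)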
In this scenario, we use DROPMALFORMED read mode, which drops rows that do not comply with the provided schema;
\n
In this scenario, the schema is provided through the input_spec \"schema\" variable. This removes the need of a separate JSON Spark schema file, which may be more convenient in certain cases.
\n
As can be seen, we use the output_spec Spark option checkpointLocation to specify where to save the checkpoints indicating what we have already consumed from the input data. This allows fault-tolerance if the streaming job fails, but more importantly, it allows us to run a streaming job using AvailableNow and the next job automatically picks up the stream state since the last checkpoint, allowing us to do efficient append loads without having to manually specify incremental filters as we do for batch append loads.
Streaming Append Load with Optimize Dataset Terminator
\n\n
This scenario includes a terminator which optimizes a dataset (table), being able to vacuum the table, optimise it (with or without z-order), compute table statistics and more. You can find more details on the Terminator here.
\n\n
As for other cases, the acon configuration should be executed with load_data using:
Streaming Delta Load with Group and Rank Condensation
\n\n
This scenario is useful for when we want to do delta loads based on changelogs that need to be first condensed based on a group by and then a rank only, instead of the record mode logic in the record mode based change data capture.\nAs for other cases, the acon configuration should be executed with load_data using:
{\n"input_specs":[\n{\n"spec_id":"sales_bronze",\n"read_type":"streaming",\n"data_format":"csv",\n"schema_path":"file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/source_schema.json",\n"with_filepath":true,\n"options":{\n"mode":"FAILFAST",\n"header":true,\n"delimiter":"|"\n},\n"location":"file:///app/tests/lakehouse/in/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data"\n}\n],\n"transform_specs":[\n{\n"spec_id":"sales_bronze_with_extraction_date",\n"input_id":"sales_bronze",\n"transformers":[\n{\n"function":"with_regex_value",\n"args":{\n"input_col":"lhe_extraction_filepath",\n"output_col":"extraction_date",\n"drop_input_col":true,\n"regex":".*WE_SO_SCL_(\\\\d+).csv"\n}\n},\n{\n"function":"with_auto_increment_id"\n},\n{\n"function":"group_and_rank",\n"args":{\n"group_key":[\n"salesorder",\n"item"\n],\n"ranking_key":[\n"extraction_date",\n"changed_on",\n"lhe_row_id"\n]\n}\n},\n{\n"function":"repartition",\n"args":{\n"num_partitions":1\n}\n}\n]\n}\n],\n"output_specs":[\n{\n"spec_id":"sales_silver",\n"input_id":"sales_bronze_with_extraction_date",\n"write_type":"merge",\n"data_format":"delta",\n"location":"file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/data",\n"options":{\n"checkpointLocation":"file:///app/tests/lakehouse/out/feature/delta_load/group_and_rank/with_duplicates_in_same_file/streaming/checkpoint"\n},\n"with_batch_id":true,\n"merge_opts":{\n"merge_predicate":"current.salesorder = new.salesorder and current.item = new.item",\n"update_predicate":"new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on",\n"delete_predicate":"new.extraction_date >= current.extraction_date and new.changed_on >= current.changed_on and new.event = 'deleted'"\n}\n}\n]\n}\n
\n
\n\n
Relevant notes:
\n\n
\n
This type of delta load with this type of condensation is useful when the source changelog can be condensed based on dates, instead of technical fields like datapakid, record, record_mode, etc., as we see in SAP BW DSOs. An example of such a system is the Omnihub Tibco orders and deliveries files.
Streaming Delta Load with Late Arriving and Out of Order Events (with and without watermarking)
\n\n
How to Deal with Late Arriving Data without using Watermark
\n\n
This scenario covers a delta load in streaming mode that is able to deal with late arriving and out of order events.\nAs for other cases, the acon configuration should be executed with load_data using:
{\n "input_specs":[\n {\n "spec_id":"sales_source",\n "read_type":"streaming",\n "data_format":"csv",\n "options":{\n "header":true,\n "delimiter":"|"\n },\n "location":"file:///app/tests/lakehouse/in/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data"\n }\n ],\n "transform_specs":[\n {\n "spec_id":"transformed_sales_source",\n "input_id":"sales_source",\n "transformers":[\n {\n "function":"condense_record_mode_cdc",\n "args":{\n "business_key":[\n "salesorder",\n "item"\n ],\n "ranking_key_desc":[\n "extraction_timestamp",\n "actrequest_timestamp",\n "datapakid",\n "partno",\n "record"\n ],\n "record_mode_col":"recordmode",\n "valid_record_modes":[\n "",\n "N",\n "R",\n "D",\n "X"\n ]\n }\n }\n ]\n }\n ],\n "output_specs":[\n {\n "spec_id":"sales_bronze",\n "input_id":"transformed_sales_source",\n "write_type":"merge",\n "data_format":"delta",\n "location":"file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/data",\n "options":{\n "checkpointLocation":"file:///app/tests/lakehouse/out/feature/delta_load/record_mode_cdc/late_arriving_changes/streaming/checkpoint"\n },\n "merge_opts":{\n "merge_predicate":"current.salesorder = new.salesorder and current.item = new.item and current.date <=> new.date",\n "update_predicate":"new.extraction_timestamp > current.extraction_timestamp or new.actrequest_timestamp > current.actrequest_timestamp or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid > current.datapakid) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno > current.partno) or ( new.actrequest_timestamp = current.actrequest_timestamp and new.datapakid = current.datapakid and new.partno = current.partno and new.record >= current.record)",\n "delete_predicate":"new.recordmode in ('R','D','X')",\n "insert_predicate":"new.recordmode is null or new.recordmode not in ('R','D','X')"\n }\n }\n ],\n "exec_env":{\n "spark.sql.streaming.schemaInference":true\n }\n}\n
\n
\n\n
Relevant notes:
\n\n
\n
The first question we can pose is: do we need such a complicated update predicate to handle late arriving and out of order events? The simple answer is no, because we expect that the latest event (e.g., the latest status of a record in the source) will eventually arrive, and therefore the target delta lake table will eventually be consistent. However, when will that happen? Do we want to have our target table inconsistent until the next update comes along? This of course is only relevant when your source cannot ensure the order of the changes and cannot avoid late arriving changes (e.g., some changes that should have come in this changelog extraction will only arrive in the next changelog extraction). From previous experience, this is not the case with SAP BW, for example (as SAP BW is ACID compliant, it will extract data from an SAP source and only have the updated changelog available when the extraction goes through, so theoretically we should not be able to extract data from the SAP BW changelog while SAP BW is still extracting data).
\n
However, when the source cannot fully ensure ordering (e.g., Kafka) and we want to make sure we don't load temporarily inconsistent data into the target table, we can pay extra special attention, as we do here, to our update and insert predicates, that will enable us to only insert or update data if the new event meets the respective predicates:\n
\n
In this scenario, we will only update if the update_predicate is true, and that long predicate we have here ensures that the change that we are receiving is likely the latest one;
\n
In this scenario, we will only insert the record if it is not marked for deletion. This matters when the new event is a record that is marked for deletion but was not yet in the target table (late arriving changes where the delete came before the insert): without the insert_predicate, the algorithm would still try to insert the row, even if the record_mode indicates that the row is for deletion. By using the insert_predicate above we avoid that. However, even in such a scenario, to prevent the algorithm from inserting the data that comes later (which is old, as the delete came before the insert and was actually the latest status), we would need an even more complex predicate based on your data's nature. Therefore, please read the disclaimer below.
\n
\n
\n\n
\n\n
Disclaimer! The scenario illustrated in this page is purely fictional, designed for the Lakehouse Engine local tests specifically. Your data source changelogs may be different and the scenario and predicates discussed here may not make sense to you. Consequently, the data product team should reason about the adequate merge predicate and insert, update and delete predicates, that better reflect how they want to handle the delta loads for their data.
\n\n
\n\n
\n
We use spark.sql.streaming.schemaInference in our local tests only. We don't encourage you to use it in your data product.
How to Deal with Late Arriving Data using Watermark
\n\n
When building real-time pipelines, one of the realities that teams have to work with is that distributed data ingestion is inherently unordered. Additionally, in the context of stateful streaming operations, teams need to be able to properly track event time progress in the stream of data they are ingesting for the proper calculation of time-window aggregations and other stateful operations. While working with real-time streaming data there will be delays between event time and processing time due to how data is ingested and whether the overall application experiences issues like downtime. Due to these potential variable delays, the engine that you use to process this data needs to have some mechanism to decide when to close the aggregate windows and produce the aggregate result.
\n\n
Imagine a scenario where we will need to perform stateful aggregations on the streaming data to understand and identify problems in the machines. This is where we need to leverage Structured Streaming and Watermarking to produce the necessary stateful aggregations.
To explain this visually let\u2019s take a scenario where we are receiving data at various times from around 10:50 AM \u2192 11:20 AM. We are creating 10-minute tumbling windows that calculate the average of the temperature and pressure readings that came in during the windowed period.
\n\n
In this first picture, we have the tumbling windows trigger at 11:00 AM, 11:10 AM and 11:20 AM leading to the result tables shown at the respective times. When the second batch of data comes around 11:10 AM with data that has an event time of 10:53 AM this gets incorporated into the temperature and pressure averages calculated for the 11:00 AM \u2192 11:10 AM window that closes at 11:10 AM, which does not give the correct result.
\n\n
Approach 2 - Watermark
\n\n
We can define a watermark that will allow Spark to understand when to close the aggregate window and produce the correct aggregate result. In Structured Streaming applications, we can ensure that all relevant data for the aggregations we want to calculate is collected by using a feature called watermarking. In the most basic sense, by defining a watermark Spark Structured Streaming then knows when it has ingested all data up to some time, T, (based on a set lateness expectation) so that it can close and produce windowed aggregates up to timestamp T.
Unlike the first scenario where Spark will emit the windowed aggregation for the previous ten minutes every ten minutes (i.e. emit the 11:00 AM \u219211:10 AM window at 11:10 AM), Spark now waits to close and output the windowed aggregation once the max event time seen minus the specified watermark is greater than the upper bound of the window.
\n\n
In other words, Spark needed to wait until it saw data points where the latest event time seen minus 10 minutes was greater than 11:00 AM to emit the 10:50 AM \u2192 11:00 AM aggregate window. At 11:00 AM, it does not see this, so it only initialises the aggregate calculation in Spark\u2019s internal state store. At 11:10 AM, this condition is still not met, but we have a new data point for 10:53 AM so the internal state gets updated, just not emitted. Then finally by 11:20 AM Spark has seen a data point with an event time of 11:15 AM and since 11:15 AM minus 10 minutes is 11:05 AM which is later than 11:00 AM the 10:50 AM \u2192 11:00 AM window can be emitted to the result table.
\n\n
This produces the correct result by properly incorporating the data based on the expected lateness defined by the watermark. Once the results are emitted the corresponding state is removed from the state store.
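As a plain PySpark sketch of the idea described above (column names, paths and the 10-minute lateness tolerance are illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, window

spark = SparkSession.builder.getOrCreate()

# Hypothetical sensor stream with event_time, temperature and pressure columns.
readings = spark.readStream.format("delta").load("s3://my_path/sensor_readings/")

# Tolerate up to 10 minutes of lateness before closing a 10-minute window.
aggregated = (
    readings.withWatermark("event_time", "10 minutes")
    .groupBy(window("event_time", "10 minutes"))
    .agg(
        avg("temperature").alias("avg_temperature"),
        avg("pressure").alias("avg_pressure"),
    )
)

query = (
    aggregated.writeStream.outputMode("append")
    .format("delta")
    .option("checkpointLocation", "s3://my_path/sensor_aggregates/checkpoint/")
    .start("s3://my_path/sensor_aggregates/data/")
)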
\n\n
Watermarking and Different Output Modes
\n\n
It is important to understand how state, late-arriving records, and the different output modes could lead to different behaviours of your application running on Spark. The main takeaway here is that in both append and update modes, once the watermark indicates that all data is received for an aggregate time window, the engine can trim the window state. In append mode the aggregate is produced only at the closing of the time window plus the watermark delay while in update mode it is produced on every update to the window.
\n\n
Lastly, by increasing your watermark delay window you will cause the pipeline to wait longer for data and potentially drop less data \u2013 higher precision, but also higher latency to produce the aggregates. On the flip side, smaller watermark delay leads to lower precision but also lower latency to produce the aggregates.
\n\n
Watermarks can only be used when you are running your streaming application in append or update output modes. There is a third output mode, complete mode, in which the entire result table is written to storage. This mode cannot be used because it requires all aggregate data to be preserved, and hence cannot use watermarking to drop intermediate state.
\n\n
Joins With Watermark
\n\n
There are three types of stream-stream joins that can be implemented in Structured Streaming: inner, outer, and semi joins. The main problem with doing joins in streaming applications is that you may have an incomplete picture of one side of the join. Giving Spark an understanding of when there are no future matches to expect is similar to the earlier problem with aggregations where Spark needed to understand when there were no new rows to incorporate into the calculation for the aggregation before emitting it.
\n\n
To allow Spark to handle this, we can leverage a combination of watermarks and event-time constraints within the join condition of the stream-stream join. This combination allows Spark to filter out late records and trim the state for the join operation through a time range condition on the join.
\n\n
Spark has a policy for handling multiple watermark definitions. Spark maintains one global watermark that is based on the slowest stream to ensure the highest amount of safety when it comes to not missing data.
\n\n
We can change this behaviour by changing spark.sql.streaming.multipleWatermarkPolicy to max; however, this means that data from the slower stream will be dropped.
\n\n
State Store Performance Considerations
\n\n
As of Spark 3.2, Spark offers RocksDB state store provider.
\n\n
If you have stateful operations in your streaming query (for example, streaming aggregation, streaming dropDuplicates, stream-stream joins, mapGroupsWithState, or flatMapGroupsWithState) and you want to maintain millions of keys in the state, then you may face issues related to large JVM garbage collection (GC) pauses causing high variations in the micro-batch processing times. This occurs because, by the implementation of HDFSBackedStateStore, the state data is maintained in the JVM memory of the executors and large number of state objects puts memory pressure on the JVM causing high GC pauses.
\n\n
In such cases, you can choose to use a more optimized state management solution based on RocksDB. Rather than keeping the state in the JVM memory, this solution uses RocksDB to efficiently manage the state in the native memory and the local disk. Furthermore, any changes to this state are automatically saved by Structured Streaming to the checkpoint location you have provided, thus providing full fault-tolerance guarantees (the same as default state management).
\n\n
To enable the new built-in state store implementation, set spark.sql.streaming.stateStore.providerClass to org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider.
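A minimal sketch of setting this configuration when building the Spark session (the configuration key and provider class are the ones named above; the rest of the session setup is illustrative):

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("stateful-streaming-with-rocksdb")
    .config(
        "spark.sql.streaming.stateStore.providerClass",
        "org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider",
    )
    .getOrCreate()
)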
The DataFrame writer can give us some advantages by returning a dictionary containing the spec_id and the computed dataframe. In these examples we will cover the following scenarios of using the dataframe output format:
Debugging purposes: as we can access any dataframe used in any part of our ACON\nwe can observe what is happening with the computation and identify what might be wrong\nor can be improved.
\n
Flexibility: in case we have some very specific need not yet covered by the lakehouse engine capabilities, for example: returning the DataFrame for further processing, like using a machine learning model/prediction.
\n
Simplify ACONs: instead of developing a single complex ACON, using the DataFrame writer we can compose our ACON from the output of another ACON. This allows us to identify and split the notebook logic across ACONs.
\n
\n\n
If you want/need, you can add as many dataframes as you want to the output spec, referencing the spec_id you want to add.
\n\n
\n\n
This is not intended to replace the other capabilities offered by the lakehouse-engine. In case another feature can cover your use case, you should use it instead of the DataFrame writer, as those features are much more extensively tested on different types of operations.
\n\n
Additionally, please always introspect if the problem that you are trying to resolve and for which no lakehouse-engine feature is available, could be a common problem and thus deserve a common solution and feature.
\n\n
Moreover, the DataFrame writer is not supported for the streaming trigger types processing time and continuous.
\n\n
\n\n
1. Write to dataframe: Consuming the output spec as DataFrame
\n\n
Silver Dummy Sales Write to DataFrame
\n\n
In this example we will cover the Dummy Sales write to a result containing the output DataFrame.
\n\n
\n
An ACON is used to read from bronze, apply silver transformations and write to a dictionary containing the output spec_id as key and the dataframe as value, through the following steps:
\n
1 - Definition of how to read data (input data location, read type and data format);
\n
2 - Transformation of data (rename relevant columns);
\n
3 - Write the data to a dict containing the dataframe;
\n
\n
\n\n
\n\n
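A minimal sketch of what such an ACON could look like is shown below. The locations, spec ids and the transformer are illustrative; the "dataframe" output format is the one this scenario is about, and the load returns a dict with the output spec_id as key and the DataFrame as value.

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "dummy_sales_bronze",
            "read_type": "streaming",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/bronze/dummy_sales",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "dummy_sales_transform",
            "input_id": "dummy_sales_bronze",
            "transformers": [
                {
                    "function": "rename",
                    "args": {"cols": {"salesorder": "sales_order"}},
                }
            ],
        }
    ],
    "output_specs": [
        {
            "spec_id": "dummy_sales_silver",
            "input_id": "dummy_sales_transform",
            "data_format": "dataframe",
        }
    ],
}

output = load_data(acon=acon)
silver_df = output.get("dummy_sales_silver")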
If you try to retrieve the same data more than once using a checkpoint, an empty dataframe with an empty schema will be returned, as there is no new data to read.
2. Write all dataframes: Consuming all DataFrames generated per specs
\n\n
Silver Dummy Sales Write to DataFrame
\n\n
In this example we will cover the Dummy Sales write to a result containing the specs and related DataFrame.
\n\n
\n
An ACON is used to read from bronze, apply silver transformations and write to a dictionary\ncontaining the spec id as key and the DataFrames as value through the following steps:\n
\n
Definition of how to read data (input data location, read type and data format);
\n
Transformation of data (rename relevant columns);
\n
Write the data to a dictionary containing all the spec ids and DataFrames computed per step;
Run the Load and Return the Dictionary with the related DataFrames by Spec
\n\n
This exploratory test will return a dictionary with all specs and the related dataframe.\nYou can access the DataFrame you need by output.get(<spec_id>) for future developments and tests.
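Illustrative only: after running a load whose output specs use the "dataframe" format, each computed DataFrame can be fetched from the returned dictionary by its spec_id (the spec ids below are placeholders).

output = load_data(acon=acon)

bronze_df = output.get("dummy_sales_bronze")
transformed_df = output.get("dummy_sales_transform")

transformed_df.show()  # quick inspection of the computed DataFrame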
3. Read from and Write to dataframe: Making use of the DataFrame output spec to compose silver data
\n\n
Silver Load Dummy Deliveries
\n\n
In this example we will cover the Dummy Deliveries table read and incremental load to silver composing the silver data to write using the DataFrame output spec:
\n\n
\n
The first ACON is used to get the latest data from bronze. In this step we are using more than one output because we will need both the bronze data and the latest data in the next step.
\n
The second ACON is used to consume the bronze data and the latest data to perform the silver transformation. In this ACON we are using as input the two dataframes computed by the first ACON.
\n
The third ACON is used to write the computed silver data from the previous ACON to the target.
\n
\n\n
\n\n
This example is not a recommendation on how to deal with incremental loads; the ACON was split into 3 for demo purposes.
\n\n
\n\n
Consume bronze data, generate the latest data and return a dictionary with bronze and transformed dataframes:
Consume the dataframes generated by the first ACON (bronze and latest bronze data) to generate the silver data. In this ACON we are using just one output because we only need the resulting dataframe for the next step.
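A hedged sketch of chaining ACONs in this way: the first ACON returns its DataFrames (dataframe output format), and the second ACON consumes them again as inputs with the dataframe input format. The df_name property used to hand an in-memory DataFrame to an input spec is an assumption here; check the InputSpec definition in the engine for the exact field name.

from lakehouse_engine.engine import load_data

first_output = load_data(acon=first_acon)  # first_acon defined as in the steps above

second_acon = {
    "input_specs": [
        {
            "spec_id": "bronze_data",
            "read_type": "batch",
            "data_format": "dataframe",
            "df_name": first_output.get("dummy_deliveries_bronze"),
        },
        {
            "spec_id": "latest_bronze_data",
            "read_type": "batch",
            "data_format": "dataframe",
            "df_name": first_output.get("dummy_deliveries_latest"),
        },
    ],
    # ... transform_specs joining both inputs and an output spec with
    # "data_format": "dataframe" would follow here.
}

second_output = load_data(acon=second_acon)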
The console writer is an interesting feature to debug/validate what has been done with the lakehouse engine. Before moving forward and storing data somewhere, it is possible to show/print the final dataframe to the console, which means it is possible to transform the data as many times as you want and display the final result to validate whether it is as expected.
\n\n
Silver Dummy Sales Write to Console Example
\n\n
In this template we will cover the Dummy Sales write to console. An ACON is used to read from bronze, apply silver transformations and write on console through the following steps:
\n\n\n
Definition of how to read data (input data location, read type and data format);
\n
Transformation of data (rename relevant columns);
\n
Definition of how to print to console (limit, truncate, vertical options);
\n\n\n
For this, the ACON specs are:
\n\n
\n
input_specs (MANDATORY): specify how to read data;
\n
transform specs (OPTIONAL): specify how to transform data;
\n
output_specs (MANDATORY): specify how to write data to the target.
\n
\n\n
\n\n
The console writer is a wrapper around the Spark DataFrame show() function. If you want to know more about the function itself or the available options, please check the Spark documentation.
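A minimal sketch of a console writer ACON (the location and spec ids are illustrative; the limit/truncate/vertical options are the ones mentioned above):

from lakehouse_engine.engine import load_data

acon = {
    "input_specs": [
        {
            "spec_id": "dummy_sales_bronze",
            "read_type": "batch",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/bronze/dummy_sales",
        }
    ],
    "output_specs": [
        {
            "spec_id": "dummy_sales_console",
            "input_id": "dummy_sales_bronze",
            "data_format": "console",
            "options": {"limit": 10, "truncate": False, "vertical": False},
        }
    ],
}

load_data(acon=acon)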
REST API writer is an interesting feature to send data from Spark to a REST API within the data pipeline context. It uses the Python requests library to execute the REST calls.
\n\n
It is possible to configure a few aspects of the writer, like if the payload should be sent via JSON body or via file, or configure additional JSON body parameters to add to the payload generated via Spark.
\n\n
In the current implementation of the writer, each row will generate a request to the API, so it is important that you prepare your dataframe accordingly (check example below).
\n\n
Silver Dummy Sales Write to REST API Example
\n\n
In this template we will cover the Dummy Sales write to a REST API. An ACON is used to read from bronze, apply silver transformations to prepare the REST api payload and write to the API through the following steps:
\n\n\n
Definition of how to read data (input data location, read type and data format);
\n
Transformation of the data so that we form a payload column for each row. Important Note: in the current implementation of the writer, each row will generate a request to the API, so create_payload is a custom transformer function that creates a JSON string with the payload to be sent to the API. The column name should be exactly "payload", so that the lakehouse engine further processes that column accordingly, in order to correctly write the data to the REST API.
\n
Definition of how to write to a REST api (url, authentication, payload format configuration, ...);
\n\n\n
For this, the ACON specs are:
\n\n
\n
input_specs (MANDATORY): specify how to read data;
\n
transform specs (MANDATORY): specify how to transform data to prepare the payload;
\n
output_specs (MANDATORY): specify how to write data to the target.
\n
\n\n
\n
from pyspark.sql import DataFrame
from pyspark.sql.functions import lit

from lakehouse_engine.engine import load_data


def create_payload(df: DataFrame) -> DataFrame:
    payload_df = df.withColumn(
        "payload",
        lit('{"just a dummy key": "just a dummy value"}')
    )

    return payload_df


acon = {
    "input_specs": [
        {
            "spec_id": "dummy_sales_bronze",
            "read_type": "streaming",
            "data_format": "delta",
            "location": "s3://my_data_product_bucket/bronze/dummy_sales",
        }
    ],
    "transform_specs": [
        {
            "spec_id": "dummy_sales_transform",
            "input_id": "dummy_sales_bronze",
            "transformers": [
                {
                    "function": "custom_transformation",
                    "args": {
                        "custom_transformer": create_payload,
                    },
                }
            ],
        },
    ],
    "output_specs": [
        {
            "spec_id": "data_to_send_to_api",
            "input_id": "dummy_sales_transform",
            "data_format": "rest_api",
            "options": {
                "rest_api_url": "https://foo.bar.com",
                "rest_api_method": "post",
                "rest_api_basic_auth": {
                    "username": "...",
                    "password": "...",
                },
                "rest_api_is_file_payload": False,  # True if payload is to be sent via JSON file instead of JSON body (application/json)
                "rest_api_file_payload_name": "custom_file",  # name of the file to be sent when the payload uses file uploads rather than JSON body
                "rest_api_extra_json_payload": {"x": "y"},
            },
        }
    ],
}

load_data(acon=acon)
The Data Quality framework is based on Great Expectations (GX) and other custom-made \ndevelopments, providing a very light abstraction on top of the GX open source framework and the Spark framework.
\n\n
How to use Data Quality?
\n\n
Data Loader
\n\n
You can define data quality rules inside the DataLoader algorithm that you use to load data.
\n\n
\n\n
The DataLoader algorithm allows you to store the results of the data quality checks inside your custom location\nusing the result_sink options (e.g., a delta table on your data product). Using result sink unlocks the \ncapability to store DQ results having history over all the DQ executions, which can be used for debugging, \nto create DQ dashboards on top of the data, and much more.
\n\n
\n\n
Examples: in these examples, dummy sales local data is used to cover a few example usages of the DQ Framework (based on Great Expectations). The main difference between the sample ACONs is in the usage of dq_specs.
The DQValidator algorithm focuses on validating data (e.g., spark DataFrames, Files or Tables).\nIn contrast to the dq_specs inside the DataLoader algorithm, the DQValidator focuses on validating data at rest \n(post-mortem) instead of validating data in-transit (before it is loaded to the destination).
\n\n
\n\n
The DQValidator algorithm allows you to store the results of the data quality checks inside your custom location\nusing the result_sink options (e.g., a delta table on your data product). Using result sink unlocks the\ncapability to store DQ results having history over all the DQ executions, which can be used for debugging,\nto create DQ dashboards on top of the data, and much more.
Similarly to the Data Quality Validator algorithm, the Reconciliator algorithm focuses on \nvalidating data at rest (post-mortem). In contrast to the DQValidator algorithm, the Reconciliator always compares a \ntruth dataset (e.g., spark DataFrames, Files or Tables) with the current dataset (e.g., spark DataFrames, Files or \nTables), instead of executing DQ rules defined by the teams. \nHere you can find more information regarding reconciliator and examples.
\n\n
\n\n
Reconciliator does not use Great Expectations, therefore Data Docs, Result Sink and other native features are not available.
\n\n
\n\n
Custom Expectations
\n\n
If your data has a data quality check that cannot be done with the expectations provided by Great Expectations, you can create a custom expectation to perform this verification.
\n\n
\n\n
Before creating a custom expectation, check if there is an expectation already created to address your needs, both in Great Expectations and the Lakehouse Engine. Any Custom Expectation that is too specific (using hardcoded table/column names) will be rejected. Expectations should be generic by definition.
How to check the results of the Data Quality Process?
\n\n
1. Table/location analysis
\n\n
The possibility to configure a Result Sink allows you to store the history of executions of the DQ process. \nYou can query the table or the location to search through data and analyse history.
\n\n
2. Power BI Dashboard
\n\n
With the exploded result sink information, interactive analysis can be built on top of the history of the DQ process. A dashboard can be created with the results that we have in dq_specs. To have this information available, you need to use the arguments result_sink_db_table and/or result_sink_location.
\n\n
Through having a dashboard, the runs and expectations can be analysed, filtered by year, month, source and \nrun name, and you will have information about the number of runs, some statistics, status of expectations and more. \nAnalysis such as biggest failures per expectation type, biggest failures by columns, biggest failures per source, \nand others can be made, using the information in the result_sink_db_table/result_sink_location.
\n\n
\n\n
The recommendation is to use the same result sink table/location for all your dq_specs and \nin the dashboard you will get a preview of the status of all of them.
\n\n
\n\n
\n\n
3. Data Docs Website
\n\n
An auto-generated site presenting all the relevant information can also be used. If you choose to define the parameter data_docs_bucket, you will be able to store the GX documentation in the defined bucket, and therefore make your data docs available in the DQ Web App (GX UI), visible to everyone. The data_docs_bucket property supersedes the bucket property only for data docs storage.

"""Expectation to check if column 'a' is lower or equal than column 'b'."""

from typing import Any, Dict, Optional

from great_expectations.core import ExpectationConfiguration
from great_expectations.execution_engine import ExecutionEngine, SparkDFExecutionEngine
from great_expectations.expectations.expectation import ColumnPairMapExpectation
from great_expectations.expectations.metrics.map_metric_provider import (
    ColumnPairMapMetricProvider,
    column_pair_condition_partial,
)

from lakehouse_engine.utils.expectations_utils import validate_result


class ColumnPairCustom(ColumnPairMapMetricProvider):
    """Asserts that column 'A' is lower or equal than column 'B'.

    Additionally, the 'margin' parameter can be used to add a margin to the
    check between column 'A' and 'B': 'A' <= 'B' + 'margin'.
    """

    condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"
    condition_domain_keys = (
        "batch_id",
        "table",
        "column_A",
        "column_B",
        "ignore_row_if",
    )
    condition_value_keys = ("margin",)

    @column_pair_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: ColumnPairMapMetricProvider,
        column_A: Any,
        column_B: Any,
        margin: Any,
        **kwargs: dict,
    ) -> Any:
        """Implementation of the expectation's logic.

        Args:
            column_A: Value of the row of column_A.
            column_B: Value of the row of column_B.
            margin: margin value to be added to column_b.
            kwargs: dict with additional parameters.

        Returns:
            If the condition is met.
        """
        if margin is None:
            approx = 0
        elif not isinstance(margin, (int, float, complex)):
            raise TypeError(
                f"margin must be one of int, float, complex."
                f" Found: {margin} as {type(margin)}"
            )
        else:
            approx = margin  # type: ignore

        return column_A <= column_B + approx  # type: ignore


class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):
    """Expect values in column A to be lower or equal than column B.

    Args:
        column_A: The first column name.
        column_B: The second column name.
        margin: additional approximation to column B value.

    Keyword Args:
        - allow_cross_type_comparisons: If True, allow
            comparisons between types (e.g. integer and string).
            Otherwise, attempting such comparisons will raise an exception.
        - ignore_row_if: "both_values_are_missing",
            "either_value_is_missing", "neither" (default).
        - result_format: Which output mode to use:
            `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
        - include_config: If True (default), then include the expectation config
            as part of the result object.
        - catch_exceptions: If True, then catch exceptions and
            include them as part of the result object. Default: False.
        - meta: A JSON-serializable dictionary (nesting allowed)
            that will be included in the output without modification.

    Returns:
        An ExpectationSuiteValidationResult.
    """

    examples = [
        {
            "dataset_name": "Test Dataset",
            "data": [
                {
                    "data": {
                        "a": [11, 22, 50],
                        "b": [10, 21, 100],
                        "c": [9, 21, 30],
                    },
                    "schemas": {
                        "spark": {
                            "a": "IntegerType",
                            "b": "IntegerType",
                            "c": "IntegerType",
                        }
                    },
                }
            ],
            "tests": [
                {
                    "title": "negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_A": "a",
                        "column_B": "c",
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["c"],
                        },
                    },
                    "out": {
                        "success": False,
                        "unexpected_index_list": [
                            {"c": 9, "a": 11},
                            {"c": 21, "a": 22},
                            {"c": 30, "a": 50},
                        ],
                    },
                },
                {
                    "title": "positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_A": "a",
                        "column_B": "b",
                        "margin": 1,
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["a"],
                        },
                    },
                    "out": {
                        "success": True,
                        "unexpected_index_list": [],
                    },
                },
            ],
        },
    ]

    map_metric = "column_pair_values.a_smaller_or_equal_than_b"
    success_keys = (
        "column_A",
        "column_B",
        "ignore_row_if",
        "margin",
        "mostly",
    )
    default_kwarg_values = {
        "mostly": 1.0,
        "ignore_row_if": "neither",
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
    }

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: Optional[dict] = None,
        execution_engine: Optional[ExecutionEngine] = None,
    ) -> Any:
        """Custom implementation of the GE _validate method.

        This method is used on the tests to validate both the result
        of the tests themselves and if the unexpected index list
        is correctly generated.
        The GE test logic does not do this validation, and thus
        we need to make it manually.

        Args:
            configuration: Configuration used in the test.
            metrics: Test result metrics.
            runtime_configuration: Configuration used when running the expectation.
            execution_engine: Execution Engine where the expectation was run.

        Returns:
            Dictionary with the result of the validation.
        """
        return validate_result(
            self,
            configuration,
            metrics,
            runtime_configuration,
            execution_engine,
            ColumnPairMapExpectation,
        )


"""Mandatory block of code. If it is removed the expectation will not be available."""
if __name__ == "__main__":
    # test the custom expectation with the function `print_diagnostic_checklist()`
    ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()
\n
\n\n
Naming Conventions
\n\n
Your expectation's name should start with expect.
\n\n
The name of the file must be the name of the expectation written in snake case. Ex: expect_column_length_match_input_length
\n\n
The name of the class must be the name of the expectation written in camel case. Ex: ExpectColumnLengthMatchInputLength
\n\n
File Structure
\n\n
The file contains two main sections:
\n\n
\n
the definition of the metric that we are tracking (where we define the logic of the expectation);
\n
the definition of the expectation
\n
\n\n
Metric Definition
\n\n
In this section we define the logic of the expectation. This needs to follow a certain structure:
\n\n
Code Structure
\n\n
1) The class you define needs to extend one of the Metric Providers defined by Great Expectations that corresponds \nto your expectation's type. More info on the metric providers.
\n\n
2) You need to define the name of your metric. This name must be unique and must follow the structure: type of expectation.name of metric, e.g., column_pair_values.a_smaller_or_equal_than_b. Types of expectations: column_values, multicolumn_values, column_pair_values, table_rows, table_columns.
\n\n
3) Any GX default parameters that are necessary to calculate your metric must be defined as \"condition_domain_keys\".
\n\n
4) Any additional parameters that are necessary to calculate your metric must be defined as \"condition_value_keys\".
\n\n
5) The logic of your expectation must be defined for the SparkDFExecutionEngine in order to be run on the Lakehouse.
\n\n
\n
1) class ColumnMapMetric(ColumnMapMetricProvider):
    """Asserts that a column matches a pattern."""

    2) condition_metric_name = "column_pair_values.a_smaller_or_equal_than_b"
    3) condition_domain_keys = (
        "batch_id",
        "table",
        "column_A",
        "column_B",
        "ignore_row_if",
    )
    4) condition_value_keys = ("margin",)

    5) @column_pair_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: ColumnPairMapMetricProvider,
        column_A: Any,
        column_B: Any,
        margin: Any,
        **kwargs: dict,
    ) -> Any:
        """Implementation of the expectation's logic.

        Args:
            column_A: Value of the row of column_A.
            column_B: Value of the row of column_B.
            margin: margin value to be added to column_b.
            kwargs: dict with additional parameters.

        Returns:
            If the condition is met.
        """
        if margin is None:
            approx = 0
        elif not isinstance(margin, (int, float, complex)):
            raise TypeError(
                f"margin must be one of int, float, complex."
                f" Found: {margin} as {type(margin)}"
            )
        else:
            approx = margin  # type: ignore

        return column_A <= column_B + approx  # type: ignore
\n
\n\n
Expectation Definition
\n\n
In this section we define the expectation. This needs to follow a certain structure:
\n\n
Code Structure
\n\n
1) The class you define needs to extend one of the Expectations defined by Great Expectations that corresponds to your expectation's type.
\n\n
2) You must define an "examples" object where you define at least one success and one failure of your expectation to demonstrate its logic. The result format must be set to COMPLETE, and you must set the unexpected_index_column_names variable.
\n\n
\n\n
For any examples where you will have unexpected results you must define unexpected_index_list in your \"out\" element.\nThis will be validated during the testing phase.
\n\n
\n\n
3) The metric must be the same you defined in the metric definition.
\n\n
4) You must define all additional parameters that the user has to/should provide to the expectation.
\n\n
5) You should define any default values for your expectations parameters.
\n\n
6) You must define the _validate method as shown in the example. You must call the validate_result function inside your _validate method; this process adds a validation of the unexpected index list in the examples.
\n\n
\n\n
If your custom expectation requires any extra validations, or you require additional fields to be returned on the final dataframe, you can add them in this function. The validate_result method has two optional parameters (partial_success and partial_result) that can be used to pass the result of additional validations and to add more information to the result key of the returned dict, respectively.
\n\n
\n\n
\n
1) class ExpectColumnPairAToBeSmallerOrEqualThanB(ColumnPairMapExpectation):
    """Expect values in column A to be lower or equal than column B.

    Args:
        column_A: The first column name.
        column_B: The second column name.
        margin: additional approximation to column B value.

    Keyword Args:
        allow_cross_type_comparisons: If True, allow
            comparisons between types (e.g. integer and string).
            Otherwise, attempting such comparisons will raise an exception.
        ignore_row_if: "both_values_are_missing",
            "either_value_is_missing", "neither" (default).
        result_format: Which output mode to use:
            `BOOLEAN_ONLY`, `BASIC` (default), `COMPLETE`, or `SUMMARY`.
        include_config: If True (default), then include the expectation config
            as part of the result object.
        catch_exceptions: If True, then catch exceptions and
            include them as part of the result object. Default: False.
        meta: A JSON-serializable dictionary (nesting allowed)
            that will be included in the output without modification.

    Returns:
        An ExpectationSuiteValidationResult.
    """

    2) examples = [
        {
            "dataset_name": "Test Dataset",
            "data": {
                "a": [11, 22, 50],
                "b": [10, 21, 100],
                "c": [9, 21, 30],
            },
            "schemas": {
                "spark": {"a": "IntegerType", "b": "IntegerType", "c": "IntegerType"}
            },
            "tests": [
                {
                    "title": "negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_A": "a",
                        "column_B": "c",
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["c"],
                            "include_unexpected_rows": True,
                        },
                    },
                    "out": {
                        "success": False,
                        "unexpected_index_list": [
                            {"c": 9, "a": 11},
                            {"c": 21, "a": 22},
                            {"c": 30, "a": 50},
                        ],
                    },
                },
                {
                    "title": "positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_A": "a",
                        "column_B": "b",
                        "margin": 1,
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["a"],
                        },
                    },
                    "out": {"success": True},
                },
            ],
        },
    ]

    3) map_metric = "column_values.pattern_match"
    4) success_keys = (
        "validation_regex",
        "mostly",
    )
    5) default_kwarg_values = {
        "ignore_row_if": "never",
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "mostly": 1,
    }

    6) def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: Optional[dict] = None,
        execution_engine: Optional[ExecutionEngine] = None,
    ) -> dict:
        """Custom implementation of the GX _validate method.

        This method is used on the tests to validate both the result
        of the tests themselves and if the unexpected index list
        is correctly generated.
        The GX test logic does not do this validation, and thus
        we need to make it manually.

        Args:
            configuration: Configuration used in the test.
            metrics: Test result metrics.
            runtime_configuration: Configuration used when running the expectation.
            execution_engine: Execution Engine where the expectation was run.

        Returns:
            Dictionary with the result of the validation.
        """
        return validate_result(self, configuration, metrics)
\n
\n\n
Printing the Expectation Diagnostics
\n\n
Your expectations must include the ability to call the Great Expectations diagnostic function in order to be validated.
\n\n
In order to do this, the following block of code must be present.
\n\n
"""Mandatory block of code. If it is removed the expectation will not be available."""
if __name__ == "__main__":
    # test the custom expectation with the function `print_diagnostic_checklist()`
    ExpectColumnPairAToBeSmallerOrEqualThanB().print_diagnostic_checklist()
\n
\n\n
Creation Process
\n\n
1) Create a branch from lakehouse engine.
\n\n
2) Create a custom expectation with your specific logic:
\n\n\n
All new expectations must be placed inside folder /lakehouse_engine/dq_processors/custom_expectations.
\n
The name of the expectation must be added to the file /lakehouse_engine/core/definitions.py, to the variable: CUSTOM_EXPECTATION_LIST.
\n
All new expectations must be tested on /tests/feature/custom_expectations/test_custom_expectations.py.\nIn order to create a new test for your custom expectation it is necessary to:
\n\n\n
\n
Copy one of the expectation folders in tests/resources/feature/custom_expectations renaming it to your custom expectation.
\n
Make any necessary changes on the data/schema file present.
\n
On /tests/feature/custom_expectations/test_custom_expectations.py, add a scenario to test your expectation; all expectations must be tested in batch and streaming mode. The test is implemented to generate an ACON based on each scenario's data.
\n
Test your developments to check that everything is working as intended.
\n
\n\n
3) When the development is completed, create a pull request with your changes.
\n\n
4) Your expectation will be available with the next release of the lakehouse engine that happens after your pull request is approved. This means that you need to upgrade your version of the lakehouse engine in order to use it.
\n\n
Usage
\n\n
Custom Expectations are available to use like any other expectations provided by Great Expectations.
\n\n
Parameters
\n\n
Depending on the type of expectation you are defining some parameters are expected by default. \nEx: A ColumnMapExpectation has a default \"column\" parameter.
\n\n
Mostly
\n\n
Mostly is a standard parameter for a subset of expectations that is used to define a threshold for the failure of an expectation. Ex: a mostly value of 0.7 makes it so that the expectation only fails if more than 30% of the records have a negative result (i.e., the expectation succeeds as long as at least 70% of the records pass).
\n\n
Result Format
\n\n
Great Expectations has several different types of result formats \nfor the expectations results. The lakehouse engine requires the result format to be set to \"COMPLETE\" in order to tag \nthe lines where the expectations failed.
\n\n
unexpected_index_column_names
\n\n
Inside this key you must define what columns are used as an index inside your data. If this is set and the result \nformat is set to \"COMPLETE\" a list with the indexes of the lines that failed the validation will be returned by \nGreat Expectations.\nThis information is used by the Lakehouse Engine to tag the lines in error after the fact. The additional tests \ninside the _validate method verify that the custom expectation is tagging these lines correctly.
The DQValidator algorithm allows DQ validations isolated from the data load (it only reads the data and applies data quality validations). With this algorithm you have the capacity to apply the Lakehouse-Engine Data Quality Process, using Great Expectations functions, directly to a specific dataset, also making use of all the InputSpecs available in the engine.
\n\n
Validating the Data Quality, using this algorithm, is a matter of defining the data you want to read and the validations you want to do to your data, detailing the great expectations functions you want to apply on the data to assess its quality.
\n\n
\n\n
This algorithm also gives the possibility to restore a previous version of a delta table or delta files in case the DQ\nprocess raises any exception. Please use it carefully!! You may lose important commits and data. Moreover, this will\nhighly depend on the frequency that you run your Data Quality validations. If you run your data loads daily and Data\nQuality validations weekly, and you define the restore_prev_version to true, this means that the table will be restored\nto the previous version, but the error could have happened 4 or 5 versions before.
\n\n
\n\n
When to use?
\n\n
\n
Post-Load validation: check quality of data already loaded to a table/location
\n
Pre-Load validation: check quality of the data you want to load (check DQ by reading a set of files in a specific\nlocation...)
\n
Validation of a DataFrame computed in the notebook itself (e.g. check data quality after joining or filtering\ndatasets, using the computed DataFrame as input for the validation)
\n
\n\n
This algorithm also gives teams some freedom to:
\n\n
\n
Schedule isolated DQ Validations to run periodically, with the frequency they need;
\n
Define a DQ Validation process as an end-to-end test of the respective data product.
\n
\n\n
How to use?
\n\n
All of these configurations are passed via the ACON to instantiate\na DQValidatorSpec object. The DQValidator algorithm uses an\nACON to configure its execution. In DQValidatorSpec you can\nfind the meaning of each ACON property.
On this page you will also find the following examples of usage:
\n\n\n
Dataframe as input & Success on the DQ Validation
\n
Table as input & Failure on DQ Validation & Restore previous version
\n
Files as input & Failure on DQ Validation & Fail_on_error disabled
\n
Files as input & Failure on DQ Validation & Critical functions defined
\n
Files as input & Failure on DQ Validation & Max failure percentage defined
\n\n\n
Example 1 : Dataframe as input & Success on the DQ Validation
\n\n
This example focuses on using a dataframe, computed in this notebook, directly in the input spec. First, a new\nDataFrame is generated as a result of the join of data from two tables (dummy_deliveries and dummy_pd_article) and\nsome DQ Validations are applied on top of this dataframe.
Example 2: Table as input & Failure on DQ Validation & Restore previous version
\n\n
In this example we are using a table as input to validate the data that was loaded. Here, we are forcing the DQ Validations to fail in order to show the possibility of restoring the table to the previous version.
\n\n
\n\n
Be careful when using the feature of restoring a previous version of a delta table or delta files. You may\nlose important commits and data. Moreover, this will highly depend on the frequency that you run your Data Quality\nvalidations. If you run your data loads daily and Data Quality validations weekly, and you define the\nrestore_prev_version to true, this means that the table will be restored to the previous version, but the error\ncould have happened 4 or 5 versions before (because loads are daily, validations are weekly).
\n\n
\n\n
Steps followed in this example to show how the restore_prev_version feature works.
\n\n\n
Insert rows into the dummy_deliveries table to adjust the total numbers of rows and make the DQ process fail.
\n
Use the \"DESCRIBE HISTORY\" statement to check the number of versions available on the table and check the version\nnumber resulting from the insertion to the table.
\n
Execute the DQ Validation, using the configured acon (based on reading the dummy_deliveries table and setting the \nrestore_prev_version to true). Checking the logs of the process, you can see that the data did not pass all the \nexpectations defined and that the table version restore process was triggered.
\n
Re-run a \"DESCRIBE HISTORY\" statement to check that the previous version of the table was restored and thus, the row inserted in the beginning of the process is no longer present in the table.
\n\n\n
\n
from lakehouse_engine.engine import execute_dq_validation

# Force failure of data quality by adding new row
spark.sql("""INSERT INTO my_database.dummy_deliveries VALUES (7, 1, 20180601, 71, "article1", "delivered")""")


# Check history of the table
spark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")

acon = {
    "input_spec": {
        "spec_id": "deliveries_input",
        "read_type": "batch",
        "db_table": "my_database.dummy_deliveries",
    },
    "dq_spec": {
        "spec_id": "dq_deliveries",
        "input_id": "deliveries_input",
        "dq_type": "validator",
        "bucket": "my_data_product_bucket",
        "data_docs_bucket": "my_dq_data_docs_bucket",
        "data_docs_prefix": "dq/my_data_product/data_docs/site/",
        "tbl_to_derive_pk": "my_database.dummy_deliveries",
        "dq_functions": [
            {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
            {"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 19}},
        ],
    },
    "restore_prev_version": True,
}

execute_dq_validation(acon=acon)

# Check that the previous version of the table was restored
spark.sql("""DESCRIBE HISTORY my_database.dummy_deliveries""")
\n
\n\n
Example 3: Files as input & Failure on DQ Validation & Fail_on_error disabled
\n\n
In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail, however disabling the \"fail_on_error\" configuration,\nso the algorithm warns about the expectations that failed but the process/the execution of the algorithm doesn't fail.
Example 4: Files as input & Failure on DQ Validation & Critical functions defined
\n\n
In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail by using the critical functions feature, which will throw an error\nif any of the functions fails.
Example 5: Files as input & Failure on DQ Validation & Max failure percentage defined
\n\n
In this example we are using a location as input to validate the files in a specific folder.\nHere, we are forcing the DQ Validations to fail by using the max_percentage_failure,\nwhich will throw an error if the percentage of failures surpasses the defined maximum threshold.
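A hedged sketch of the dq_spec variations used in examples 3 to 5. Only the relevant keys are shown (the surrounding ACON has the same shape as in example 2), and the exact structure of the critical_functions entries is an assumption based on the dq_functions format.

# Example 3: warn instead of failing the execution.
dq_spec_fail_on_error_disabled = {
    # ... same base dq_spec keys as in example 2 ...
    "fail_on_error": False,
}

# Example 4: fail immediately if a critical function fails (assumed to use the
# same {"function": ..., "args": ...} structure as dq_functions).
dq_spec_critical_functions = {
    # ... same base dq_spec keys as in example 2 ...
    "critical_functions": [
        {
            "function": "expect_table_row_count_to_be_between",
            "args": {"min_value": 15, "max_value": 19},
        }
    ],
}

# Example 5: fail only if more than 20% of the dq_functions fail.
dq_spec_max_percentage = {
    # ... same base dq_spec keys as in example 2 ...
    "max_percentage_failure": 0.2,
}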
Unlike the DataLoader, the DQValidator algorithm only allows, for now, one input_spec (instead of a list of input_specs) and one dq_spec (instead of a list of dq_specs). There are plans and efforts already initiated to support lists of input_specs and dq_specs. However, you can prepare a DataFrame which joins more than one source and use it as input, in case you need to assess the Data Quality of different sources at the same time. Alternatively, you can also show interest in any enhancement of this feature, as well as contribute yourself.
This scenario illustrates the minimal configuration that you can have to use dq_specs, in which it uses the required parameters spec_id, input_id, dq_type, bucket and dq_functions, and the optional parameter data_docs_bucket. This parameter allows you to store the GX documentation in another bucket that can be used to make your data docs available, in the DQ Web App (GX UI), without giving users access to your bucket. The data_docs_bucket property supersedes the bucket property only for data docs storage.
\n\n
Regarding the dq_functions, it uses 3 functions (retrieved from the expectations supported by GX), which check:
\n\n
\n
expect_column_to_exist - if a column exists in the data;
\n
expect_table_row_count_to_be_between - if the row count of the data is between the defined interval;
\n
expect_table_column_count_to_be_between - if the number of columns in the data is below the max value defined.
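A minimal sketch of such a dq_specs entry (the spec ids, bucket names and the row/column count boundaries are illustrative; this list would be plugged into the ACON under the dq_specs key):

dq_specs = [
    {
        "spec_id": "dq_dummy_deliveries",
        "input_id": "dummy_deliveries_bronze",
        "dq_type": "validator",
        "bucket": "my_data_product_bucket",
        "data_docs_bucket": "my_shared_dq_data_docs_bucket",
        "dq_functions": [
            {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
            {
                "function": "expect_table_row_count_to_be_between",
                "args": {"min_value": 19, "max_value": 25},
            },
            {
                "function": "expect_table_column_count_to_be_between",
                "args": {"max_value": 10},
            },
        ],
    }
]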
These scenarios store the results of the dq_specs into a result sink. For that, both scenarios include parameters defining the specific table and location (result_sink_db_table and result_sink_location) where the results are expected to be stored. With this configuration, people can, later on, check the history of the DQ executions using the configured table/location, as shown below. You can configure saving the output of the results in the result sink following two approaches:
\n\n
\n
Denormalized/exploded Data Model (recommended) - the results are stored in a detailed format in which\npeople are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments.
\n
\n\n
\n\n
\n
| ... | source | column | max_value | min_value | expectation_type | expectation_success | observed_value | run_time_year | ... |
|---|---|---|---|---|---|---|---|---|---|
| all columns from raw + more | deliveries | salesorder | null | null | expect_column_to_exist | TRUE | null | 2023 | ... |
| all columns from raw + more | deliveries | null | null | null | expect_table_row_count_to_be_between | TRUE | 23 | 2023 | ... |
| all columns from raw + more | deliveries | null | null | null | expect_table_column_count_to_be_between | TRUE | 6 | 2023 | ... |
\n
\n\n
\n\n
\n
Raw Format Data Model (not recommended) - the results are stored in the raw format that Great Expectations outputs. This is not recommended as the data will be highly nested and in a string format (to prevent problems with schema changes), which makes analysis and the creation of a dashboard on top of it much harder.
\n
\n\n
\n\n
\n
| checkpoint_config | run_name | run_time | run_results | success | validation_result_identifier | spec_id | input_id |
|---|---|---|---|---|---|---|---|
| entire configuration | 20230323-...-dq_validation | 2023-03-23T15:11:32.225354+00:00 | results of the 3 expectations | true/false for the run | identifier | spec_id | input_id |
\n
\n\n
\n\n
\n\n
\n
More configurations can be applied to the result sink, such as the file format and partitions.
\n
It is recommended to:\n
\n
Use the same result sink table/location for all dq_specs across different data loads, from different \nsources, in the same Data Product.
\n
Use the parameter source (only available with "result_sink_explode": True) in the dq_specs, as used in both scenarios, with the name of the data source, to make it easier to distinguish sources in the analysis. If not specified, the input_id of the dq_spec will be considered the source.
\n
These recommendations will enable richer analysis/dashboards at the Data Product level, considering all the different sources and data loads that the Data Product has.
\n
\n
\n\n
\n\n
1. Result Sink Exploded (Recommended)
\n\n
This scenario stores the DQ Results (results produced by the execution of the dq_specs) in the Result Sink, in a detailed format, in which people are able to analyse them by Data Quality Run, by expectation_type and by keyword arguments. This is the recommended approach since it makes the analysis on top of the result sink much easier and faster.
\n\n
For achieving the exploded data model, this scenario introduces the parameter result_sink_explode, which is a flag to determine if the output table/location should have the columns exploded (as True) or not (as False). Default: True, but it is still provided explicitly in this scenario for demo purposes. The table/location will include a schema which contains general columns, statistic columns, arguments of expectations, and others; thus, part of the schema will always be populated, while another part will depend on the expectations chosen.
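A hedged sketch of a dq_spec configured with a result sink in the exploded (recommended) format; the table and location names are illustrative:

dq_spec_with_result_sink = {
    "spec_id": "dq_dummy_deliveries",
    "input_id": "dummy_deliveries_bronze",
    "dq_type": "validator",
    "bucket": "my_data_product_bucket",
    "source": "deliveries",  # easier to distinguish sources in the analysis
    "result_sink_db_table": "my_database.dq_result_sink",
    "result_sink_location": "s3://my_data_product_bucket/dq/dq_result_sink/",
    "result_sink_explode": True,  # default, shown explicitly for demo purposes
    "dq_functions": [
        {"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
    ],
}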
To check the history of the DQ results, you can run commands like:
\n\n
\n
the table: display(spark.table(\"my_database.dq_result_sink\"))
\n
the location: display(spark.read.format(\"delta\").load(\"my_dq_path/dq_result_sink/\"))
\n
\n\n
2. Raw Result Sink
\n\n
This scenario is very similar to the previous one, but it changes the parameter result_sink_explode to False so that\nit produces a raw result sink output containing only one row representing the full run of dq_specs (no\nmatter the amount of expectations/dq_functions defined there). Being a raw output, it is not a\nrecommended approach, as it will be more complicated to analyse and make queries on top of it.
Data quality is essential for any organisation that relies on data to make informed decisions. \nHigh-quality data provides accurate, reliable, and timely information that enables organisations to identify\nopportunities, mitigate risks, and optimize their operations. In contrast, low-quality data can lead to incorrect\nconclusions, faulty decisions, and wasted resources.
\n\n
There are several common issues that can compromise data quality, such as:
\n\n
\n
data entry errors;
\n
data duplication;
\n
incomplete / inconsistent data;
\n
changes where data is collected (e.g. sources);
\n
faulty data processing, such as inaccurate data cleansing or transformations.
\n
\n\n
Therefore, implementing data quality controls, such as data validation rules, and regularly monitoring data for \naccuracy and completeness is key for any organisation.
\n\n
One of these controls that can be applied is the DQ Row Tagging Strategy so that you not only apply validations on \nyour data to ensure Data Quality, but you also tag your data with the results of the Data Quality validations \nproviding advantages like:
\n\n
\n
Transparency for downstream and upstream consumers;
\n
Data Observability and Reliability;
\n
More trust over the data;
\n
Anomaly Detection;
\n
Easier and faster discovery of Data Quality problems, and, consequently faster resolution;
\n
Makes it easier to deal with integrations with other systems and migrations (you can have validations capturing that a column was changed or simply disappeared);
\n
\n\n
\n\n
When using the DQ Row Tagging approach data availability will take precedence over Data Quality, meaning \nthat all the data will be introduced into the final target (e.g. table or location) no matter what Data Quality\nissues it is having.
\n\n
\n\n
Different Types of Expectations:
\n\n
\n
Table Level
\n
Column Aggregated Level
\n
Query Level
\n
Column Values (row level)
\n
Column Pair Value (row level)
\n
Multicolumn Values (row level)
\n
\n\n
The expectations highlighted as row level will be the ones enabling tagging failures on specific rows and adding the details about each failure (they affect the field run_row_result inside dq_validations). The expectations at other levels (not row level) influence the overall result of the Data Quality execution, but won't be used to tag specific rows (they affect the field run_success only, so you can even have situations in which you get run_success False and run_row_success True for all rows).
\n\n
How does the Strategy work?
\n\n
The strategy relies mostly on the 6 arguments below.
\n\n
\n\n
When you specify "tag_source_data": True, the arguments fail_on_error, gx_result_format and result_sink_explode are automatically set to the expected values.
\n\n
\n\n
\n
unexpected_rows_pk - the list of columns composing the primary key of the source data, used to identify the rows failing the DQ validations.
\n
tbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.
\n
gx_result_format - great expectations result format. Default: COMPLETE.
\n
tag_source_data - flag to enable the tagging strategy in the source data, adding the information of the DQ results in a column dq_validations. This column makes it possible to identify if the DQ run succeeded in general and, if not, it unlocks the insights to know which specific rows made the DQ validations fail and why. Default: False.
\n
\n\n
\n\n
It only works if result_sink_explode is True, result_format is COMPLETE and fail_on_error is False.
\n\n
\n\n
\n
fail_on_error - whether to fail the algorithm if the validations of your data in the DQ process failed.
\n
result_sink_explode - flag to determine if the output table/location should have the columns exploded (as True)\nor not (as False). Default: True.
\n
\n\n
\n\n
It is mandatory to provide one of the arguments (unexpected_rows_pk or tbl_to_derive_pk) when using \ntag_source_data as True. \nWhen tag_source_data is False, this is not mandatory, but still recommended.
\n\n
\n\n
\n\n
\n\n
The tagging strategy only works when tag_source_data is True, which automatically\nassigns the expected values for the parameters result_sink_explode (True), fail_on_error (False)\nand gx_result_format (\"COMPLETE\").
\n\n
\n\n
\n\n
For the DQ Row Tagging to work, in addition to configuring the aforementioned arguments in the dq_specs, \nyou will also need to add the dq_validations field into your table (your DDL statements, recommended) or \nenable schema evolution.
\n\n
\n\n
\n\n
The kwargs field is a string, because it can assume different schemas for different expectations and runs. It is useful to provide the complete picture of the row level failure and to allow filtering/joining with the result sink table, when there is one. Some examples of kwargs below:
\n\n
\n
{"column": "country", "min_value": 1, "max_value": 2, "batch_id": "o723491yyr507ho4nf3"} → example for expectations starting with expect_column_values (they always make use of "column", the other arguments vary).
\n
{"column_A": "country", "column_B": "city", "batch_id": "o723491yyr507ho4nf3"} → example for expectations starting with expect_column_pair (they make use of "column_A" and "column_B", the other arguments vary).
\n
{"column_list": ["col1", "col2", "col3"], "batch_id": "o723491yyr507ho4nf3"} → example for expectations starting with expect_multicolumn (they make use of "column_list", the other arguments vary). batch_id is common to all expectations, and it is an identifier for the batch of data being validated by Great Expectations.
\n
\n\n
\n\n
Example
\n\n
This scenario uses the row tagging strategy, which allows users to tag the rows that failed, making it easier to identify the problems in the validations.
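A hedged sketch of a dq_spec enabling the row tagging strategy (table, bucket and column names are illustrative):

dq_spec_with_row_tagging = {
    "spec_id": "dq_dummy_deliveries",
    "input_id": "dummy_deliveries_bronze",
    "dq_type": "validator",
    "bucket": "my_data_product_bucket",
    "tag_source_data": True,  # adds the dq_validations column to the source data
    "tbl_to_derive_pk": "my_database.dummy_deliveries",  # or provide unexpected_rows_pk instead
    "result_sink_db_table": "my_database.dq_result_sink",
    "dq_functions": [
        {"function": "expect_column_values_to_not_be_null", "args": {"column": "delivery_date"}},
    ],
    # fail_on_error, gx_result_format and result_sink_explode are automatically
    # set to the values required by the tagging strategy when tag_source_data is True.
}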
Running the cell below shows the new column created, named dq_validations, with information about the DQ validations.
display(spark.read.format("delta").load("s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/"))
\n\n
Performance and Limitations Trade-offs
\n\n
When using the DQ Row Tagging Strategy, by default we are using Great Expectations Result Format \"Complete\" with \nUnexpected Index Column Names (a primary key for the failures), meaning that for each failure, we are getting all \nthe distinct values for the primary key. After getting all the failures, we are applying some needed transformations \nand joining them with the source data, so that it can be tagged by filling the \"dq_validations\" column.
\n\n
Hence, this can definitely be a heavy and time-consuming operation on your data loads. To reduce this disadvantage you can cache the dataframe by passing "cache_df": True in your DQ Specs. In addition, always keep in mind that each expectation (dq_function) you add to your DQ Specs adds more time to your data loads, so always balance performance against the amount of validations you need.
\n\n
Moreover, Great Expectations is currently relying on the driver node to capture the results of the execution and \nreturn/store them. Thus, in case you have huge amounts of rows failing (let's say 500k or more) Great Expectations \nmight raise exceptions.
\n\n
In these situations, the data load will still happen and the data will still be tagged with the Data Quality validations information; however, you won't have the complete picture of the failures, so the raised_exceptions field is set to True, so that you can easily notice and debug it.
\n\n
Most of the time, if you have such an amount of rows failing, it will probably mean that something went wrong and you want to fix it as soon as possible (you do not really care about tagging specific rows, because you will not want your consumers to be consuming a million defective rows). However, if you still want to try to make it pass, you can try to increase your driver size and play with some Spark configurations like:
\n\n
\n
spark.driver.maxResultSize
\n
spark.task.maxFailures
\n
\n\n
For debugging purposes, you can also use a different Great Expectations Result Format like \"SUMMARY\" (adding in your DQ Spec\n\"gx_result_format\": \"SUMMARY\"), so that you get only a partial list of the failures, avoiding surpassing the driver\ncapacity.
\n\n
\n\n
When using a Result Format different from the default (\"COMPLETE\"), the flag \"tag_source_data\" will be \noverwritten to False, as the results of the tagging wouldn't be complete which could lead to erroneous \nconclusions from stakeholders (but you can always get the details about the result of the DQ execution in\nthe result_sink_location or result_sink_db_table that you have configured).
The scenarios presented on this page are similar, but their goal is to show what happens when a DQ expectation fails the validations.\nThe logs generated by the execution of the code will contain information regarding which expectation(s) have failed and why.
\n\n
1. Fail on Error
\n\n
In this scenario, the two parameters below are specified:
\n\n
\n
"fail_on_error": False - this parameter controls what happens if a DQ expectation fails. In case this is set to true (default), your job will fail/be aborted and an exception will be raised. In case this is set to false, a log message will be printed about the error (as shown in this scenario) and the result status will also be available in the result sink (if configured) and in the [data docs great expectation site](../data_quality.html#3-data-docs-website). In this scenario it is set to false to avoid failing the execution of the notebook.
\n
the max_value of the function expect_table_column_count_to_be_between is defined with a specific value so that this expectation fails the validations.
If you run the command below, you will be able to see that the success column has the value false for the last execution.
display(spark.table(RENDER_UTILS.render_content("my_database.dq_result_sink")))
\n\n
2. Critical Functions
\n\n
In this scenario, alternative parameters to fail_on_error are used:
\n\n
\n
critical_functions - this parameter defaults to None if not defined.\nIt controls what DQ functions are considered a priority and as such, it stops the validation\nand throws an execution error whenever a function defined as critical doesn't pass the test.\nIf any other function that is not defined in this parameter fails, an error message is printed in the logs.\nThis parameter has priority over fail_on_error.\nIn this specific example, after defining the expect_table_column_count_to_be_between as critical,\nit is made sure that the execution is stopped whenever the conditions for the function are not met.
\n
\n\n
Additionally, other parameters can also be defined, like:
\n\n
\n
max_percentage_failure - this parameter defaults to None if not defined.\nIt controls what percentage of the total functions can fail without stopping the execution of the validation.\nIf the threshold is surpassed the execution stops and a failure error is thrown.\nThis parameter has priority over fail_on_error and critical_functions.
\n
\n\n
You can also pair critical_functions with max_percentage_failure by defining something like a 0.6 max percentage of failure and also defining some critical functions. In this case, even if the threshold is respected, the list defined in critical_functions is still checked, as sketched below.
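A hedged sketch of pairing max_percentage_failure with critical_functions inside a dq_spec (only the relevant keys are shown; the structure of the critical_functions entries is assumed to match the dq_functions format):

dq_spec_combined = {
    # ... same base dq_spec keys as in the previous scenarios ...
    "max_percentage_failure": 0.6,  # tolerate up to 60% of the dq_functions failing
    "critical_functions": [
        {
            "function": "expect_table_column_count_to_be_between",
            "args": {"min_value": 1, "max_value": 6},
        }
    ],
    # even if the 0.6 threshold is respected, a failure in a critical
    # function still stops the execution.
}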
Checking if data reconciles, using this algorithm, is a matter of reading the truth data and the current data.\nYou can use any input specification compatible with the lakehouse engine to read truth or current data. On top\nof that, you can pass a truth_preprocess_query and a current_preprocess_query so you can preprocess the data before\nit goes into the actual reconciliation process. The reconciliation process is focused on joining truth\nwith current by all provided columns except the ones passed as metrics.
\n\n
In the table below, we present what a simple reconciliation would look like:
\n\n
\n\n
\n
| current_country | current_count | truth_country | truth_count | absolute_diff | perc_diff | yellow | red | recon_type |
|---|---|---|---|---|---|---|---|---|
| Sweden | 123 | Sweden | 120 | 3 | 0.025 | 0.1 | 0.2 | percentage |
| Germany | 2946 | Sweden | 2946 | 0 | 0 | 0.1 | 0.2 | percentage |
| France | 2901 | France | 2901 | 0 | 0 | 0.1 | 0.2 | percentage |
| Belgium | 426 | Belgium | 425 | 1 | 0.002 | 0.1 | 0.2 | percentage |
\n
\n\n
\n\n
The Reconciliator algorithm uses an ACON to configure its execution. You can find the meaning of each ACON property\nin ReconciliatorSpec object.
\n\n
Below there is an example of usage of reconciliator.
\n\n
\n
from lakehouse_engine.engine import execute_reconciliation

truth_query = """
    SELECT
        shipping_city,
        sum(sales_order_qty) as qty,
        order_date_header
    FROM (
        SELECT
            ROW_NUMBER() OVER (
                PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city
                ORDER BY changed_on desc
            ) as rank1,
            sales_order_header,
            sales_order_item,
            sales_order_qty,
            order_date_header,
            shipping_city
        FROM truth -- truth is a locally accessible temp view created by the lakehouse engine
        WHERE order_date_header = '2021-10-01'
    ) a
    WHERE a.rank1 = 1
    GROUP BY a.shipping_city, a.order_date_header
"""

current_query = """
    SELECT
        shipping_city,
        sum(sales_order_qty) as qty,
        order_date_header
    FROM (
        SELECT
            ROW_NUMBER() OVER (
                PARTITION BY sales_order_header, sales_order_schedule, sales_order_item, shipping_city
                ORDER BY changed_on desc
            ) as rank1,
            sales_order_header,
            sales_order_item,
            sales_order_qty,
            order_date_header,
            shipping_city
        FROM current -- current is a locally accessible temp view created by the lakehouse engine
        WHERE order_date_header = '2021-10-01'
    ) a
    WHERE a.rank1 = 1
    GROUP BY a.shipping_city, a.order_date_header
"""

acon = {
    "metrics": [{"metric": "qty", "type": "percentage", "aggregation": "avg", "yellow": 0.05, "red": 0.1}],
    "truth_input_spec": {
        "spec_id": "truth",
        "read_type": "batch",
        "data_format": "csv",
        "schema_path": "s3://my_data_product_bucket/artefacts/metadata/schemas/bronze/orders.json",
        "options": {
            "delimiter": "^",
            "dateFormat": "yyyyMMdd",
        },
        "location": "s3://my_data_product_bucket/bronze/orders",
    },
    "truth_preprocess_query": truth_query,
    "current_input_spec": {
        "spec_id": "current",
        "read_type": "batch",
        "data_format": "delta",
        "db_table": "my_database.orders",
    },
    "current_preprocess_query": current_query,
}

execute_reconciliation(acon=acon)
The lakehouse engine sensors are an abstraction over otherwise complex Spark code that can be executed in very small single-node clusters to check if an upstream system or data product contains new data since the last execution of our job. With this feature, we can trigger a job to run at more frequent intervals and, if the upstream does not contain new data, the rest of the job exits without creating bigger clusters to execute more intensive data ETL (Extraction, Transformation, and Loading).

How do Sensor-based jobs work?

With the sensors capability, data products in the lakehouse can sense if another data product or an upstream system (source system) has new data since the last successful job. We accomplish this through the approach illustrated above, which can be interpreted as follows:
1. A Data Product can check if Kafka, JDBC or any other source supported by the Lakehouse Engine Sensors contains new data, using the respective sensors;
2. The Sensor task may run in a very tiny single-node cluster to ensure cost efficiency (check sensor cost efficiency);
3. If the sensor has recognised that there is new data in the upstream, then you can start a different ETL Job Cluster to process all the ETL tasks (data processing tasks).

In the same way, a different Data Product can sense if an upstream Data Product has new data by using one of two options:

1. (Preferred) Sense the upstream Data Product's sensor control delta table;
2. Sense the upstream Data Product's data files in S3 (file sensor) or any of its delta tables (delta table sensor);
The Structure and Relevance of the Data Product's Sensors Control Table

The concept of a lakehouse engine sensor is based on a special delta table stored inside the data product that chooses to opt in for a sensor-based job. That table is used to control the status of the various sensors implemented by that data product. You can refer to the table below to understand the sensor delta table structure:
| Column Name | Type | Description |
|-------------|------|-------------|
| sensor_id | STRING | A unique identifier of the sensor in a specific job. This unique identifier is really important because it is used by the engine to identify if there is new data in the upstream. Each sensor in each job should have a different sensor_id. If you attempt to create 2 sensors with the same sensor_id, the engine will fail. |
| assets | ARRAY&lt;STRING&gt; | A list of assets (e.g., tables or dataset folders) that are considered as available to consume downstream after the sensor has status PROCESSED_NEW_DATA. |
| status | STRING | Status of the sensor. Can either be: ACQUIRED_NEW_DATA - when the sensor in a job has recognised that there is new data from the upstream but the job where the sensor is located was still not successfully executed; PROCESSED_NEW_DATA - when the job where the sensor is located has processed all the tasks in that job. |
| status_change_timestamp | STRING | Timestamp when the status has changed for the last time. |
| checkpoint_location | STRING | Base location of the Spark streaming checkpoint, when applicable (i.e., when the type of sensor uses Spark streaming checkpoints to identify if the upstream has new data). E.g., Spark streaming checkpoints are used for Kafka, Delta and File sensors. |
| upstream_key | STRING | Upstream key (e.g., used to store an attribute name from the upstream so that new data can be detected automatically). This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the name of a field in the JDBC upstream that contains the values that will allow us to identify new data (e.g., a timestamp in the upstream that tells us when the record was loaded into the database). |
| upstream_value | STRING | Upstream value (e.g., used to store the max attribute value from the upstream so that new data can be detected automatically). This is the value for upstream_key. This is useful for sensors that do not rely on Spark streaming checkpoints, like the JDBC sensor, as it stores the value of a field in the JDBC upstream that contains the maximum value that was processed by the sensor, and is therefore useful for recognising that there is new data in the upstream (e.g., the value of a timestamp attribute in the upstream that tells us when the record was loaded into the database). |

Note: to make use of the sensors, you will need to add this table to your data product.
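As a starting point, here is a minimal sketch of how such a control table could be created, using the column names and types from the table above. The table name my_database.lakehouse_engine_sensors and the absence of extra table properties are assumptions; adapt them to your data product's conventions.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Minimal sketch of a sensors control delta table (name is a placeholder assumption).
spark.sql(
    """
    CREATE TABLE IF NOT EXISTS my_database.lakehouse_engine_sensors (
        sensor_id STRING,
        assets ARRAY<STRING>,
        status STRING,
        status_change_timestamp STRING,
        checkpoint_location STRING,
        upstream_key STRING,
        upstream_value STRING
    )
    USING DELTA
    """
)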
How is it different from scheduled jobs?

Sensor-based jobs are still scheduled, but they can be scheduled with a higher frequency, as they are more cost-efficient than ramping up a multi-node cluster meant to do heavy ETL, only to find out that the upstream does not have new data.
Are sensor-based jobs cost-efficient?

For the same schedule (e.g., 4 times a day), sensor-based jobs are more cost-efficient than scheduling a regular job, because with sensor-based jobs you can start a very tiny single-node cluster, and only if there is new data in the upstream is the bigger ETL cluster spun up. For this reason, they are considered more cost-efficient. Moreover, if you have very hard SLAs to comply with, you can also play with alternative architectures where you have several sensors in a continuous (always running) cluster, which then keeps triggering the respective data processing jobs whenever there is new data.
Sensor Steps

Create your sensor task for the upstream source. Examples of available sources are shown below.

This shows how to create a Sensor to detect new data from a Delta Table.
Configuration required to have a Sensor

- sensor_id: A unique identifier of the sensor in a specific job.
- assets: List of assets considered for the sensor, which are considered as available once the sensor detects new data and the status is ACQUIRED_NEW_DATA.
- control_db_table_name: Name of the sensor control table.
- input_spec: Input spec with the upstream source.
- preprocess_query: Query to filter data returned by the upstream.
This parameter is only needed when the upstream data has to be filtered; in that case, a custom query should be created with the source table named sensor_new_data.
Using fail_on_empty_result=False, the execute_sensor function returns a boolean representing whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.
It makes use of generate_sensor_query to generate the preprocess_query, unlike the delta_table sensor.
Data from another sensor's delta table will be consumed in streaming mode. If there is any new data, it will trigger the condition to proceed to the next task.

fail_on_empty_result as True (default and SUGGESTED)
Using fail_on_empty_result=False, the execute_sensor function returns a boolean representing whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.
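For illustration, here is a minimal sketch of a sensor ACON reading from an upstream delta table. Its shape follows the file-based example further below; the upstream table name, checkpoint location and control table name are placeholder assumptions.

from lakehouse_engine.engine import execute_sensor

# Sketch of a delta table sensor ACON (names and paths are placeholders).
acon = {
    "sensor_id": "MY_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "delta",
        "db_table": "my_database.my_upstream_table",
    },
    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
    "fail_on_empty_result": True,
}

execute_sensor(acon=acon)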
Using these sensors and consuming the data in streaming mode, if any new file is added to the file location, it will automatically trigger the condition to proceed to the next task.

fail_on_empty_result as True (default and SUGGESTED)
from lakehouse_engine.engine import execute_sensor

acon = {
    "sensor_id": "MY_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "csv",  # You can use any of the data formats supported by the lakehouse engine, e.g.: "avro|json|parquet|csv|delta|cloudfiles"
        "location": "s3://my_data_product_bucket/path",
    },
    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
    "fail_on_empty_result": True,
}

execute_sensor(acon=acon)
fail_on_empty_result as False

Using fail_on_empty_result=False, the execute_sensor function returns a boolean representing whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.
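A minimal sketch of that pattern is shown below, reusing the same ACON shape as the example above but with fail_on_empty_result set to False; the downstream actions are placeholders.

from lakehouse_engine.engine import execute_sensor

# Same shape as the ACON above, but returning a boolean instead of failing.
acon = {
    "sensor_id": "MY_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "csv",
        "location": "s3://my_data_product_bucket/path",
    },
    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
    "fail_on_empty_result": False,
}

acquired_new_data = execute_sensor(acon=acon)

if acquired_new_data:
    # Proceed with the downstream ETL tasks.
    print("New data detected upstream, proceeding with the ETL tasks.")
else:
    # Exit gracefully without spinning up a bigger ETL cluster.
    print("No new data detected upstream, skipping the ETL tasks.")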
This shows how to create a Sensor to detect new data from a JDBC table.

Configuration required to have a Sensor
- jdbc_args: Arguments of the JDBC upstream.
- generate_sensor_query: Generates a Sensor query to consume data from the upstream; this function can be used in the preprocess_query ACON option.
  - sensor_id: The unique identifier for the Sensor.
  - filter_exp: Expression to filter incoming new data. The placeholders ?upstream_key and ?upstream_value can be used, e.g., ?upstream_key > ?upstream_value, so that they are replaced by the respective values from the sensor control_db_table_name for this specific sensor_id.
  - control_db_table_name: Sensor control table name.
  - upstream_key: The key of custom sensor information used to control how to identify new data from the upstream (e.g., a time column in the upstream).
  - upstream_value: The first upstream value used to identify new data from the upstream (e.g., the value of a time present in the upstream). Note: this parameter only takes effect in the first run to detect if the upstream has new data. If it's empty, the default value applied is -2147483647.
  - upstream_table_name: Table name to consume the upstream value. If it's empty, the default value applied is sensor_new_data.
If you want to know more, please visit the definition of the class here.

Scenarios

This covers the following scenarios of using the Sensor:
Data from JDBC, in batch mode, will be consumed. If there is new data, based on the preprocess query over the source table, it will trigger the condition to proceed to the next task.
fail_on_empty_result as True (default and SUGGESTED)
Using fail_on_empty_result=False, the execute_sensor function returns a boolean representing whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.
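For illustration, below is a minimal sketch of a JDBC sensor ACON. It assumes generate_sensor_query is importable from lakehouse_engine.engine and shows the connection details under the input spec options as an assumption (the jdbc_args option described above may be used instead); the url, driver, table and control table names are placeholders.

from lakehouse_engine.engine import execute_sensor, generate_sensor_query

# Sketch of a JDBC sensor ACON (connection details and names are placeholders).
acon = {
    "sensor_id": "MY_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "batch",
        "data_format": "jdbc",
        "options": {
            "url": "jdbc:postgresql://my_host:5432/my_db",  # placeholder connection
            "driver": "org.postgresql.Driver",
            "dbtable": "my_schema.my_upstream_table",
            "user": "my_user",
            "password": "my_password",
        },
    },
    # Filter new data based on the values stored in the sensor control table.
    "preprocess_query": generate_sensor_query(
        sensor_id="MY_SENSOR_ID",
        filter_exp="?upstream_key > ?upstream_value",
        control_db_table_name="my_database.lakehouse_engine_sensors",
        upstream_key="load_timestamp",  # placeholder time column in the upstream
    ),
    "fail_on_empty_result": True,
}

execute_sensor(acon=acon)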
Data from Kafka, in streaming mode, will be consumed, so if there is any new data in the Kafka topic, it will trigger the condition to proceed to the next task.

fail_on_empty_result as True (default and SUGGESTED)
Using fail_on_empty_result=False, the execute_sensor function returns a boolean representing whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.
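For illustration, a minimal sketch of a Kafka sensor ACON is shown below. The options shown are standard Spark structured streaming Kafka options (kafka.bootstrap.servers, subscribe, startingOffsets); the broker addresses, topic and control table names are placeholder assumptions.

from lakehouse_engine.engine import execute_sensor

# Sketch of a Kafka sensor ACON (brokers, topic and names are placeholders).
acon = {
    "sensor_id": "MY_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "streaming",
        "data_format": "kafka",
        "options": {
            "kafka.bootstrap.servers": "broker1:9092,broker2:9092",
            "subscribe": "my_upstream_topic",
            "startingOffsets": "earliest",
        },
    },
    "base_checkpoint_location": "s3://my_data_product_bucket/checkpoints",
    "fail_on_empty_result": True,
}

execute_sensor(acon=acon)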
This shows how to create a Sensor to detect new data from a SAP LOGCHAIN table.

Configuration required to have a Sensor
- sensor_id: A unique identifier of the sensor in a specific job.
- assets: List of assets considered for the sensor, which are considered as available once the sensor detects new data and the status is ACQUIRED_NEW_DATA.
- control_db_table_name: Name of the sensor control table.
- input_spec: Input spec with the upstream source.
- preprocess_query: Query to filter data returned by the upstream. This parameter is only needed when the upstream data has to be filtered; in that case, a custom query should be created with the source table named sensor_new_data.
- base_checkpoint_location: Base location of the Spark streaming checkpoints used to identify if the upstream has new data.
- fail_on_empty_result: Flag representing if it should raise NoNewDataException when there is no new data detected from the upstream.
Specific configuration required to have a Sensor consuming a SAP BW/B4 upstream. The Lakehouse Engine provides two utility functions to make it easier to consume SAP as an upstream: generate_sensor_sap_logchain_query and generate_sensor_query.

- generate_sensor_sap_logchain_query: This function aims to create a temporary table with the timestamp from the SAP LOGCHAIN table, which is a process control table.

  Note: this temporary table only lives during runtime, and it is related to the SAP process control table but has no relationship or effect on the sensor control table.

  - chain_id: SAP chain ID of your process.
  - dbtable: SAP LOGCHAIN db table name, default: my_database.RSPCLOGCHAIN.
  - status: SAP chain status of your process, default: G.
  - engine_table_name: Name of the temporary table created from the upstream data, default: sensor_new_data. This temporary table will be used as the source in the query option.

- generate_sensor_query: Generates a Sensor query to consume data from the temporary table created via the prepareQuery option.

  - sensor_id: The unique identifier for the Sensor.
  - filter_exp: Expression to filter incoming new data. The placeholders ?upstream_key and ?upstream_value can be used, e.g., ?upstream_key > ?upstream_value, so that they are replaced by the respective values from the sensor control_db_table_name for this specific sensor_id.
  - control_db_table_name: Sensor control table name.
  - upstream_key: The key of custom sensor information used to control how to identify new data from the upstream (e.g., a time column in the upstream).
  - upstream_value: The first upstream value used to identify new data from the upstream (e.g., the value of a time present in the upstream). Note: this parameter only takes effect in the first run to detect if the upstream has new data. If it's empty, the default value applied is -2147483647.
  - upstream_table_name: Table name to consume the upstream value. If it's empty, the default value applied is sensor_new_data.

    Note: in case of using generate_sensor_sap_logchain_query, the default value for the temporary table is sensor_new_data, so if a different value is passed in engine_table_name, this parameter should have the same value.
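A rough sketch of how these pieces could fit together is shown below. It assumes both utility functions are importable from lakehouse_engine.engine, that the SAP upstream is read with the sap_b4 data format, and that the generated statements are passed through the prepareQuery and query options; the connection details, chain ID and table names are placeholders.

from lakehouse_engine.engine import (
    execute_sensor,
    generate_sensor_query,
    generate_sensor_sap_logchain_query,
)

# Rough sketch of a SAP LOGCHAIN sensor ACON (format, options and names are assumptions).
acon = {
    "sensor_id": "MY_SAP_SENSOR_ID",
    "assets": ["MY_SENSOR_ASSETS"],
    "control_db_table_name": "my_database.lakehouse_engine_sensors",
    "input_spec": {
        "spec_id": "sensor_upstream",
        "read_type": "batch",
        "data_format": "sap_b4",  # assumption: could also be sap_bw depending on the upstream
        "options": {
            "url": "jdbc:sap://my_sap_host:30015",  # placeholder connection details
            "user": "my_user",
            "password": "my_password",
            # Creates the temporary sensor_new_data table from the SAP LOGCHAIN table.
            "prepareQuery": generate_sensor_sap_logchain_query(
                chain_id="MY_CHAIN_ID",
                dbtable="my_database.RSPCLOGCHAIN",
            ),
            # Consumes the temporary table created by the prepareQuery above.
            "query": generate_sensor_query(
                sensor_id="MY_SAP_SENSOR_ID",
                filter_exp="?upstream_key > ?upstream_value",
                control_db_table_name="my_database.lakehouse_engine_sensors",
                upstream_key="load_timestamp",  # placeholder time column
            ),
        },
    },
    "fail_on_empty_result": True,
}

execute_sensor(acon=acon)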
If you want to know more, please visit the definition of the class here.

Scenarios

This covers the following scenarios of using the Sensor:
Using fail_on_empty_result=False, the execute_sensor function returns a boolean representing whether it has acquired new data. This value can be used to decide whether or not to execute the next steps.