From f6d2bed62b80359e430459fd2470aaaa4a0e41dc Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Sat, 11 Jan 2025 14:19:34 +0100 Subject: [PATCH] Identity Columns cntd. --- docs/ColumnWithDefaultExprUtils.md | 39 ++++++++----- docs/DeltaColumnBuilder.md | 61 ++++++++++++++++++--- docs/commands/merge/MergeIntoCommandBase.md | 9 +++ docs/identity-columns/IdentityColumn.md | 17 ++++++ docs/identity-columns/index.md | 27 ++++++++- docs/spark-connector/DeltaSourceUtils.md | 17 +++--- 6 files changed, 138 insertions(+), 32 deletions(-) create mode 100644 docs/identity-columns/IdentityColumn.md diff --git a/docs/ColumnWithDefaultExprUtils.md b/docs/ColumnWithDefaultExprUtils.md index aa59cbd29a..b26d3c8cd2 100644 --- a/docs/ColumnWithDefaultExprUtils.md +++ b/docs/ColumnWithDefaultExprUtils.md @@ -7,7 +7,7 @@ IDENTITY column is not supported ``` -## IDENTITY_MIN_WRITER_VERSION +## IDENTITY_MIN_WRITER_VERSION { #IDENTITY_MIN_WRITER_VERSION } `ColumnWithDefaultExprUtils` uses `6` as the [minimum version of a writer](Protocol.md#minWriterVersion) for writing to `IDENTITY` columns. 
@@ -16,7 +16,7 @@ * `ColumnWithDefaultExprUtils` is used to [satisfyProtocol](#satisfyProtocol) * `Protocol` utility is used to [determine the required minimum protocol](Protocol.md#requiredMinimumProtocol) -## columnHasDefaultExpr +## columnHasDefaultExpr { #columnHasDefaultExpr } ```scala columnHasDefaultExpr( @@ -30,7 +30,7 @@ columnHasDefaultExpr( * `DeltaAnalysis` logical resolution rule is requested to `resolveQueryColumnsByName` -## hasIdentityColumn +## hasIdentityColumn { #hasIdentityColumn } ```scala hasIdentityColumn( @@ -43,41 +43,50 @@ hasIdentityColumn( * `Protocol` utility is used for the [required minimum protocol](Protocol.md#requiredMinimumProtocol) -## isIdentityColumn +## isIdentityColumn { #isIdentityColumn } ```scala isIdentityColumn( field: StructField): Boolean ``` -`isIdentityColumn` uses the `Metadata` (of the given `StructField`) to check the existence of [delta.identity.start](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_START), [delta.identity.step](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_STEP) and [delta.identity.allowExplicitInsert](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_ALLOW_EXPLICIT_INSERT) metadata keys. +`isIdentityColumn` is used to find out whether a `StructField` is an [identity column](identity-columns/index.md) or not. -!!! note "IDENTITY column" - **IDENTITY column** is a column with [delta.identity.start](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_START), [delta.identity.step](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_STEP) and [delta.identity.allowExplicitInsert](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_ALLOW_EXPLICIT_INSERT) metadata. 
+`isIdentityColumn` uses the `Metadata` (of the given `StructField`) to check the existence of the following metadata keys: + +* [delta.identity.start](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_START) +* [delta.identity.step](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_STEP) +* [delta.identity.allowExplicitInsert](spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_ALLOW_EXPLICIT_INSERT) + +--- `isIdentityColumn` is used when: -* `ColumnWithDefaultExprUtils` is used to [hasIdentityColumn](#hasIdentityColumn) and [removeDefaultExpressions](#removeDefaultExpressions) +* `ColumnWithDefaultExprUtils` is used to [addDefaultExprsOrReturnConstraints](#addDefaultExprsOrReturnConstraints), [columnHasDefaultExpr](#columnHasDefaultExpr), [hasIdentityColumn](#hasIdentityColumn) and [removeDefaultExpressions](#removeDefaultExpressions) +* `IdentityColumn` is requested to [blockExplicitIdentityColumnInsert](identity-columns/IdentityColumn.md#blockExplicitIdentityColumnInsert), [getIdentityColumns](identity-columns/IdentityColumn.md#getIdentityColumns), [syncIdentity](identity-columns/IdentityColumn.md#syncIdentity), [updateSchema](identity-columns/IdentityColumn.md#updateSchema), [updateToValidHighWaterMark](identity-columns/IdentityColumn.md#updateToValidHighWaterMark) +* `DeltaCatalog` is requested to [alterTable](DeltaCatalog.md#alterTable) and [createDeltaTable](DeltaCatalog.md#createDeltaTable) +* `MergeIntoCommandBase` is requested to [checkIdentityColumnHighWaterMarks](commands/merge/MergeIntoCommandBase.md#checkIdentityColumnHighWaterMarks) +* `WriteIntoDelta` is requested to [writeAndReturnCommitData](commands/WriteIntoDelta.md#writeAndReturnCommitData) -## Removing Default Expressions +## Remove Default Expressions from Table Schema { #removeDefaultExpressions } ```scala removeDefaultExpressions( schema: StructType, - keepGeneratedColumns: Boolean = false): StructType + keepGeneratedColumns: Boolean = false, + keepIdentityColumns: Boolean = false): 
StructType ``` `removeDefaultExpressions`...FIXME +--- + `removeDefaultExpressions` is used when: -* `DeltaLog` is requested to [create a BaseRelation](DeltaLog.md#createRelation) and [createDataFrame](DeltaLog.md#createDataFrame) +* `DeltaTableUtils` is requested to [removeInternalWriterMetadata](DeltaTableUtils.md#removeInternalWriterMetadata) * `OptimisticTransactionImpl` is requested to [updateMetadataInternal](OptimisticTransactionImpl.md#updateMetadataInternal) -* `DeltaTableV2` is requested for the [tableSchema](DeltaTableV2.md#tableSchema) -* `DeltaDataSource` is requested for the [sourceSchema](spark-connector/DeltaDataSource.md#sourceSchema) -* `DeltaSourceBase` is requested for the [schema](spark-connector/DeltaSource.md#schema) -## tableHasDefaultExpr +## tableHasDefaultExpr { #tableHasDefaultExpr } ```scala tableHasDefaultExpr( diff --git a/docs/DeltaColumnBuilder.md b/docs/DeltaColumnBuilder.md index a4abad20a2..917e0fd956 100644 --- a/docs/DeltaColumnBuilder.md +++ b/docs/DeltaColumnBuilder.md @@ -28,22 +28,22 @@ import io.delta.tables.DeltaColumnBuilder ## Operators -### build +### Build StructField { #build } ```scala build(): StructField ``` -Creates a `StructField` ([Spark SQL]({{ book.spark_sql }}/types/StructField)) +Creates a `StructField` ([Spark SQL]({{ book.spark_sql }}/types/StructField)) (possibly with some field metadata) -### comment +### comment { #comment } ```scala comment( comment: String): DeltaColumnBuilder ``` -### dataType +### dataType { #dataType } ```scala dataType( @@ -52,7 +52,7 @@ dataType( dataType: String): DeltaColumnBuilder ``` -### generatedAlwaysAs +### generatedAlwaysAs { #generatedAlwaysAs } ```scala generatedAlwaysAs( @@ -61,14 +61,46 @@ generatedAlwaysAs( Registers the [Generation Expression](#generationExpr) of this field -### nullable +### generatedAlwaysAsIdentity { #generatedAlwaysAsIdentity } + +```scala +generatedAlwaysAsIdentity( + start: Long, + step: Long): DeltaColumnBuilder +``` + +Sets the following: 
+ +Property | Value +-|- +[identityStart](#identityStart) | `start` +[identityStep](#identityStep) | `step` +[identityAllowExplicitInsert](#identityAllowExplicitInsert) | `false` + +### generatedByDefaultAsIdentity { #generatedByDefaultAsIdentity } + +```scala +generatedByDefaultAsIdentity( + start: Long, + step: Long): DeltaColumnBuilder +``` + +Sets the following: + +Property | Value +-|- +[identityStart](#identityStart) | `start` +[identityStep](#identityStep) | `step` +[identityAllowExplicitInsert](#identityAllowExplicitInsert) | `true` + +### nullable { #nullable } ```scala nullable( nullable: Boolean): DeltaColumnBuilder ``` -## Generation Expression +## Generation Expression { #generationExpr } ```scala generationExpr: Option[String] = None @@ -77,3 +109,18 @@ generationExpr: Option[String] = None `DeltaColumnBuilder` uses `generationExpr` internal registry for the [generatedAlwaysAs](#generatedAlwaysAs) expression. When requested to [build a StructField](#build), `DeltaColumnBuilder` registers `generationExpr` under [delta.generationExpression](spark-connector/DeltaSourceUtils.md#GENERATION_EXPRESSION_METADATA_KEY) key in the metadata (of this field). + +## identityAllowExplicitInsert { #identityAllowExplicitInsert } + +```scala +identityAllowExplicitInsert: Option[Boolean] = None +``` + +`identityAllowExplicitInsert` flag is used to indicate a call to the following methods: + +Method | Value +-|- +[generatedAlwaysAsIdentity](#generatedAlwaysAsIdentity) | `false` +[generatedByDefaultAsIdentity](#generatedByDefaultAsIdentity) | `true` + +`identityAllowExplicitInsert` is used to [build a StructField](#build). 
diff --git a/docs/commands/merge/MergeIntoCommandBase.md b/docs/commands/merge/MergeIntoCommandBase.md index 6f114120f9..c32153c032 100644 --- a/docs/commands/merge/MergeIntoCommandBase.md +++ b/docs/commands/merge/MergeIntoCommandBase.md @@ -108,6 +108,15 @@ Used when: * `MergeIntoCommandBase` is requested to [run](#run) +### checkIdentityColumnHighWaterMarks { #checkIdentityColumnHighWaterMarks } + +```scala +checkIdentityColumnHighWaterMarks( + deltaTxn: OptimisticTransaction): Unit +``` + +`checkIdentityColumnHighWaterMarks`...FIXME + ## Implementations * [MergeIntoCommand](MergeIntoCommand.md) diff --git a/docs/identity-columns/IdentityColumn.md b/docs/identity-columns/IdentityColumn.md new file mode 100644 index 0000000000..8ff3d80ee8 --- /dev/null +++ b/docs/identity-columns/IdentityColumn.md @@ -0,0 +1,17 @@ +# IdentityColumn + +## getIdentityInfo { #getIdentityInfo } + +```scala +getIdentityInfo( + field: StructField): IdentityInfo +``` + +`getIdentityInfo`...FIXME + +--- + +`getIdentityInfo` is used when: + +* `IdentityColumn` is requested to [copySchemaWithMergedHighWaterMarks](#copySchemaWithMergedHighWaterMarks), [createIdentityColumnGenerationExpr](#createIdentityColumnGenerationExpr), [syncIdentity](#syncIdentity), [updateSchema](#updateSchema), [updateToValidHighWaterMark](#updateToValidHighWaterMark) +* `MergeIntoCommandBase` is requested to [checkIdentityColumnHighWaterMarks](../commands/merge/MergeIntoCommandBase.md#checkIdentityColumnHighWaterMarks) diff --git a/docs/identity-columns/index.md b/docs/identity-columns/index.md index 30c8cb59a1..a0d51007f8 100644 --- a/docs/identity-columns/index.md +++ b/docs/identity-columns/index.md @@ -1,3 +1,28 @@ # Identity Columns -**Identity Columns** is a new feature in Delta Lake 3.3.0 that allows assigning unique values for each record inserted into a table. 
+**Identity Columns** is a new feature in Delta Lake 3.3.0 that allows assigning unique values for each record written out into a table (unless users provide values for them explicitly). + +Identity Columns feature is supported by delta tables that meet one of the following requirements: + +* The table must be on Writer Version 6 +* The table must be on Writer Version 7, and a feature name `identityColumns` must exist in the table protocol's `writerFeatures`. + +Identity Columns cannot be specified with a generated column expression (or a `DeltaAnalysisException` is reported). + +Identity Columns can only be of `LongType`. + +IDENTITY column step cannot be 0 (or a `DeltaAnalysisException` is reported). + +Internally, identity columns are columns (fields) with the following `Metadata`: + +Key | Value +-|- +[delta.identity.allowExplicitInsert](../spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_ALLOW_EXPLICIT_INSERT) | [identityAllowExplicitInsert](../DeltaColumnBuilder.md#identityAllowExplicitInsert) +[delta.identity.start](../spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_START) | [identityStart](../DeltaColumnBuilder.md#identityStart) +[delta.identity.step](../spark-connector/DeltaSourceUtils.md#IDENTITY_INFO_STEP) | [identityStep](../DeltaColumnBuilder.md#identityStep) + +[IdentityColumn](IdentityColumn.md) and [ColumnWithDefaultExprUtils](../ColumnWithDefaultExprUtils.md#isIdentityColumn) utilities are used to work with identity columns.
+ +## Learn More + +* [Identity Columns]({{ delta.github }}/PROTOCOL.md#identity-columns) in Delta Lake's table protocol specification diff --git a/docs/spark-connector/DeltaSourceUtils.md b/docs/spark-connector/DeltaSourceUtils.md index 485101fd98..0e0eb6fe6b 100644 --- a/docs/spark-connector/DeltaSourceUtils.md +++ b/docs/spark-connector/DeltaSourceUtils.md @@ -4,7 +4,7 @@ title: DeltaSourceUtils # DeltaSourceUtils -## delta.generationExpression +## delta.generationExpression { #delta.generationExpression } `DeltaSourceUtils` defines `delta.generationExpression` metadata key for the generation expression of a [generated column](../DeltaColumnBuilder.md#generatedAlwaysAs) of a delta table. @@ -17,7 +17,7 @@ Used when: * [GeneratedColumn](../generated-columns/GeneratedColumn.md) utility is used to [isGeneratedColumn](../generated-columns/GeneratedColumn.md#isGeneratedColumn) and [getGenerationExpressionStr](../generated-columns/GeneratedColumn.md#getGenerationExpressionStr) * `SchemaUtils` utility is used to [reportDifferences](../SchemaUtils.md#reportDifferences) -## delta.identity.allowExplicitInsert +## delta.identity.allowExplicitInsert { #delta.identity.allowExplicitInsert } `DeltaSourceUtils` defines `delta.identity.allowExplicitInsert` metadata key for...FIXME @@ -25,15 +25,14 @@ Used when: * `ColumnWithDefaultExprUtils` utility is used to [isIdentityColumn](../ColumnWithDefaultExprUtils.md#isIdentityColumn) and [removeDefaultExpressions](../ColumnWithDefaultExprUtils.md#removeDefaultExpressions) -## delta.identity.start +## delta.identity.start { #delta.identity.start } -`DeltaSourceUtils` defines `delta.identity.start` metadata key for...FIXME +`delta.identity.start` table metadata key is used when: -Used when: - -* `ColumnWithDefaultExprUtils` utility is used to [isIdentityColumn](../ColumnWithDefaultExprUtils.md#isIdentityColumn) and [removeDefaultExpressions](../ColumnWithDefaultExprUtils.md#removeDefaultExpressions) +* `DeltaColumnBuilder` is requested 
to [build a StructField](../DeltaColumnBuilder.md#build) (with [identityAllowExplicitInsert](../DeltaColumnBuilder.md#identityAllowExplicitInsert) defined) +* `ColumnWithDefaultExprUtils` is used to [isIdentityColumn](../ColumnWithDefaultExprUtils.md#isIdentityColumn) and [removeDefaultExpressions](../ColumnWithDefaultExprUtils.md#removeDefaultExpressions) -## delta.identity.step +## delta.identity.step { #delta.identity.step } `DeltaSourceUtils` defines `delta.identity.step` metadata key for...FIXME @@ -41,7 +40,7 @@ Used when: * `ColumnWithDefaultExprUtils` utility is used to [isIdentityColumn](../ColumnWithDefaultExprUtils.md#isIdentityColumn) and [removeDefaultExpressions](../ColumnWithDefaultExprUtils.md#removeDefaultExpressions) -## isDeltaDataSourceName +## isDeltaDataSourceName { #isDeltaDataSourceName } ```scala isDeltaDataSourceName(