-
Notifications
You must be signed in to change notification settings - Fork 1.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Kernel]Add simple crc post commit for incremental crc writing. #4134
Changes from 73 commits
1c05213
ca06a90
3caed70
3b7dc2c
2f58dfa
e7f4beb
52bda11
d1626dc
2a1092d
6c8069f
471e172
8c3c33b
5d0b5a7
6796fa5
2a93c49
826f623
ebaf2c8
000a36d
0c425be
e972c31
73df627
1df8af5
a3ba086
ee72c53
c8cf5b1
d922524
d2fc815
d1504f0
78477d1
7f7e9e7
bbd9ea3
76a5c38
b723b7d
f85c845
97f36fc
56b2e7e
a06bf98
1e7828b
4cddb67
dc4c94c
bf6613b
6b81868
daa03fd
0ac1005
63f2f46
622d28d
13bb0a7
972109a
c52c58e
bba00bd
93b9aeb
a11badd
cf33624
a6189d7
26cd8b8
c0178ec
cff4b84
658f770
6c70ba5
abc2bd9
305997e
e7710cb
0bec898
d392927
f60b9c6
c4ae5e5
a6be52b
eb1bca1
941d9c8
a14d42e
c4462a7
744beaf
562cfe4
b639110
5a34d7e
b9b4aac
7900389
a2e2883
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,16 +29,19 @@ | |
import io.delta.kernel.expressions.Column; | ||
import io.delta.kernel.hook.PostCommitHook; | ||
import io.delta.kernel.internal.actions.*; | ||
import io.delta.kernel.internal.checksum.CRCInfo; | ||
import io.delta.kernel.internal.data.TransactionStateRow; | ||
import io.delta.kernel.internal.fs.Path; | ||
import io.delta.kernel.internal.hook.CheckpointHook; | ||
import io.delta.kernel.internal.hook.ChecksumSimpleHook; | ||
import io.delta.kernel.internal.metrics.TransactionMetrics; | ||
import io.delta.kernel.internal.metrics.TransactionReportImpl; | ||
import io.delta.kernel.internal.replay.ConflictChecker; | ||
import io.delta.kernel.internal.replay.ConflictChecker.TransactionRebaseState; | ||
import io.delta.kernel.internal.rowtracking.RowTracking; | ||
import io.delta.kernel.internal.tablefeatures.TableFeatures; | ||
import io.delta.kernel.internal.util.*; | ||
import io.delta.kernel.metrics.TransactionMetricsResult; | ||
import io.delta.kernel.metrics.TransactionReport; | ||
import io.delta.kernel.types.StructType; | ||
import io.delta.kernel.utils.CloseableIterable; | ||
|
@@ -342,6 +345,7 @@ private TransactionCommitResult doCommit( | |
dataAndMetadataActions.map( | ||
action -> { | ||
transactionMetrics.totalActionsCounter.increment(); | ||
// TODO: handle RemoveFiles. | ||
if (!action.isNullAt(ADD_FILE_ORDINAL)) { | ||
transactionMetrics.addFilesCounter.increment(); | ||
transactionMetrics.addFilesSizeInBytesCounter.increment( | ||
|
@@ -362,6 +366,10 @@ private TransactionCommitResult doCommit( | |
postCommitHooks.add(new CheckpointHook(dataPath, commitAsVersion)); | ||
} | ||
|
||
buildPostCommitCrcInfoIfCurrentCrcAvailable( | ||
commitAsVersion, transactionMetrics.captureTransactionMetricsResult()) | ||
.ifPresent(crcInfo -> postCommitHooks.add(new ChecksumSimpleHook(crcInfo, logPath))); | ||
|
||
return new TransactionCommitResult(commitAsVersion, postCommitHooks); | ||
} catch (FileAlreadyExistsException e) { | ||
throw e; | ||
|
@@ -437,6 +445,36 @@ private void recordTransactionReport( | |
engine.getMetricsReporters().forEach(reporter -> reporter.report(transactionReport)); | ||
} | ||
|
||
private Optional<CRCInfo> buildPostCommitCrcInfoIfCurrentCrcAvailable( | ||
long commitAtVersion, TransactionMetricsResult metricsResult) { | ||
if (isNewTable) { | ||
return Optional.of( | ||
new CRCInfo( | ||
commitAtVersion, | ||
metadata, | ||
protocol, | ||
metricsResult.getTotalAddFilesSizeInBytes(), | ||
metricsResult.getNumAddFiles(), | ||
Optional.of(txnId.toString()))); | ||
} | ||
|
||
return readSnapshot | ||
.getCurrentCrcInfo() | ||
// in the case of a conflicting txn and successful retry the readSnapshot may not be | ||
// commitVersion - 1 | ||
.filter(lastCrcInfo -> commitAtVersion == lastCrcInfo.getVersion() + 1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: want to add a comment like When I first saw this it was intuitive to me immediately why we needed to filter There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Side note, in those cases, is it possible for us to see if there was a CRC written and use that for simple crc write? This can be a P2+ but maybe we should track it somewhere (i.e. create a github issue?) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be a good optimization, filed #4177 for track There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This should all just be a part of the rebase process. When we rebase, we need to list + read all the JSONs and do various conflict checkpoing. We might as well list + read the latest CRC file, too. |
||
.map( | ||
lastCrcInfo -> | ||
new CRCInfo( | ||
commitAtVersion, | ||
metadata, | ||
protocol, | ||
// TODO: handle RemoveFiles for calculating table size and num of files. | ||
lastCrcInfo.getTableSizeBytes() + metricsResult.getTotalAddFilesSizeInBytes(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a TODO that in the future we will need to handle RemoveFiles |
||
lastCrcInfo.getNumFiles() + metricsResult.getNumAddFiles(), | ||
Optional.of(txnId.toString()))); | ||
} | ||
|
||
/** | ||
* Get the part of the schema of the table that needs the statistics to be collected per file. | ||
* | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
/* | ||
* Copyright (2025) The Delta Lake Project Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.delta.kernel.internal.checksum; | ||
|
||
import static io.delta.kernel.internal.util.Preconditions.checkArgument; | ||
import static java.util.Objects.requireNonNull; | ||
|
||
import io.delta.kernel.data.ColumnVector; | ||
import io.delta.kernel.data.ColumnarBatch; | ||
import io.delta.kernel.data.Row; | ||
import io.delta.kernel.internal.actions.Metadata; | ||
import io.delta.kernel.internal.actions.Protocol; | ||
import io.delta.kernel.internal.data.GenericRow; | ||
import io.delta.kernel.internal.util.InternalUtils; | ||
import io.delta.kernel.types.LongType; | ||
import io.delta.kernel.types.StringType; | ||
import io.delta.kernel.types.StructType; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.Optional; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class CRCInfo { | ||
private static final Logger logger = LoggerFactory.getLogger(CRCInfo.class); | ||
|
||
// Constants for schema field names | ||
private static final String TABLE_SIZE_BYTES = "tableSizeBytes"; | ||
private static final String NUM_FILES = "numFiles"; | ||
private static final String NUM_METADATA = "numMetadata"; | ||
private static final String NUM_PROTOCOL = "numProtocol"; | ||
private static final String METADATA = "metadata"; | ||
private static final String PROTOCOL = "protocol"; | ||
private static final String TXN_ID = "txnId"; | ||
|
||
public static final StructType CRC_FILE_SCHEMA = | ||
new StructType() | ||
.add(TABLE_SIZE_BYTES, LongType.LONG) | ||
.add(NUM_FILES, LongType.LONG) | ||
.add(NUM_METADATA, LongType.LONG) | ||
.add(NUM_PROTOCOL, LongType.LONG) | ||
.add(METADATA, Metadata.FULL_SCHEMA) | ||
.add(PROTOCOL, Protocol.FULL_SCHEMA) | ||
.add(TXN_ID, StringType.STRING, /*nullable*/ true); | ||
|
||
public static Optional<CRCInfo> fromColumnarBatch( | ||
scottsand-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
long version, ColumnarBatch batch, int rowId, String crcFilePath) { | ||
// Read required fields. | ||
Protocol protocol = | ||
Protocol.fromColumnVector(batch.getColumnVector(getSchemaIndex(PROTOCOL)), rowId); | ||
Metadata metadata = | ||
Metadata.fromColumnVector(batch.getColumnVector(getSchemaIndex(METADATA)), rowId); | ||
long tableSizeBytes = | ||
InternalUtils.requireNonNull( | ||
batch.getColumnVector(getSchemaIndex(TABLE_SIZE_BYTES)), rowId, TABLE_SIZE_BYTES) | ||
.getLong(rowId); | ||
long numFiles = | ||
InternalUtils.requireNonNull( | ||
batch.getColumnVector(getSchemaIndex(NUM_FILES)), rowId, NUM_FILES) | ||
.getLong(rowId); | ||
|
||
// Read optional fields | ||
ColumnVector txnIdColumnVector = batch.getColumnVector(getSchemaIndex(TXN_ID)); | ||
Optional<String> txnId = | ||
txnIdColumnVector.isNullAt(rowId) | ||
? Optional.empty() | ||
: Optional.of(txnIdColumnVector.getString(rowId)); | ||
|
||
// protocol and metadata are nullable per fromColumnVector's implementation. | ||
if (protocol == null || metadata == null) { | ||
logger.warn("Invalid checksum file missing protocol and/or metadata: {}", crcFilePath); | ||
return Optional.empty(); | ||
} | ||
return Optional.of(new CRCInfo(version, metadata, protocol, tableSizeBytes, numFiles, txnId)); | ||
} | ||
|
||
private final long version; | ||
private final Metadata metadata; | ||
private final Protocol protocol; | ||
private final long tableSizeBytes; | ||
private final long numFiles; | ||
private final Optional<String> txnId; | ||
|
||
public CRCInfo( | ||
long version, | ||
Metadata metadata, | ||
Protocol protocol, | ||
long tableSizeBytes, | ||
long numFiles, | ||
Optional<String> txnId) { | ||
checkArgument(tableSizeBytes >= 0); | ||
checkArgument(numFiles >= 0); | ||
this.version = version; | ||
this.metadata = requireNonNull(metadata); | ||
this.protocol = requireNonNull(protocol); | ||
this.tableSizeBytes = tableSizeBytes; | ||
this.numFiles = numFiles; | ||
this.txnId = requireNonNull(txnId); | ||
} | ||
|
||
/** The version of the Delta table that this CRCInfo represents. */ | ||
public long getVersion() { | ||
return version; | ||
} | ||
|
||
/** The {@link Metadata} stored in this CRCInfo. */ | ||
public Metadata getMetadata() { | ||
return metadata; | ||
} | ||
|
||
/** The {@link Protocol} stored in this CRCInfo. */ | ||
public Protocol getProtocol() { | ||
return protocol; | ||
} | ||
|
||
public long getNumFiles() { | ||
return numFiles; | ||
} | ||
|
||
public long getTableSizeBytes() { | ||
return tableSizeBytes; | ||
} | ||
|
||
public Optional<String> getTxnId() { | ||
return txnId; | ||
} | ||
|
||
/** | ||
* Encode as a {@link Row} object with the schema {@link CRCInfo#CRC_FILE_SCHEMA}. | ||
* | ||
* @return {@link Row} object with the schema {@link CRCInfo#CRC_FILE_SCHEMA} | ||
*/ | ||
public Row toRow() { | ||
Map<Integer, Object> values = new HashMap<>(); | ||
// Add required fields | ||
values.put(getSchemaIndex(TABLE_SIZE_BYTES), tableSizeBytes); | ||
values.put(getSchemaIndex(NUM_FILES), numFiles); | ||
values.put(getSchemaIndex(NUM_METADATA), 1L); | ||
values.put(getSchemaIndex(NUM_PROTOCOL), 1L); | ||
values.put(getSchemaIndex(METADATA), metadata.toRow()); | ||
values.put(getSchemaIndex(PROTOCOL), protocol.toRow()); | ||
|
||
// Add optional fields | ||
txnId.ifPresent(txn -> values.put(getSchemaIndex(TXN_ID), txn)); | ||
return new GenericRow(CRC_FILE_SCHEMA, values); | ||
} | ||
|
||
private static int getSchemaIndex(String fieldName) { | ||
return CRC_FILE_SCHEMA.indexOf(fieldName); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/* | ||
* Copyright (2025) The Delta Lake Project Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.delta.kernel.internal.checksum; | ||
|
||
import static io.delta.kernel.internal.DeltaErrors.wrapEngineExceptionThrowsIO; | ||
import static io.delta.kernel.internal.util.Utils.singletonCloseableIterator; | ||
import static java.util.Objects.requireNonNull; | ||
|
||
import io.delta.kernel.engine.Engine; | ||
import io.delta.kernel.internal.fs.Path; | ||
import io.delta.kernel.internal.util.FileNames; | ||
import java.io.IOException; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
/** Writers for writing checksum files from a snapshot */ | ||
public class ChecksumWriter { | ||
|
||
private static final Logger logger = LoggerFactory.getLogger(ChecksumWriter.class); | ||
|
||
private final Path logPath; | ||
|
||
public ChecksumWriter(Path logPath) { | ||
this.logPath = requireNonNull(logPath); | ||
} | ||
|
||
/** Writes a checksum file */ | ||
public void writeCheckSum(Engine engine, CRCInfo crcInfo) throws IOException { | ||
Path newChecksumPath = FileNames.checksumFile(logPath, crcInfo.getVersion()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we add some validations to make sure we are not writing some incorrect values (e.g. numMetadata is not 1) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we add validations, make sure to add tests for it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we could also instead add this validation as part of the |
||
logger.info("Writing checksum file to path: {}", newChecksumPath); | ||
wrapEngineExceptionThrowsIO( | ||
() -> { | ||
engine | ||
.getJsonHandler() | ||
.writeJsonFileAtomically( | ||
newChecksumPath.toString(), | ||
singletonCloseableIterator(crcInfo.toRow()), | ||
false /* overwrite */); | ||
logger.info("Write checksum file `{}` succeeds", newChecksumPath); | ||
return null; | ||
scottsand-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
}, | ||
"Write checksum file `%s`", | ||
newChecksumPath); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/* | ||
* Copyright (2025) The Delta Lake Project Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.delta.kernel.internal.hook; | ||
|
||
import static io.delta.kernel.internal.util.Preconditions.checkArgument; | ||
import static java.util.Objects.requireNonNull; | ||
|
||
import io.delta.kernel.engine.Engine; | ||
import io.delta.kernel.hook.PostCommitHook; | ||
import io.delta.kernel.internal.checksum.CRCInfo; | ||
import io.delta.kernel.internal.checksum.ChecksumWriter; | ||
import io.delta.kernel.internal.fs.Path; | ||
import java.io.IOException; | ||
|
||
/** | ||
* A post-commit hook that writes a new checksum file at the version committed by the transaction. | ||
* This hook performs a simple checksum operation without requiring previous checkpoint or log | ||
* reading. | ||
*/ | ||
public class ChecksumSimpleHook implements PostCommitHook { | ||
|
||
private final CRCInfo crcInfo; | ||
private final Path logPath; | ||
|
||
public ChecksumSimpleHook(CRCInfo crcInfo, Path logPath) { | ||
this.crcInfo = requireNonNull(crcInfo); | ||
this.logPath = requireNonNull(logPath); | ||
} | ||
|
||
@Override | ||
public void threadSafeInvoke(Engine engine) throws IOException { | ||
checkArgument(engine != null); | ||
new ChecksumWriter(logPath).writeCheckSum(engine, crcInfo); | ||
} | ||
|
||
@Override | ||
public PostCommitHookType getType() { | ||
return PostCommitHookType.CHECKSUM_SIMPLE; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yay I find these docs super clear!