[Spark] Auto Compaction was incorrectly including large files towards minNumFiles (#4045) · Pull Request #4178

Status: Open · wants to merge 9 commits into master · changes from all commits
@@ -1108,9 +1108,11 @@ trait OptimisticTransactionImpl extends DeltaTransaction
   def createAutoCompactStatsCollector(): AutoCompactPartitionStatsCollector = {
     try {
       if (spark.conf.get(DeltaSQLConf.DELTA_AUTO_COMPACT_RECORD_PARTITION_STATS_ENABLED)) {
+        val maxFileSize = spark.conf
+          .get(DeltaSQLConf.DELTA_AUTO_COMPACT_MAX_FILE_SIZE)
         val minFileSize = spark.conf
           .get(DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_FILE_SIZE)
-          .getOrElse(Long.MaxValue)
+          .getOrElse(maxFileSize / 2L)
         return AutoCompactPartitionStats.instance(spark)
           .createStatsCollector(minFileSize, reportAutoCompactStatsError)
       }
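What this hunk changes: the stats collector's small-file threshold used to default to Long.MaxValue, so every file added to a partition, no matter how large, was counted towards minNumFiles; it now defaults to half of the configured auto-compaction max file size, so only genuinely small files are counted. A minimal standalone sketch of that counting rule (not the actual Delta code; the helper name countSmallFiles and the 128 MiB maxFileSize value are illustrative assumptions):

object MinFileSizeSketch {
  // Hypothetical stand-in for the per-partition stats collection: a file only
  // contributes to the small-file count when its size is below minFileSize.
  def countSmallFiles(fileSizesInBytes: Seq[Long], minFileSize: Long): Int =
    fileSizesInBytes.count(_ < minFileSize)

  def main(args: Array[String]): Unit = {
    val fileSizes = Seq(1L << 20, 2L << 20, 200L << 20) // two small files plus one 200 MiB file
    val maxFileSize = 128L << 20                        // assumed auto-compaction max file size

    // Old default: minFileSize = Long.MaxValue, so even the 200 MiB file counted as "small".
    println(countSmallFiles(fileSizes, Long.MaxValue))    // prints 3

    // New default: minFileSize = maxFileSize / 2, so the large file no longer counts.
    println(countSmallFiles(fileSizes, maxFileSize / 2L)) // prints 2
  }
}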
@@ -69,13 +69,12 @@ class AutoCompactPartitionStats(
       var wasAutoCompacted: Boolean = false) {

     /**
-     * Determine whether this partition can be autocompacted based on the number of small files or
-     * if this [[AutoCompactPartitionStats]] instance has not auto compacted it yet.
+     * Determine whether this partition can be autocompacted based on the number of small files.
      * @param minNumFiles The minimum number of files this table-partition should have to trigger
      *                    Auto Compaction in case it has already been compacted once.
      */
-    def hasSufficientSmallFilesOrHasNotBeenCompacted(minNumFiles: Long): Boolean =
-      !wasAutoCompacted || hasSufficientFiles(minNumFiles)
+    def hasSufficientSmallFilesAndHasNotBeenCompacted(minNumFiles: Long): Boolean =
+      hasSufficientFiles(minNumFiles)

     def hasSufficientFiles(minNumFiles: Long): Boolean = numFiles >= minNumFiles
   }
@@ -305,7 +304,7 @@ class AutoCompactPartitionStats(
     tablePartitionStatsCache.get(tableId).map { tablePartitionStates =>
       targetPartitions.filter { partitionKey =>
         tablePartitionStates.get(partitionKey.##).exists { partitionState =>
-          partitionState.hasSufficientSmallFilesOrHasNotBeenCompacted(minNumFiles)
+          partitionState.hasSufficientSmallFilesAndHasNotBeenCompacted(minNumFiles)
         }
       }
     }.getOrElse(Set.empty)
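To make the renamed predicate concrete, the before/after qualification logic can be compared with a small self-contained sketch built from the fields shown in the diff above (PartitionStat here is a simplified stand-in, not the real nested class):

// Simplified stand-in for the per-partition state tracked by AutoCompactPartitionStats.
case class PartitionStat(numFiles: Long, wasAutoCompacted: Boolean) {
  // Old behavior: a partition that had never been auto compacted always qualified,
  // even if it held only a handful of small files.
  def oldQualifies(minNumFiles: Long): Boolean =
    !wasAutoCompacted || numFiles >= minNumFiles

  // New behavior: a partition qualifies only once enough small files have accumulated.
  def newQualifies(minNumFiles: Long): Boolean =
    numFiles >= minNumFiles
}

object PredicateSketch {
  def main(args: Array[String]): Unit = {
    // A never-compacted partition holding just 3 small files, checked against the
    // minNumFiles = 30 threshold used by the suite below.
    val partition = PartitionStat(numFiles = 3, wasAutoCompacted = false)
    println(partition.oldQualifies(30)) // true  -> compaction would have kicked in prematurely
    println(partition.newQualifies(30)) // false -> skipped until enough small files exist
  }
}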
@@ -152,12 +152,13 @@ class AutoCompactSuite extends
       DeltaSQLConf.DELTA_AUTO_COMPACT_ENABLED.key -> s"true",
       DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_NUM_FILES.key -> "30") {
       val path = dir.getCanonicalPath
-      // Append 1 file to each partition: record runOnModifiedPartitions event, as is first write
+      // Append 1 file to each partition: record skipInsufficientFilesInModifiedPartitions event,
+      // as not enough small files exist
       var usageLogs = captureOptimizeLogs(AutoCompact.OP_TYPE) {
         createFilesToPartitions(numFilePartitions = 3, numFilesPerPartition = 1, path)
       }
       var log = JsonUtils.mapper.readValue[Map[String, String]](usageLogs.head.blob)
-      assert(log("status") == "runOnModifiedPartitions" && log("partitions") == "3")
+      assert(log("status") == "skipInsufficientFilesInModifiedPartitions")
       // Append 10 more file to each partition: record skipInsufficientFilesInModifiedPartitions
       // event.
       usageLogs = captureOptimizeLogs(AutoCompact.OP_TYPE) {
@@ -196,7 +197,7 @@ class AutoCompactSuite extends
       df.write.format("delta").mode("append").save(dir)
       val deltaLog = DeltaLog.forTable(spark, dir)
       val newSnapshot = deltaLog.update()
-      assert(newSnapshot.version === 1) // 0 is the first commit, 1 is optimize
+      assert(newSnapshot.version === 1)
       assert(deltaLog.update().numOfFiles === 1)

       val isLogged = checkAutoOptimizeLogging {
@@ -283,34 +284,54 @@ class AutoCompactSuite extends
   }

   testBothModesViaProperty("auto compact should not kick in when there aren't " +
-    "enough files") { dir =>
-    withSQLConf(DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_NUM_FILES.key -> "5") {
+    "enough small files") { dir =>
+    withSQLConf(
+      DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_NUM_FILES.key -> "6",
+      DeltaSQLConf.DELTA_AUTO_COMPACT_MAX_FILE_SIZE.key -> "20000"
+    ) {
       AutoCompactPartitionStats.instance(spark).resetTestOnly()
-      spark.range(10).repartition(4).write.format("delta").mode("append").save(dir)
+
+      // First write - 4 small files
+      spark.range(10).repartition(4).write.format("delta").mode("append").save(dir)
       val deltaLog = DeltaLog.forTable(spark, dir)
       val newSnapshot = deltaLog.update()
       assert(newSnapshot.version === 0)
-      assert(deltaLog.update().numOfFiles === 4)
+      assert(deltaLog.update().numOfFiles === 4, "Should have 4 initial small files")

+      // Second write - 4 large files
+      spark.range(10000).repartition(4).write.format("delta").mode("append").save(dir)
+
+      val writeEvent = deltaLog.history.getHistory(Some(1)).head
+      assert(writeEvent.operation === "WRITE",
+        "Large files shouldn't trigger auto compaction")
+      assert(deltaLog.update().numOfFiles === 8,
+        "Should have 4 small + 4 large files")
+
+      // Third write - 2 more small files to reach minNumFiles
       val isLogged2 = checkAutoOptimizeLogging {
-        spark.range(10).repartition(4).write.format("delta").mode("append").save(dir)
+        spark.range(10).repartition(2).write.format("delta").mode("append").save(dir)
       }

       assert(isLogged2)
-      val lastEvent = deltaLog.history.getHistory(Some(1)).head
-      assert(lastEvent.operation === "OPTIMIZE")
-      assert(lastEvent.operationParameters("auto") === "true")
+      val compactionEvent = deltaLog.history.getHistory(Some(3)).head
+      assert(compactionEvent.operation === "OPTIMIZE",
+        "Should trigger compaction with 6 small files")
+      assert(compactionEvent.operationParameters("auto") === "true")

-      assert(deltaLog.update().numOfFiles === 1, "Files should be optimized into a single one")
+      val finalSnapshot = deltaLog.update()
+      assert(finalSnapshot.numOfFiles === 5,
+        "Should have 4 large files + 1 compacted small file")

       checkAnswer(
         spark.read.format("delta").load(dir),
-        spark.range(10).union(spark.range(10)).toDF()
+        spark.range(10)
+          .union(spark.range(10000))
+          .union(spark.range(10))
+          .toDF()
       )
     }
   }


   testBothModesViaProperty("ensure no NPE in auto compact UDF with null " +
     "partition values") { dir =>
     Seq(null, "", " ").zipWithIndex.foreach { case (partValue, i) =>
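For reference, the commit and file accounting the rewritten test relies on, assuming each write and the triggered auto compaction produce one commit apiece and that the range(10000) files land above the small-file threshold:

object ExpectedTestCounts {
  def main(args: Array[String]): Unit = {
    val minNumFiles = 6                            // DELTA_AUTO_COMPACT_MIN_NUM_FILES in the test
    val smallAfterCommit0 = 4                      // first write: 4 small files
    val smallAfterCommit1 = smallAfterCommit0      // second write adds only large files
    val smallAfterCommit2 = smallAfterCommit1 + 2  // third write: 2 more small files

    assert(smallAfterCommit1 < minNumFiles)        // large files alone must not trip the threshold
    assert(smallAfterCommit2 >= minNumFiles)       // so commit 3 is the auto OPTIMIZE

    val filesAfterCompaction = 4 + 1               // untouched large files plus one compacted file
    println(filesAfterCompaction)                  // prints 5, matching the final assertion
  }
}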