[Spark] Auto Compaction was incorrectly including large files towards minNumFiles (#4045) · Pull Request #4178

Status: Open · wants to merge 9 commits into master · changes from all commits
@@ -1108,9 +1108,11 @@ trait OptimisticTransactionImpl extends DeltaTransaction
   def createAutoCompactStatsCollector(): AutoCompactPartitionStatsCollector = {
     try {
       if (spark.conf.get(DeltaSQLConf.DELTA_AUTO_COMPACT_RECORD_PARTITION_STATS_ENABLED)) {
+        val maxFileSize = spark.conf
+          .get(DeltaSQLConf.DELTA_AUTO_COMPACT_MAX_FILE_SIZE)
         val minFileSize = spark.conf
           .get(DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_FILE_SIZE)
-          .getOrElse(Long.MaxValue)
+          .getOrElse(maxFileSize / 2L)
         return AutoCompactPartitionStats.instance(spark)
           .createStatsCollector(minFileSize, reportAutoCompactStatsError)
       }
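What this hunk changes: the stats collector's small-file threshold used to default to Long.MaxValue, so every file added to a partition, no matter how large, was counted towards minNumFiles; it now defaults to half of the configured auto-compaction max file size, so only genuinely small files are counted. A minimal standalone sketch of that counting rule (not the actual Delta code; the helper name countSmallFiles and the 128 MiB maxFileSize value are illustrative assumptions):

object MinFileSizeSketch {
  // Hypothetical stand-in for the per-partition stats collection: a file only
  // contributes to the small-file count when its size is below minFileSize.
  def countSmallFiles(fileSizesInBytes: Seq[Long], minFileSize: Long): Int =
    fileSizesInBytes.count(_ < minFileSize)

  def main(args: Array[String]): Unit = {
    val fileSizes = Seq(1L << 20, 2L << 20, 200L << 20) // two small files plus one 200 MiB file
    val maxFileSize = 128L << 20                        // assumed auto-compaction max file size

    // Old default: minFileSize = Long.MaxValue, so even the 200 MiB file counted as "small".
    println(countSmallFiles(fileSizes, Long.MaxValue))    // prints 3

    // New default: minFileSize = maxFileSize / 2, so the large file no longer counts.
    println(countSmallFiles(fileSizes, maxFileSize / 2L)) // prints 2
  }
}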
@@ -69,13 +69,12 @@ class AutoCompactPartitionStats(
       var wasAutoCompacted: Boolean = false) {

     /**
-     * Determine whether this partition can be autocompacted based on the number of small files or
-     * if this [[AutoCompactPartitionStats]] instance has not auto compacted it yet.
+     * Determine whether this partition can be autocompacted based on the number of small files.
      * @param minNumFiles The minimum number of files this table-partition should have to trigger
      *                    Auto Compaction in case it has already been compacted once.
      */
-    def hasSufficientSmallFilesOrHasNotBeenCompacted(minNumFiles: Long): Boolean =
-      !wasAutoCompacted || hasSufficientFiles(minNumFiles)
+    def hasSufficientSmallFilesAndHasNotBeenCompacted(minNumFiles: Long): Boolean =
+      hasSufficientFiles(minNumFiles)

     def hasSufficientFiles(minNumFiles: Long): Boolean = numFiles >= minNumFiles
   }
@@ -305,7 +304,7 @@ class AutoCompactPartitionStats(
     tablePartitionStatsCache.get(tableId).map { tablePartitionStates =>
       targetPartitions.filter { partitionKey =>
         tablePartitionStates.get(partitionKey.##).exists { partitionState =>
-          partitionState.hasSufficientSmallFilesOrHasNotBeenCompacted(minNumFiles)
+          partitionState.hasSufficientSmallFilesAndHasNotBeenCompacted(minNumFiles)
         }
       }
     }.getOrElse(Set.empty)
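To make the renamed predicate concrete, the before/after qualification logic can be compared with a small self-contained sketch built from the fields shown in the diff above (PartitionStat here is a simplified stand-in, not the real nested class):

// Simplified stand-in for the per-partition state tracked by AutoCompactPartitionStats.
case class PartitionStat(numFiles: Long, wasAutoCompacted: Boolean) {
  // Old behavior: a partition that had never been auto compacted always qualified,
  // even if it held only a handful of small files.
  def oldQualifies(minNumFiles: Long): Boolean =
    !wasAutoCompacted || numFiles >= minNumFiles

  // New behavior: a partition qualifies only once enough small files have accumulated.
  def newQualifies(minNumFiles: Long): Boolean =
    numFiles >= minNumFiles
}

object PredicateSketch {
  def main(args: Array[String]): Unit = {
    // A never-compacted partition holding just 3 small files, checked against the
    // minNumFiles = 30 threshold used by the suite below.
    val partition = PartitionStat(numFiles = 3, wasAutoCompacted = false)
    println(partition.oldQualifies(30)) // true  -> compaction would have kicked in prematurely
    println(partition.newQualifies(30)) // false -> skipped until enough small files exist
  }
}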
@@ -152,12 +152,13 @@ class AutoCompactSuite extends
       DeltaSQLConf.DELTA_AUTO_COMPACT_ENABLED.key -> s"true",
       DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_NUM_FILES.key -> "30") {
       val path = dir.getCanonicalPath
-      // Append 1 file to each partition: record runOnModifiedPartitions event, as is first write
+      // Append 1 file to each partition: record skipInsufficientFilesInModifiedPartitions event,
+      // as not enough small files exist
       var usageLogs = captureOptimizeLogs(AutoCompact.OP_TYPE) {
         createFilesToPartitions(numFilePartitions = 3, numFilesPerPartition = 1, path)
       }
       var log = JsonUtils.mapper.readValue[Map[String, String]](usageLogs.head.blob)
-      assert(log("status") == "runOnModifiedPartitions" && log("partitions") == "3")
+      assert(log("status") == "skipInsufficientFilesInModifiedPartitions")
       // Append 10 more file to each partition: record skipInsufficientFilesInModifiedPartitions
       // event.
       usageLogs = captureOptimizeLogs(AutoCompact.OP_TYPE) {
@@ -196,7 +197,7 @@ class AutoCompactSuite extends
       df.write.format("delta").mode("append").save(dir)
       val deltaLog = DeltaLog.forTable(spark, dir)
       val newSnapshot = deltaLog.update()
-      assert(newSnapshot.version === 1) // 0 is the first commit, 1 is optimize
+      assert(newSnapshot.version === 1)
       assert(deltaLog.update().numOfFiles === 1)

       val isLogged = checkAutoOptimizeLogging {
@@ -283,34 +284,54 @@ class AutoCompactSuite extends
   }

   testBothModesViaProperty("auto compact should not kick in when there aren't " +
-    "enough files") { dir =>
-    withSQLConf(DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_NUM_FILES.key -> "5") {
+    "enough small files") { dir =>
+    withSQLConf(
+      DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_NUM_FILES.key -> "6",
+      DeltaSQLConf.DELTA_AUTO_COMPACT_MAX_FILE_SIZE.key -> "20000"
+    ) {
       AutoCompactPartitionStats.instance(spark).resetTestOnly()
-      spark.range(10).repartition(4).write.format("delta").mode("append").save(dir)
+
+      // First write - 4 small files
+      spark.range(10).repartition(4).write.format("delta").mode("append").save(dir)
       val deltaLog = DeltaLog.forTable(spark, dir)
       val newSnapshot = deltaLog.update()
       assert(newSnapshot.version === 0)
-      assert(deltaLog.update().numOfFiles === 4)
+      assert(deltaLog.update().numOfFiles === 4, "Should have 4 initial small files")

+      // Second write - 4 large files
+      spark.range(10000).repartition(4).write.format("delta").mode("append").save(dir)
+
+      val writeEvent = deltaLog.history.getHistory(Some(1)).head
+      assert(writeEvent.operation === "WRITE",
+        "Large files shouldn't trigger auto compaction")
+      assert(deltaLog.update().numOfFiles === 8,
+        "Should have 4 small + 4 large files")
+
+      // Third write - 2 more small files to reach minNumFiles
       val isLogged2 = checkAutoOptimizeLogging {
-        spark.range(10).repartition(4).write.format("delta").mode("append").save(dir)
+        spark.range(10).repartition(2).write.format("delta").mode("append").save(dir)
       }

       assert(isLogged2)
-      val lastEvent = deltaLog.history.getHistory(Some(1)).head
-      assert(lastEvent.operation === "OPTIMIZE")
-      assert(lastEvent.operationParameters("auto") === "true")
+      val compactionEvent = deltaLog.history.getHistory(Some(3)).head
+      assert(compactionEvent.operation === "OPTIMIZE",
+        "Should trigger compaction with 6 small files")
+      assert(compactionEvent.operationParameters("auto") === "true")

-      assert(deltaLog.update().numOfFiles === 1, "Files should be optimized into a single one")
+      val finalSnapshot = deltaLog.update()
+      assert(finalSnapshot.numOfFiles === 5,
+        "Should have 4 large files + 1 compacted small file")

       checkAnswer(
         spark.read.format("delta").load(dir),
-        spark.range(10).union(spark.range(10)).toDF()
+        spark.range(10)
+          .union(spark.range(10000))
+          .union(spark.range(10))
+          .toDF()
       )
     }
   }


   testBothModesViaProperty("ensure no NPE in auto compact UDF with null " +
     "partition values") { dir =>
     Seq(null, "", " ").zipWithIndex.foreach { case (partValue, i) =>
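For reference, the commit and file accounting the rewritten test relies on, assuming each write and the triggered auto compaction produce one commit apiece and that the range(10000) files land above the small-file threshold:

object ExpectedTestCounts {
  def main(args: Array[String]): Unit = {
    val minNumFiles = 6                            // DELTA_AUTO_COMPACT_MIN_NUM_FILES in the test
    val smallAfterCommit0 = 4                      // first write: 4 small files
    val smallAfterCommit1 = smallAfterCommit0      // second write adds only large files
    val smallAfterCommit2 = smallAfterCommit1 + 2  // third write: 2 more small files

    assert(smallAfterCommit1 < minNumFiles)        // large files alone must not trip the threshold
    assert(smallAfterCommit2 >= minNumFiles)       // so commit 3 is the auto OPTIMIZE

    val filesAfterCompaction = 4 + 1               // untouched large files plus one compacted file
    println(filesAfterCompaction)                  // prints 5, matching the final assertion
  }
}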