Drop Type widening feature: read Parquet footers to collect files to rewrite
Showing 6 changed files with 130 additions and 46 deletions.
86 changes: 86 additions & 0 deletions
spark/src/main/scala/org/apache/spark/sql/delta/commands/ReorgTableHelper.scala
@@ -0,0 +1,86 @@
package org.apache.spark.sql.delta.commands

import org.apache.spark.sql.delta.Snapshot
import org.apache.spark.sql.delta.actions.AddFile
import org.apache.spark.sql.delta.commands.optimize.OptimizeRunner.generateCandidateFileMap
import org.apache.spark.sql.delta.schema.SchemaMergingUtils
import org.apache.spark.sql.delta.util.DeltaFileOperations
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetToSparkSchemaConverter}
import org.apache.spark.sql.types.{AtomicType, StructField, StructType}
import org.apache.spark.util.SerializableConfiguration

trait ReorgTableHelper {
  /**
   * Determine whether `fileSchema` has any column whose type differs from the corresponding
   * column in `tablePhysicalSchema`.
   */
  private[delta] def fileHasDifferentTypes(
      fileSchema: StructType,
      tablePhysicalSchema: StructType): Boolean = {
    SchemaMergingUtils.transformColumns(fileSchema, tablePhysicalSchema) {
      case (_, StructField(_, fileType: AtomicType, _, _),
          Some(StructField(_, tableType: AtomicType, _, _)), _) if fileType != tableType =>
        return true
      case (_, field, _, _) => field
    }
    false
  }

  /**
   * Filter the given list of AddFile to keep only the files whose physical Parquet schema
   * satisfies the given filter function.
   */
  def filterParquetFiles(
      spark: SparkSession, snapshot: Snapshot, files: Seq[AddFile])(
      filterFileFn: StructType => Boolean): Seq[AddFile] = {
    val serializedConf = new SerializableConfiguration(spark.sessionState.newHadoopConf())
    val ignoreCorruptFiles = spark.sessionState.conf.ignoreCorruptFiles
    val assumeBinaryIsString = spark.sessionState.conf.isParquetBinaryAsString
    val assumeInt96IsTimestamp = spark.sessionState.conf.isParquetINT96AsTimestamp
    val dataPath = new Path(snapshot.deltaLog.dataPath.toString)

    filterParquetFiles(files, dataPath, serializedConf.value, ignoreCorruptFiles,
      assumeBinaryIsString, assumeInt96IsTimestamp)(filterFileFn)
  }

  protected def filterParquetFiles(
      files: Seq[AddFile],
      dataPath: Path,
      configuration: Configuration,
      ignoreCorruptFiles: Boolean,
      assumeBinaryIsString: Boolean,
      assumeInt96IsTimestamp: Boolean)(
      filterFileFn: StructType => Boolean): Seq[AddFile] = {
    // Map from absolute file path to the corresponding AddFile action.
    val nameToAddFileMap = generateCandidateFileMap(dataPath, files)

    val fileStatuses = nameToAddFileMap.map { case (absPath, addFile) =>
      new FileStatus(
        /* length */ addFile.size,
        /* isDir */ false,
        /* blockReplication */ 0,
        /* blockSize */ 1,
        /* modificationTime */ addFile.modificationTime,
        new Path(absPath)
      )
    }

    // Read the Parquet footer of each candidate file in parallel to recover its physical schema.
    val footers = DeltaFileOperations.readParquetFootersInParallel(
      configuration,
      fileStatuses.toList,
      ignoreCorruptFiles)

    val converter =
      new ParquetToSparkSchemaConverter(assumeBinaryIsString, assumeInt96IsTimestamp)

    // Keep only the files whose physical schema satisfies the filter; these are the files
    // that need to be rewritten.
    val filesNeedToRewrite = footers.filter { footer =>
      val fileSchema = ParquetFileFormat.readSchemaFromFooter(footer, converter)
      filterFileFn(fileSchema)
    }.map(_.getFile.toString)
    filesNeedToRewrite.map(absPath => nameToAddFileMap(absPath))
  }
}
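
For context, a minimal usage sketch (not part of this commit): a caller, such as a command that drops the type widening table feature, could mix in ReorgTableHelper and combine filterParquetFiles with fileHasDifferentTypes to collect the files whose physical Parquet schema no longer matches the table schema. The object name, the way candidate files are obtained, and the use of the snapshot's logical schema as the comparison schema are illustrative assumptions, not part of the diff above.

// Hypothetical usage sketch: collect the files that must be rewritten before the
// type widening feature can be dropped. Only filterParquetFiles and
// fileHasDifferentTypes come from the commit above.
package org.apache.spark.sql.delta.commands

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.delta.Snapshot
import org.apache.spark.sql.delta.actions.AddFile

object TypeWideningRewriteExample extends ReorgTableHelper {
  /**
   * Returns the subset of `candidateFiles` whose physical Parquet schema differs from the
   * table schema and therefore must be rewritten.
   */
  def collectFilesToRewrite(
      spark: SparkSession,
      snapshot: Snapshot,
      candidateFiles: Seq[AddFile]): Seq[AddFile] = {
    // For simplicity this compares against the snapshot's logical schema; with column
    // mapping enabled, a real command would compare against the physical schema instead.
    val tableSchema = snapshot.schema
    filterParquetFiles(spark, snapshot, candidateFiles) { fileSchema =>
      fileHasDifferentTypes(fileSchema, tableSchema)
    }
  }
}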