[SPARK-40876][SQL][FOLLOWUP] Widening type promotion from integers to decimal in Parquet vectorized reader #44803
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet

import java.io.File

import org.apache.hadoop.fs.Path
import org.apache.parquet.column.{Encoding, ParquetProperties}
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetOutputFormat}

@@ -31,6 +32,7 @@ import org.apache.spark.sql.internal.{LegacyBehaviorPolicy, SQLConf}

import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.DecimalType.{ByteDecimal, IntDecimal, LongDecimal, ShortDecimal}

class ParquetTypeWideningSuite
  extends QueryTest
@@ -121,6 +123,19 @@ class ParquetTypeWideningSuite
      if (dictionaryEnabled && !DecimalType.isByteArrayDecimalType(dataType)) {
        assertAllParquetFilesDictionaryEncoded(dir)
      }

      // Check which encoding was used when writing Parquet V2 files.
      val isParquetV2 = spark.conf.getOption(ParquetOutputFormat.WRITER_VERSION)
        .contains(ParquetProperties.WriterVersion.PARQUET_2_0.toString)
      if (isParquetV2) {
        if (dictionaryEnabled) {
          assertParquetV2Encoding(dir, Encoding.PLAIN)
        } else if (DecimalType.is64BitDecimalType(dataType)) {
          assertParquetV2Encoding(dir, Encoding.DELTA_BINARY_PACKED)
        } else if (DecimalType.isByteArrayDecimalType(dataType)) {
          assertParquetV2Encoding(dir, Encoding.DELTA_BYTE_ARRAY)
        }
      }
      df
    }
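
For context, the Parquet V2 branch above only triggers when the writer version is set in the session configuration. A minimal sketch of that toggle, assuming an existing SparkSession named `spark` (the suite itself may use a different mechanism, such as a scoped config helper):

import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.ParquetOutputFormat

// Sketch only: with this key set, subsequent parquet writes in the session use the
// V2 writer, so DELTA_BINARY_PACKED / DELTA_BYTE_ARRAY encodings can appear when
// dictionary encoding is not used.
spark.conf.set(
  ParquetOutputFormat.WRITER_VERSION,
  ParquetProperties.WriterVersion.PARQUET_2_0.toString)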

@@ -145,6 +160,27 @@ class ParquetTypeWideningSuite
    }
  }

  /**
   * Asserts that all parquet files in the given directory have all their columns encoded with the
   * given encoding.
   */
  private def assertParquetV2Encoding(dir: File, expected_encoding: Encoding): Unit = {
    dir.listFiles(_.getName.endsWith(".parquet")).foreach { file =>
      val parquetMetadata = ParquetFileReader.readFooter(
        spark.sessionState.newHadoopConf(),
        new Path(dir.toString, file.getName),
        ParquetMetadataConverter.NO_FILTER)
      parquetMetadata.getBlocks.forEach { block =>
        block.getColumns.forEach { col =>
          assert(
            col.getEncodings.contains(expected_encoding),
            s"Expected column '${col.getPath.toDotString}' to use encoding $expected_encoding " +
              s"but found ${col.getEncodings}.")
        }
      }
    }
  }

  for {
    (values: Seq[String], fromType: DataType, toType: DataType) <- Seq(
      (Seq("1", "2", Short.MinValue.toString), ShortType, IntegerType),

@@ -157,24 +193,77 @@
      (Seq("2020-01-01", "2020-01-02", "1312-02-27"), DateType, TimestampNTZType)
    )
  }
    test(s"parquet widening conversion $fromType -> $toType") {
      checkAllParquetReaders(values, fromType, toType, expectError = false)
    }
test(s"parquet widening conversion $fromType -> $toType") { | ||
checkAllParquetReaders(values, fromType, toType, expectError = false) | ||
} | ||
|
||

  for {
    (values: Seq[String], fromType: DataType, toType: DataType) <- Seq(
      (Seq("1", Byte.MaxValue.toString), ByteType, IntDecimal),
      (Seq("1", Byte.MaxValue.toString), ByteType, LongDecimal),
      (Seq("1", Short.MaxValue.toString), ShortType, IntDecimal),
      (Seq("1", Short.MaxValue.toString), ShortType, LongDecimal),
      (Seq("1", Short.MaxValue.toString), ShortType, DecimalType(DecimalType.MAX_PRECISION, 0)),
      (Seq("1", Int.MaxValue.toString), IntegerType, IntDecimal),
      (Seq("1", Int.MaxValue.toString), IntegerType, LongDecimal),
      (Seq("1", Int.MaxValue.toString), IntegerType, DecimalType(DecimalType.MAX_PRECISION, 0)),
      (Seq("1", Long.MaxValue.toString), LongType, LongDecimal),
      (Seq("1", Long.MaxValue.toString), LongType, DecimalType(DecimalType.MAX_PRECISION, 0)),
      (Seq("1", Byte.MaxValue.toString), ByteType, DecimalType(IntDecimal.precision + 1, 1)),
      (Seq("1", Short.MaxValue.toString), ShortType, DecimalType(IntDecimal.precision + 1, 1)),
      (Seq("1", Int.MaxValue.toString), IntegerType, DecimalType(IntDecimal.precision + 1, 1)),
      (Seq("1", Long.MaxValue.toString), LongType, DecimalType(LongDecimal.precision + 1, 1))
    )
  }
    test(s"parquet widening conversion $fromType -> $toType") {
      checkAllParquetReaders(values, fromType, toType, expectError = false)
    }
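
These widening cases correspond to reads like the following minimal end-to-end sketch, assuming a local SparkSession `spark` and a scratch path (both assumptions for illustration, not part of the PR):

// Write a plain INT32 column.
val path = "/tmp/parquet-int-widening"  // hypothetical scratch location
spark.range(3).selectExpr("CAST(id AS INT) AS id")
  .write.mode("overwrite").parquet(path)

// Read it back with a wider decimal schema. With this change the vectorized reader
// promotes the INT32 values to DECIMAL(10, 0) (IntDecimal) instead of erroring.
spark.read.schema("id DECIMAL(10, 0)").parquet(path).show()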

  for {
    (values: Seq[String], fromType: DataType, toType: DataType) <- Seq(
      (Seq("1", "2", Int.MinValue.toString), LongType, IntegerType),
      (Seq("1.23", "10.34"), DoubleType, FloatType),
      (Seq("1.23", "10.34"), FloatType, LongType),
      (Seq("1", "10"), LongType, DoubleType),
      (Seq("1", "10"), LongType, DateType),
      (Seq("1", "10"), IntegerType, TimestampType),
      (Seq("1", "10"), IntegerType, TimestampNTZType),
      (Seq("2020-01-01", "2020-01-02", "1312-02-27"), DateType, TimestampType)
    )
  }
    test(s"unsupported parquet conversion $fromType -> $toType") {
      checkAllParquetReaders(values, fromType, toType, expectError = true)
    }
test(s"unsupported parquet conversion $fromType -> $toType") { | ||
checkAllParquetReaders(values, fromType, toType, expectError = true) | ||
} | ||
|
||

  for {
    (values: Seq[String], fromType: DataType, toType: DecimalType) <- Seq(
      // Parquet stores byte, short, int values as INT32, which then requires using a decimal that
      // can hold at least 4 byte integers.
      (Seq("1", "2"), ByteType, DecimalType(1, 0)),
      (Seq("1", "2"), ByteType, ByteDecimal),
      (Seq("1", "2"), ShortType, ByteDecimal),
      (Seq("1", "2"), ShortType, ShortDecimal),
      (Seq("1", "2"), IntegerType, ShortDecimal),
      (Seq("1", "2"), ByteType, DecimalType(ByteDecimal.precision + 1, 1)),
      (Seq("1", "2"), ShortType, DecimalType(ShortDecimal.precision + 1, 1)),
      (Seq("1", "2"), LongType, IntDecimal),
      (Seq("1", "2"), ByteType, DecimalType(ByteDecimal.precision - 1, 0)),
      (Seq("1", "2"), ShortType, DecimalType(ShortDecimal.precision - 1, 0)),
      (Seq("1", "2"), IntegerType, DecimalType(IntDecimal.precision - 1, 0)),
      (Seq("1", "2"), LongType, DecimalType(LongDecimal.precision - 1, 0)),
      (Seq("1", "2"), ByteType, DecimalType(ByteDecimal.precision, 1)),
      (Seq("1", "2"), ShortType, DecimalType(ShortDecimal.precision, 1)),
      (Seq("1", "2"), IntegerType, DecimalType(IntDecimal.precision, 1)),
      (Seq("1", "2"), LongType, DecimalType(LongDecimal.precision, 1))
    )
  }
    test(s"unsupported parquet conversion $fromType -> $toType") {
      checkAllParquetReaders(values, fromType, toType,
        expectError =
          // parquet-mr allows reading decimals into a smaller precision decimal type without
          // checking for overflows. See test below checking for the overflow case in parquet-mr.
          spark.conf.get(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key).toBoolean)
    }
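
To make the comment at the top of that block concrete: the byte/short-to-decimal cases above are rejected even though the logical values would fit, because the on-disk representation is an INT32 and the target decimal must be wide enough for any 4-byte integer. An illustrative snippet with the precisions involved (the named constants in Spark's DecimalType object correspond to these values):

import org.apache.spark.sql.types.DecimalType

// ByteDecimal = DecimalType(3, 0), ShortDecimal = DecimalType(5, 0),
// IntDecimal = DecimalType(10, 0), LongDecimal = DecimalType(20, 0).
val byteDecimal = DecimalType(3, 0)   // max 999: narrower than an arbitrary INT32 value
val shortDecimal = DecimalType(5, 0)  // max 99999: still narrower than an INT32
val intDecimal = DecimalType(10, 0)   // wide enough to hold any 4-byte integer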

  for {
    (values: Seq[String], fromType: DataType, toType: DataType) <- Seq(

@@ -201,17 +290,17 @@ class ParquetTypeWideningSuite
      Seq(5 -> 7, 5 -> 10, 5 -> 20, 10 -> 12, 10 -> 20, 20 -> 22) ++
      Seq(7 -> 5, 10 -> 5, 20 -> 5, 12 -> 10, 20 -> 10, 22 -> 20)
  }
    test(
      s"parquet decimal precision change Decimal($fromPrecision, 2) -> Decimal($toPrecision, 2)") {
      checkAllParquetReaders(
        values = Seq("1.23", "10.34"),
        fromType = DecimalType(fromPrecision, 2),
        toType = DecimalType(toPrecision, 2),
        expectError = fromPrecision > toPrecision &&
          // parquet-mr allows reading decimals into a smaller precision decimal type without
          // checking for overflows. See test below checking for the overflow case in parquet-mr.
          spark.conf.get(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key).toBoolean)
    }

Review comment: for non-vectorized parquet reader, what's the behavior? silent overflow?

Reply: Decimal values are set to null on overflow, see https://github.com/apache/spark/pull/44803/files/8935b284e5519038f78fd95c8d12e66224f29d63#diff-a5cfd7285f9adf95b2aeea90aa57cc35d2b8c6bddaa0f4652172d30a264d3614R347. Integers, on the other hand, wrap around on overflow. Arguably not great, but changing it would be a breaking change.

Reply: and vectorized reader just doesn't allow it?

Reply: Yes, the vectorized reader throws an exception, which is what this test is checking.
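
A sketch of the behaviour described in this thread, mirroring the Decimal(7, 2) -> Decimal(5, 2) case from the tests above; the SparkSession `spark` and the scratch path are assumptions for illustration:

import org.apache.spark.sql.internal.SQLConf

val path = "/tmp/parquet-decimal-overflow"  // hypothetical scratch location
val df = spark.sql(
  "SELECT CAST('1.23' AS DECIMAL(7, 2)) AS d UNION ALL SELECT CAST('12345.67' AS DECIMAL(7, 2))")
df.write.mode("overwrite").parquet(path)

// parquet-mr (vectorized reader off): the value that overflows DECIMAL(5, 2) comes back as null.
spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "false")
spark.read.schema("d DECIMAL(5, 2)").parquet(path).show()

// Vectorized reader: the same read throws instead of silently returning null.
spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true")
spark.read.schema("d DECIMAL(5, 2)").parquet(path).collect()  // expected to fail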

  for {
    ((fromPrecision, fromScale), (toPrecision, toScale)) <-

Review comment: why do we remove isDate?

Reply: This was redundant since reading an INT32 as TimestampNTZType necessarily requires converting the value. The fact that this only happens for parquet dates isn't really relevant here, and with the current change this would be the only case where we look at the parquet type annotation, which is a bit confusing.

Reply: Oh I see, this is inside isLazyDecodingSupported.
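
A simplified sketch of the idea behind that check (not the actual Spark implementation): lazy dictionary decoding only pays off when a dictionary-encoded Parquet value can be used for the requested Spark type without a per-value conversion, and reading an INT32 as TimestampNTZType always needs a conversion, so a separate date check on the Parquet type annotation adds nothing.

import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{INT32, INT64}
import org.apache.spark.sql.types._

// Hypothetical helper, for illustration only.
def supportsLazyDecoding(physicalType: PrimitiveTypeName, sparkType: DataType): Boolean =
  (physicalType, sparkType) match {
    // The dictionary entry can be copied as-is into the Spark column vector.
    case (INT32, IntegerType) => true
    case (INT64, LongType) => true
    // Everything else, including INT32 (e.g. a parquet DATE) read as TimestampNTZType,
    // needs each value converted, so lazy decoding does not apply.
    case _ => false
  }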