@@ -16,11 +16,15 @@
 package io.delta.kernel.defaults.internal.parquet
 
 import java.math.BigDecimal
-import io.delta.golden.GoldenTableUtils.goldenTableFile
+import java.util.TimeZone
+
+import io.delta.golden.GoldenTableUtils.{goldenTableFile, goldenTablePath}
 import io.delta.kernel.defaults.utils.{ExpressionTestUtils, TestRow}
 import io.delta.kernel.test.VectorTestUtils
 import io.delta.kernel.types._
+import org.apache.parquet.io.ParquetDecodingException
+import org.apache.spark.sql.internal.SQLConf
 import org.scalatest.funsuite.AnyFunSuite
 
 class ParquetFileReaderSuite extends AnyFunSuite
   with ParquetSuiteBase with VectorTestUtils with ExpressionTestUtils {
@@ -88,6 +92,148 @@ class ParquetFileReaderSuite extends AnyFunSuite
     }
   }
 
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+  // Tests covering reading parquet values into a wider column type                           //
+  ///////////////////////////////////////////////////////////////////////////////////////////////
+  /**
+   * Test case for reading a column using a given type.
+   *
+   * @param columnName Column to read from the file.
+   * @param toType Read type to use. May be different from the actual Parquet type.
+   * @param expectedExpr Expression returning the expected value for each row in the file.
+   */
+  case class TestCase(columnName: String, toType: DataType, expectedExpr: Int => Any)
+
+  private val supportedConversions: Seq[TestCase] = Seq(
+    // The 'ByteType' column was generated with overflowing values, so we call i.toByte here as
+    // well to wrap around the same way when computing the expected values.
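+    // e.g. row 130 holds 130.toByte == -126, not 130.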
+    TestCase("ByteType", ShortType.SHORT, i => if (i % 72 != 0) i.toByte.toShort else null),
+    TestCase("ByteType", IntegerType.INTEGER, i => if (i % 72 != 0) i.toByte.toInt else null),
+    TestCase("ByteType", LongType.LONG, i => if (i % 72 != 0) i.toByte.toLong else null),
+    TestCase("ByteType", DoubleType.DOUBLE, i => if (i % 72 != 0) i.toByte.toDouble else null),
+    TestCase("ShortType", IntegerType.INTEGER, i => if (i % 56 != 0) i else null),
+    TestCase("ShortType", LongType.LONG, i => if (i % 56 != 0) i.toLong else null),
+    TestCase("ShortType", DoubleType.DOUBLE, i => if (i % 56 != 0) i.toDouble else null),
+    TestCase("IntegerType", LongType.LONG, i => if (i % 23 != 0) i.toLong else null),
+    TestCase("IntegerType", DoubleType.DOUBLE, i => if (i % 23 != 0) i.toDouble else null),
+
+    TestCase("FloatType", DoubleType.DOUBLE,
+      i => if (i % 28 != 0) (i * 0.234).toFloat.toDouble else null),
+    TestCase("decimal", new DecimalType(12, 2),
+      i => if (i % 67 != 0) java.math.BigDecimal.valueOf(i * 12352, 2) else null),
+    TestCase("decimal", new DecimalType(12, 4),
+      i => if (i % 67 != 0) java.math.BigDecimal.valueOf(i * 1235200, 4) else null),
+    TestCase("decimal", new DecimalType(26, 10),
+      i => if (i % 67 != 0) java.math.BigDecimal.valueOf(i * 12352, 2).setScale(10) else null),
+    TestCase("IntegerType", new DecimalType(10, 0),
+      i => if (i % 23 != 0) new java.math.BigDecimal(i) else null),
+    TestCase("IntegerType", new DecimalType(16, 4),
+      i => if (i % 23 != 0) new java.math.BigDecimal(i).setScale(4) else null),
+    TestCase("LongType", new DecimalType(20, 0),
+      i => if (i % 25 != 0) new java.math.BigDecimal(i + 1) else null),
+    TestCase("LongType", new DecimalType(28, 6),
+      i => if (i % 25 != 0) new java.math.BigDecimal(i + 1).setScale(6) else null),
+
+    TestCase("BinaryType", StringType.STRING, i => if (i % 59 != 0) i.toString else null)
+  )
+
+  // The following conversions are supported by Kernel but not by Spark with parquet-mr.
+  // TODO: We should properly reject these conversions; many of them produce wrong results.
+  // Collecting them here to document the current behavior.
+  private val kernelOnlyConversions: Seq[TestCase] = Seq(
+    // These conversions silently overflow.
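+    // e.g. the int value 199 read as BYTE silently wraps to 199.toByte == -57.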
+    TestCase("ShortType", ByteType.BYTE, i => if (i % 56 != 0) i.toByte else null),
+    TestCase("IntegerType", ByteType.BYTE, i => if (i % 23 != 0) i.toByte else null),
+    TestCase("IntegerType", ShortType.SHORT, i => if (i % 23 != 0) i.toShort else null),
+
+    // This reads the unscaled decimal value as a long, which is wrong.
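+    // e.g. the decimal 123.52 is physically stored as the unscaled value 12352 with scale 2,
+    // so reading it as LONG surfaces the raw 12352L.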
+    TestCase("decimal", LongType.LONG, i => if (i % 67 != 0) i.toLong * 12352 else null),
+
+    // The following conversions seem legit, although Spark rejects them.
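+    // DATE is physically an INT32 counting days since the Unix epoch, so reinterpreting small
+    // integer columns as DATE is well-defined.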
+    TestCase("ByteType", DateType.DATE, i => if (i % 72 != 0) i.toByte.toInt else null),
+    TestCase("ShortType", DateType.DATE, i => if (i % 56 != 0) i else null),
+    TestCase("IntegerType", DateType.DATE, i => if (i % 23 != 0) i else null),
+    TestCase("StringType", BinaryType.BINARY, i => if (i % 57 != 0) i.toString.getBytes else null)
+  )
+
+  for (testCase <- supportedConversions ++ kernelOnlyConversions)
+    test(s"parquet supported conversion - ${testCase.columnName} -> ${testCase.toType}") {
+      val inputLocation = goldenTablePath("parquet-all-types")
+      val readSchema = new StructType().add(testCase.columnName, testCase.toType)
+      val result = readParquetFilesUsingKernel(inputLocation, readSchema)
+      val expected = (0 until 200).map { i => TestRow(testCase.expectedExpr(i)) }
+      checkAnswer(result, expected)
+
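+      // Spark's parquet-mr reader supports only the conversions in supportedConversions; for
+      // those, cross-check Kernel's result against Spark with the vectorized reader disabled.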
+      if (!kernelOnlyConversions.contains(testCase)) {
+        withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
+          val sparkResult = readParquetFilesUsingSpark(inputLocation, readSchema)
+          checkAnswer(result, sparkResult)
+        }
+      }
+    }
+
+  test("parquet supported conversion - date -> timestamp_ntz") {
+    val timezones =
+      Seq("UTC", "Iceland", "PST", "America/Los_Angeles", "Etc/GMT+9", "Asia/Beirut", "JST")
+    for (fromTimezone <- timezones; toTimezone <- timezones) {
+      val inputLocation = goldenTablePath(s"data-reader-date-types-$fromTimezone")
+      TimeZone.setDefault(TimeZone.getTimeZone(toTimezone))
+
+      val readSchema = new StructType().add("date", TimestampNTZType.TIMESTAMP_NTZ)
+      val result = readParquetFilesUsingKernel(inputLocation, readSchema)
+      // 1577836800000000L -> 2020-01-01 00:00:00 UTC
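+      // 2020-01-01 is 18262 days since the epoch; 18262 * 86400 * 1000000L == 1577836800000000L
+      // micros. TIMESTAMP_NTZ carries no timezone, so the value is the same for every default
+      // timezone set above.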
+      checkAnswer(result, Seq(TestRow(1577836800000000L)))
+    }
+  }
+
+  def checkParquetReadError(inputLocation: String, readSchema: StructType): Unit = {
+    val ex = intercept[Throwable] {
+      readParquetFilesUsingKernel(inputLocation, readSchema)
+    }
+    // We don't properly reject unsupported conversions yet and the resulting errors vary a lot;
+    // this checks the various error messages we may get.
+    // TODO: Reject unsupported conversions in a uniform way.
+    assert(
+      ex.getMessage.contains("Can not read value") ||
+      ex.getMessage.contains("column with Parquet type") ||
+      ex.getMessage.contains("Unable to create Parquet converter for") ||
+      ex.getMessage.contains("Found Delta type Decimal") ||
+      ex.getMessage.contains("cannot be cast to")
+    )
+  }
+
+  for (column <- Seq("BooleanType", "ByteType", "ShortType", "IntegerType", "LongType",
+      "FloatType", "DoubleType", "StringType", "BinaryType")) {
+    test(s"parquet unsupported conversion from $column") {
+      val inputLocation = goldenTablePath("parquet-all-types")
+      val supportedTypes = (supportedConversions ++ kernelOnlyConversions)
+        .filter(_.columnName == column)
+        .map(_.toType)
+      val unsupportedTypes = ALL_TYPES
+        .filterNot(supportedTypes.contains)
+        .filterNot(_.getClass.getSimpleName == column)
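+      // The second filterNot skips the identity case: reading a column as its own type is
+      // always supported.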
+
+      for (toType <- unsupportedTypes) {
+        val readSchema = new StructType().add(column, toType)
+        withClue(s"Converting $column to $toType") {
+          checkParquetReadError(inputLocation, readSchema)
+        }
+      }
+    }
+  }
+
+  test("parquet unsupported conversion from decimal") {
+    val inputLocation = goldenTablePath("parquet-all-types")
+    // The 'decimal' column is Decimal(10, 2), which fits into a long.
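+    // Decimals of that precision are presumably written as Parquet INT64 in this file, which is
+    // why LONG is the only other type that can read the physical value back.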
+    for (toType <- ALL_TYPES.filterNot(_ == LongType.LONG)) {
+      val readSchema = new StructType().add("decimal", toType)
+      withClue(s"Converting decimal to $toType") {
+        checkParquetReadError(inputLocation, readSchema)
+      }
+    }
+  }
+
   test("read subset of columns") {
     val tablePath = goldenTableFile("parquet-all-types").getAbsolutePath
     val readSchema = new StructType()