From d52b20e54ea193f000b633197cb7dc506739e2aa Mon Sep 17 00:00:00 2001 From: yingsu00 Date: Wed, 27 Mar 2024 19:11:40 +0800 Subject: [PATCH] Fix the issue that the splits cannot be skipped for some special case In some rare cases, the tables created by other engines would put the partition key columns in the data file too, and fail to write ColumnStats for such columns. In such a case, the split from a partition with NULL partition keys failed to be skipped, causing wrong results. This is because HiveConnectorUtil::testFilters() assumes the partition keys are not in the data file, and is therefore unable to apply the filter on the partition key. This commit fixes this issue by also checking the partition key list even when the partition columns are in the data file. --- velox/connectors/hive/HiveConnectorUtil.cpp | 14 ++- velox/exec/tests/TableScanTest.cpp | 97 +++++++++++++++++-- .../tests/utils/HiveConnectorTestBase.cpp | 6 +- .../exec/tests/utils/HiveConnectorTestBase.h | 3 +- 4 files changed, 106 insertions(+), 14 deletions(-) diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index a1d296e650a72..0aeb94bbf7ba3 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -581,18 +581,24 @@ bool testFilters( for (const auto& child : scanSpec->children()) { if (child->filter()) { const auto& name = child->fieldName(); - if (!rowType->containsChild(name)) { - // If missing column is partition key. 
- auto iter = partitionKey.find(name); + auto iter = partitionKey.find(name); + // The partition key columns are written in the data file for + // Iceberg tables, so we need to test both cases. + if (!rowType->containsChild(name) || iter != partitionKey.end()) { if (iter != partitionKey.end() && iter->second.has_value()) { + // This is a non-null partition key. return applyPartitionFilter( (*partitionKeysHandle)[name]->dataType()->kind(), iter->second.value(), child->filter()); } - // Column is missing. Most likely due to schema evolution. + // Column is missing, most likely due to schema evolution. Or it's a + // partition key but the partition value is NULL. if (child->filter()->isDeterministic() && !child->filter()->testNull()) { + VLOG(1) << "Skipping " << filePath + << " because the filter testNull() failed for column " + << child->fieldName(); return false; } } else { diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index fc261aef119a2..028836bd45070 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -66,6 +66,16 @@ class TableScanTest : public virtual HiveConnectorTestBase { } std::vector makeVectors( + int32_t count, + int32_t rowsPerVector, + const RowTypePtr& rowType = nullptr, + std::function isNullAt = nullptr) { auto inputs = rowType ? 
rowType : rowType_; + return HiveConnectorTestBase::makeVectors( + inputs, count, rowsPerVector, isNullAt); + } + + std::vector makeNullableVectors( int32_t count, int32_t rowsPerVector, const RowTypePtr& rowType = nullptr) { @@ -154,7 +164,8 @@ class TableScanTest : public virtual HiveConnectorTestBase { void testPartitionedTableImpl( const std::string& filePath, const TypePtr& partitionType, - const std::optional& partitionValue) { + const std::optional& partitionValue, + bool isPartitionColumnInFile = false) { auto split = HiveConnectorSplitBuilder(filePath) .partitionKey("pkey", partitionValue) .build(); @@ -174,8 +185,11 @@ class TableScanTest : public virtual HiveConnectorTestBase { std::string partitionValueStr = partitionValue.has_value() ? "'" + *partitionValue + "'" : "null"; - assertQuery( - op, split, fmt::format("SELECT {}, * FROM tmp", partitionValueStr)); + + std::string duckdbSql = isPartitionColumnInFile + ? "SELECT * FROM tmp" + : fmt::format("SELECT {}, * FROM tmp", partitionValueStr); + assertQuery(op, split, duckdbSql); outputType = ROW({"c0", "pkey", "c1"}, {BIGINT(), partitionType, DOUBLE()}); op = PlanBuilder() @@ -213,12 +227,60 @@ class TableScanTest : public virtual HiveConnectorTestBase { op, split, fmt::format("SELECT {} FROM tmp", partitionValueStr)); } - void testPartitionedTable( + void testPartitionedTableWithFilterOnPartitionKey( + const RowTypePtr& fileSchema, const std::string& filePath, const TypePtr& partitionType, const std::optional& partitionValue) { - testPartitionedTableImpl(filePath, partitionType, partitionValue); - testPartitionedTableImpl(filePath, partitionType, std::nullopt); + // The filter on the partition key cannot eliminate the partition, therefore + // the split should NOT be skipped and all rows in it should be selected. 
+ auto split = HiveConnectorSplitBuilder(filePath) + .partitionKey("pkey", partitionValue) + .build(); + auto outputType = ROW({"c0", "c1"}, {BIGINT(), DOUBLE()}); + ColumnHandleMap assignments = { + {"pkey", partitionKey("pkey", partitionType)}, + {"c0", regularColumn("c0", BIGINT())}, + {"c1", regularColumn("c1", DOUBLE())}}; + std::string filter = partitionValue.has_value() + ? "pkey = " + partitionValue.value() + : "pkey IS NULL"; + auto op = PlanBuilder() + .startTableScan() + .dataColumns(fileSchema) + .outputType(outputType) + .assignments(assignments) + .subfieldFilter(filter) + .endTableScan() + .planNode(); + + assertQuery(op, split, fmt::format("SELECT c0, c1 FROM tmp")); + + // The split should be skipped because the partition key does not pass + // the filter + filter = partitionValue.has_value() ? "pkey <> " + partitionValue.value() + : "pkey IS NOT NULL"; + op = PlanBuilder() + .startTableScan() + .dataColumns(fileSchema) + .outputType(outputType) + .assignments(assignments) + .subfieldFilter(filter) + .endTableScan() + .planNode(); + assertQuery(op, split, fmt::format("SELECT c0, c1 FROM tmp WHERE 1 = 0")); + } + + void testPartitionedTable( + const std::string& filePath, + const TypePtr& partitionType, + const std::optional& partitionValue, + const RowTypePtr& fileSchema = nullptr, + bool isPartitionColumnInFile = false) { + testPartitionedTableImpl( + filePath, partitionType, partitionValue, isPartitionColumnInFile); + testPartitionedTableImpl( + filePath, partitionType, std::nullopt, isPartitionColumnInFile); } RowTypePtr rowType_{ @@ -1678,7 +1740,28 @@ TEST_F(TableScanTest, partitionedTableDateKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, DATE(), "2023-10-27"); + testPartitionedTable(filePath->path, DATE(), "2023-10-27", rowType); +} + +// Partition key was written as a real column in the data file, and the value is +// NULL. 
The column does not have column statistics +TEST_F(TableScanTest, partitionedTablePartitionKeyInFile) { + auto fileSchema = ROW({"c0", "c1", "pkey"}, {BIGINT(), DOUBLE(), BIGINT()}); + auto filePath = TempFilePath::create(); + // Create values of all nulls. + auto vectors = + makeVectors(10, 1'000, fileSchema, [](vector_size_t i) { return true; }); + // auto vectors = makeVectors(10, 1'000, rowType); + writeToFile( + filePath->path, + vectors, + std::make_shared(), + false); + createDuckDbTable(vectors); + testPartitionedTable( + filePath->path, BIGINT(), std::nullopt, fileSchema, true); + testPartitionedTableWithFilterOnPartitionKey( + fileSchema, filePath->path, BIGINT(), std::nullopt); } std::vector toStringViews(const std::vector& values) { diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.cpp b/velox/exec/tests/utils/HiveConnectorTestBase.cpp index 63dd953ec60a4..6cbd7ccbf7e7e 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.cpp +++ b/velox/exec/tests/utils/HiveConnectorTestBase.cpp @@ -89,11 +89,13 @@ void HiveConnectorTestBase::writeToFile( std::vector HiveConnectorTestBase::makeVectors( const RowTypePtr& rowType, int32_t numVectors, - int32_t rowsPerVector) { + int32_t rowsPerVector, + std::function isNullAt) { std::vector vectors; for (int32_t i = 0; i < numVectors; ++i) { auto vector = std::dynamic_pointer_cast( - velox::test::BatchMaker::createBatch(rowType, rowsPerVector, *pool_)); + velox::test::BatchMaker::createBatch( + rowType, rowsPerVector, *pool_, isNullAt)); vectors.push_back(vector); } return vectors; diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.h b/velox/exec/tests/utils/HiveConnectorTestBase.h index 044caa8b69cf0..cf76bc118bed9 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.h +++ b/velox/exec/tests/utils/HiveConnectorTestBase.h @@ -53,7 +53,8 @@ class HiveConnectorTestBase : public OperatorTestBase { std::vector makeVectors( const RowTypePtr& rowType, int32_t numVectors, - int32_t rowsPerVector); + 
int32_t rowsPerVector, + std::function isNullAt = nullptr); using OperatorTestBase::assertQuery;