From 1d95a474aa2489e07a0c38bd292d2294978d207f Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Tue, 19 Mar 2024 18:11:57 -0700 Subject: [PATCH] Fix bug in PrestoVectorSerializers when serializing empty Vectors with complex types (#9157) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/9157 PrestoIterativeVectorSerializer currently only supports serializing empty Vectors/selections when the columns are flat primitives. If the columns are complex attempting to serialize them will trigger a SIGSEGV. PrestoBatchVectorSerializer doesn't support serializing empty Vectors/selections at all. The fixes VectorStream to initialize itself and its children correctly when the Vector is empty. It also updates PrestoBatchVectorSerializer to only call serialize on the children if the Vector is non-empty. With these two changes both serializers support any column types in empty Vectors. Reviewed By: aozturk Differential Revision: D55090408 fbshipit-source-id: 720bba19c265f0e1ee7dc55245e344aa4cda29fa --- velox/serializers/PrestoSerializer.cpp | 22 +++- .../tests/PrestoSerializerTest.cpp | 114 +++++++++++++++++- 2 files changed, 134 insertions(+), 2 deletions(-) diff --git a/velox/serializers/PrestoSerializer.cpp b/velox/serializers/PrestoSerializer.cpp index 60f7049c2a2e..97b8cd52e100 100644 --- a/velox/serializers/PrestoSerializer.cpp +++ b/velox/serializers/PrestoSerializer.cpp @@ -1371,6 +1371,24 @@ class VectorStream { values_(streamArena) { if (initialNumRows == 0) { initializeHeader(typeToEncodingName(type), *streamArena); + if (type_->size() > 0) { + hasLengths_ = true; + children_.resize(type_->size()); + for (int32_t i = 0; i < type_->size(); ++i) { + children_[i] = std::make_unique( + type_->childAt(i), + std::nullopt, + getChildAt(vector, i), + streamArena_, + initialNumRows, + opts_); + } + + // The first element in the offsets in the wire format is always 0 for + // nested types. + lengths_.startWrite(sizeof(vector_size_t)); + lengths_.appendOne(0); + } return; } @@ -3632,7 +3650,9 @@ class PrestoBatchVectorSerializer : public BatchVectorSerializer { numRows, opts_); - serializeColumn(vector->childAt(i), ranges, streams[i].get(), scratch); + if (numRows > 0) { + serializeColumn(vector->childAt(i), ranges, streams[i].get(), scratch); + } } flushStreams( diff --git a/velox/serializers/tests/PrestoSerializerTest.cpp b/velox/serializers/tests/PrestoSerializerTest.cpp index b76cc2560d98..fedf8d33867b 100644 --- a/velox/serializers/tests/PrestoSerializerTest.cpp +++ b/velox/serializers/tests/PrestoSerializerTest.cpp @@ -735,6 +735,25 @@ class PrestoSerializerTest alphabetSize / factor); } + RowVectorPtr makeEmptyTestVector() { + return makeRowVector( + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}, + {makeFlatVector({}), + makeFlatVector({}), + makeFlatVector({}), + makeFlatVector({}), + makeFlatVector({}), + makeFlatVector({}), + makeFlatVector({}), + makeFlatVector({}), + makeFlatVector({}), + makeRowVector( + {"1", "2"}, + {makeFlatVector({}), makeFlatVector({})}), + makeArrayVector({}), + makeMapVector({})}); + } + std::unique_ptr serde_; folly::Random::DefaultGenerator rng_; }; @@ -771,7 +790,7 @@ TEST_P(PrestoSerializerTest, dictionaryWithExtraNulls) { } TEST_P(PrestoSerializerTest, emptyPage) { - auto rowVector = makeRowVector(ROW({"a"}, {BIGINT()}), 0); + auto rowVector = makeEmptyTestVector(); std::ostringstream out; serialize(rowVector, &out, nullptr); @@ -781,6 +800,50 @@ TEST_P(PrestoSerializerTest, emptyPage) { assertEqualVectors(deserialized, rowVector); } +TEST_P(PrestoSerializerTest, serializeNoRowsSelected) { + std::ostringstream out; + facebook::velox::serializer::presto::PrestoOutputStreamListener listener; + OStreamOutputStream output(&out, &listener); + auto paramOptions = getParamSerdeOptions(nullptr); + auto arena = std::make_unique(pool_.get()); + auto rowVector = makeRowVector( + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}, + {makeFlatVector({true, false, true, false}), + makeFlatVector({1, 2, 3, 4}), + makeFlatVector({5, 6, 7, 8}), + makeFlatVector({9, 10, 11, 12}), + makeFlatVector({13, 14, 15, 16}), + makeFlatVector({17.0, 18.0, 19.0, 20.0}), + makeFlatVector({21.0, 22.0, 23.0, 24.0}), + makeFlatVector({"25", "26", "27", "28"}), + makeFlatVector({{29, 30}, {32, 32}, {33, 34}, {35, 36}}), + makeRowVector( + {"1", "2"}, + {makeFlatVector({"37", "38", "39", "40"}), + makeFlatVector({41, 42, 43, 44})}), + makeArrayVector({{45, 46}, {47, 48}, {49, 50}, {51, 52}}), + makeMapVector( + {{{"53", 54}, {"55", 56}}, + {{"57", 58}, {"59", 60}}, + {{"61", 62}, {"63", 64}}, + {{"65", 66}, {"67", 68}}})}); + const IndexRange noRows{0, 0}; + + auto serializer = serde_->createIterativeSerializer( + asRowType(rowVector->type()), + rowVector->size(), + arena.get(), + ¶mOptions); + + serializer->append(rowVector, folly::Range(&noRows, 1)); + serializer->flush(&output); + + auto expected = makeEmptyTestVector(); + auto rowType = asRowType(expected->type()); + auto deserialized = deserialize(rowType, out.str(), nullptr); + assertEqualVectors(deserialized, expected); +} + TEST_P(PrestoSerializerTest, emptyArray) { auto arrayVector = makeArrayVector( 1'000, @@ -1074,6 +1137,55 @@ TEST_P(PrestoSerializerTest, dictionaryEncodingTurnedOff) { ASSERT_EQ(deserialized->childAt(9)->encoding(), VectorEncoding::Simple::FLAT); } +TEST_P(PrestoSerializerTest, emptyVectorBatchVectorSerializer) { + // Serialize an empty RowVector. + auto rowVector = makeEmptyTestVector(); + + std::ostringstream out; + serializeBatch(rowVector, &out, nullptr); + + auto rowType = asRowType(rowVector->type()); + auto deserialized = deserialize(rowType, out.str(), nullptr); + assertEqualVectors(deserialized, rowVector); +} + +TEST_P(PrestoSerializerTest, serializeNoRowsSelectedBatchVectorSerializer) { + std::ostringstream out; + facebook::velox::serializer::presto::PrestoOutputStreamListener listener; + OStreamOutputStream output(&out, &listener); + auto paramOptions = getParamSerdeOptions(nullptr); + auto serializer = serde_->createBatchSerializer(pool_.get(), ¶mOptions); + auto rowVector = makeRowVector( + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"}, + {makeFlatVector({true, false, true, false}), + makeFlatVector({1, 2, 3, 4}), + makeFlatVector({5, 6, 7, 8}), + makeFlatVector({9, 10, 11, 12}), + makeFlatVector({13, 14, 15, 16}), + makeFlatVector({17.0, 18.0, 19.0, 20.0}), + makeFlatVector({21.0, 22.0, 23.0, 24.0}), + makeFlatVector({"25", "26", "27", "28"}), + makeFlatVector({{29, 30}, {32, 32}, {33, 34}, {35, 36}}), + makeRowVector( + {"1", "2"}, + {makeFlatVector({"37", "38", "39", "40"}), + makeFlatVector({41, 42, 43, 44})}), + makeArrayVector({{45, 46}, {47, 48}, {49, 50}, {51, 52}}), + makeMapVector( + {{{"53", 54}, {"55", 56}}, + {{"57", 58}, {"59", 60}}, + {{"61", 62}, {"63", 64}}, + {{"65", 66}, {"67", 68}}})}); + const IndexRange noRows{0, 0}; + + serializer->serialize(rowVector, folly::Range(&noRows, 1), &output); + + auto expected = makeEmptyTestVector(); + auto rowType = asRowType(expected->type()); + auto deserialized = deserialize(rowType, out.str(), nullptr); + assertEqualVectors(deserialized, expected); +} + TEST_P(PrestoSerializerTest, lazy) { constexpr int kSize = 1000; auto rowVector = makeTestVector(kSize);