From 258db516de0a869a1ed47ecd59fc4053fb23628a Mon Sep 17 00:00:00 2001 From: PHILO-HE Date: Thu, 27 Jun 2024 19:05:13 -0700 Subject: [PATCH] Use legacySizeOfNull argument to determine the behavior of Spark size function (#10100) Summary: 1) Spark `size` function's legacySizeOfNull is specified either by other functions (e.g., `array_size` function) or by configuration. However, in the existing implementation, it only depends on the configuration property. This pr removes the configuration property, then just uses the legacySizeOfNull arg passed from framework like Gluten. 2) At Spark's analysis time, `array_size` is replaced with `size` function. And their implementations are same actually. This pr removes duplicate code. Pull Request resolved: https://github.com/facebookincubator/velox/pull/10100 Reviewed By: xiaoxmeng Differential Revision: D58825758 Pulled By: bikramSingh91 fbshipit-source-id: 5e1c9679832dfb6ac7dd15e3c0c1d979d3219b93 --- velox/core/QueryConfig.h | 9 -- velox/docs/functions/spark/array.rst | 17 ++-- velox/docs/functions/spark/map.rst | 11 ++- velox/functions/sparksql/ArraySizeFunction.h | 34 -------- velox/functions/sparksql/Register.cpp | 4 - velox/functions/sparksql/Size.cpp | 23 +++-- .../sparksql/tests/ArraySizeTest.cpp | 80 ----------------- velox/functions/sparksql/tests/CMakeLists.txt | 1 - velox/functions/sparksql/tests/SizeTest.cpp | 85 ++++++++++++++----- 9 files changed, 94 insertions(+), 170 deletions(-) delete mode 100644 velox/functions/sparksql/ArraySizeFunction.h delete mode 100644 velox/functions/sparksql/tests/ArraySizeTest.cpp diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index fd9f78186c72..2c8763f1e822 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -289,10 +289,6 @@ class QueryConfig { static constexpr const char* kPrestoArrayAggIgnoreNulls = "presto.array_agg.ignore_nulls"; - /// If false, size function returns null for null input. - static constexpr const char* kSparkLegacySizeOfNull = - "spark.legacy_size_of_null"; - // The default number of expected items for the bloomfilter. static constexpr const char* kSparkBloomFilterExpectedNumItems = "spark.bloom_filter.expected_num_items"; @@ -623,11 +619,6 @@ class QueryConfig { return get(kSpillableReservationGrowthPct, kDefaultPct); } - bool sparkLegacySizeOfNull() const { - constexpr bool kDefault{true}; - return get(kSparkLegacySizeOfNull, kDefault); - } - bool prestoArrayAggIgnoreNulls() const { return get(kPrestoArrayAggIgnoreNulls, false); } diff --git a/velox/docs/functions/spark/array.rst b/velox/docs/functions/spark/array.rst index 6b981a44228f..aa6a39015ab0 100644 --- a/velox/docs/functions/spark/array.rst +++ b/velox/docs/functions/spark/array.rst @@ -101,12 +101,6 @@ Array Functions SELECT array_repeat(100, 0); -- [] SELECT array_repeat(100, -1); -- [] -.. spark:function:: array_size(array(E)) -> integer - - Returns the size of the array. :: - - SELECT array_size(array(1, 2, 3)); -- 3 - .. spark:function:: array_sort(array(E)) -> array(E) Returns an array which has the sorted order of the input array(E). The elements of array(E) must @@ -193,11 +187,14 @@ Array Functions SELECT shuffle(array(0, 0, 0), 0); -- [0, 0, 0] SELECT shuffle(array(1, NULL, 1, NULL, 2), 0); -- [2, 1, NULL, NULL, 1] -.. spark:function:: size(array(E)) -> bigint +.. spark:function:: size(array(E), legacySizeOfNull) -> integer + + Returns the size of the array. Returns null for null input if `legacySizeOfNull` + is set to false. Otherwise, returns -1 for null input. :: - Returns the size of the array. Returns null for null input - if :doc:`spark.legacy_size_of_null <../../configs>` is set to false. - Otherwise, returns -1 for null input. + SELECT size(array(1, 2, 3), true); -- 3 + SELECT size(NULL, true); -- -1 + SELECT size(NULL, false); -- NULL .. spark:function:: sort_array(array(E)) -> array(E) diff --git a/velox/docs/functions/spark/map.rst b/velox/docs/functions/spark/map.rst index 923de1b45309..74735e9d5fe0 100644 --- a/velox/docs/functions/spark/map.rst +++ b/velox/docs/functions/spark/map.rst @@ -56,9 +56,12 @@ Map Functions MAP(ARRAY['a', 'b', 'c'], ARRAY[1, 2, 3]), (k, v1, v2) -> k || CAST(v1/v2 AS VARCHAR)); -.. spark:function:: size(map(K,V)) -> bigint +.. spark:function:: size(map(K,V), legacySizeOfNull) -> integer :noindex: - Returns the size of the input map. Returns null for null input - if :doc:`spark.legacy_size_of_null <../../configs>` is set to false. - Otherwise, returns -1 for null input. + Returns the size of the input map. Returns null for null input if ``legacySizeOfNull`` + is set to false. Otherwise, returns -1 for null input. :: + + SELECT size(map(array(1, 2), array(3, 4)), true); -- 2 + SELECT size(NULL, true); -- -1 + SELECT size(NULL, false); -- NULL diff --git a/velox/functions/sparksql/ArraySizeFunction.h b/velox/functions/sparksql/ArraySizeFunction.h deleted file mode 100644 index e7857815fe2b..000000000000 --- a/velox/functions/sparksql/ArraySizeFunction.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include "velox/functions/Macros.h" - -namespace facebook::velox::functions::sparksql { - -template -struct ArraySizeFunction { - VELOX_DEFINE_FUNCTION_TYPES(T); - - FOLLY_ALWAYS_INLINE void call( - int32_t& out, - const arg_type>& inputArray) { - out = inputArray.size(); - } -}; -} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/Register.cpp b/velox/functions/sparksql/Register.cpp index 2dbeda471473..1978891e6923 100644 --- a/velox/functions/sparksql/Register.cpp +++ b/velox/functions/sparksql/Register.cpp @@ -30,7 +30,6 @@ #include "velox/functions/prestosql/URLFunctions.h" #include "velox/functions/sparksql/ArrayFlattenFunction.h" #include "velox/functions/sparksql/ArrayMinMaxFunction.h" -#include "velox/functions/sparksql/ArraySizeFunction.h" #include "velox/functions/sparksql/ArraySort.h" #include "velox/functions/sparksql/Bitwise.h" #include "velox/functions/sparksql/DateTimeFunctions.h" @@ -170,9 +169,6 @@ inline void registerArrayMinMaxFunctions(const std::string& prefix) { void registerFunctions(const std::string& prefix) { registerAllSpecialFormGeneralFunctions(); - registerFunction>( - {prefix + "array_size"}); - // Register size functions registerSize(prefix + "size"); diff --git a/velox/functions/sparksql/Size.cpp b/velox/functions/sparksql/Size.cpp index 96bd820d5680..70d7d3d509d2 100644 --- a/velox/functions/sparksql/Size.cpp +++ b/velox/functions/sparksql/Size.cpp @@ -30,33 +30,40 @@ struct Size { template FOLLY_ALWAYS_INLINE void initialize( const std::vector& /*inputTypes*/, - const core::QueryConfig& config, - const TInput* input /*input*/) { - legacySizeOfNull_ = config.sparkLegacySizeOfNull(); + const core::QueryConfig& /*config*/, + const TInput* /*input*/, + const bool* legacySizeOfNull) { + if (legacySizeOfNull == nullptr) { + VELOX_USER_FAIL("Constant legacySizeOfNull is expected."); + } + legacySizeOfNull_ = *legacySizeOfNull; } template - FOLLY_ALWAYS_INLINE bool callNullable(int32_t& out, const TInput* input) { + FOLLY_ALWAYS_INLINE bool callNullable( + int32_t& out, + const TInput* input, + const bool* /*legacySizeOfNull*/) { if (input == nullptr) { if (legacySizeOfNull_) { out = -1; return true; - } else { - return false; } + return false; } out = input->size(); return true; } private: + // If true, returns -1 for null input. Otherwise, returns null. bool legacySizeOfNull_; }; } // namespace void registerSize(const std::string& prefix) { - registerFunction>({prefix}); - registerFunction>({prefix}); + registerFunction, bool>({prefix}); + registerFunction, bool>({prefix}); } } // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/tests/ArraySizeTest.cpp b/velox/functions/sparksql/tests/ArraySizeTest.cpp deleted file mode 100644 index 1591ff484a9a..000000000000 --- a/velox/functions/sparksql/tests/ArraySizeTest.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h" -#include "velox/type/Timestamp.h" - -using namespace facebook::velox; -using namespace facebook::velox::test; -using namespace facebook::velox::functions::test; - -namespace facebook::velox::functions::sparksql::test { -namespace { - -class ArraySizeTest : public SparkFunctionBaseTest { - protected: - template - int32_t arraySize(const std::vector>& input) { - auto row = makeRowVector({makeNullableArrayVector( - std::vector>>{input})}); - return evaluateOnce("array_size(c0)", row).value(); - } -}; - -TEST_F(ArraySizeTest, boolean) { - EXPECT_EQ(arraySize({true, false}), 2); - EXPECT_EQ(arraySize({true}), 1); - EXPECT_EQ(arraySize({}), 0); - EXPECT_EQ(arraySize({true, false, true, std::nullopt}), 4); -} - -TEST_F(ArraySizeTest, smallint) { - EXPECT_EQ(arraySize({}), 0); - EXPECT_EQ(arraySize({1}), 1); - EXPECT_EQ(arraySize({std::nullopt}), 1); - EXPECT_EQ(arraySize({std::nullopt, 1}), 2); -} - -TEST_F(ArraySizeTest, real) { - EXPECT_EQ(arraySize({}), 0); - EXPECT_EQ(arraySize({1.1}), 1); - EXPECT_EQ(arraySize({std::nullopt}), 1); - EXPECT_EQ(arraySize({std::nullopt, 1.1}), 2); -} - -TEST_F(ArraySizeTest, varchar) { - EXPECT_EQ(arraySize({"red", "blue"}), 2); - EXPECT_EQ( - arraySize({std::nullopt, "blue", "yellow", "orange"}), 4); - EXPECT_EQ(arraySize({}), 0); - EXPECT_EQ(arraySize({std::nullopt}), 1); -} - -TEST_F(ArraySizeTest, integer) { - EXPECT_EQ(arraySize({1, 2}), 2); -} - -TEST_F(ArraySizeTest, timestamp) { - auto ts = [](int64_t micros) { return Timestamp::fromMicros(micros); }; - EXPECT_EQ(arraySize({}), 0); - EXPECT_EQ(arraySize({std::nullopt}), 1); - EXPECT_EQ(arraySize({ts(0), ts(1)}), 2); -} -} // namespace -} // namespace facebook::velox::functions::sparksql::test diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt index 56b3fea8da6b..af4b4c3f66da 100644 --- a/velox/functions/sparksql/tests/CMakeLists.txt +++ b/velox/functions/sparksql/tests/CMakeLists.txt @@ -19,7 +19,6 @@ add_executable( ArrayGetTest.cpp ArrayMaxTest.cpp ArrayMinTest.cpp - ArraySizeTest.cpp ArraySortTest.cpp ArrayShuffleTest.cpp ArgGeneratorTest.cpp diff --git a/velox/functions/sparksql/tests/SizeTest.cpp b/velox/functions/sparksql/tests/SizeTest.cpp index d885e41705af..ff69ca04a5ac 100644 --- a/velox/functions/sparksql/tests/SizeTest.cpp +++ b/velox/functions/sparksql/tests/SizeTest.cpp @@ -15,8 +15,11 @@ */ #include +#include #include +#include #include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h" +#include "velox/type/Timestamp.h" using namespace facebook::velox; using namespace facebook::velox::exec; @@ -28,9 +31,9 @@ class SizeTest : public SparkFunctionBaseTest { std::function sizeAt = [](vector_size_t row) { return 1 + row % 7; }; - void testSize(VectorPtr vector, vector_size_t numRows) { - auto result = - evaluate>("size(c0)", makeRowVector({vector})); + void testLegacySizeOfNull(VectorPtr vector, vector_size_t numRows) { + auto result = evaluate>( + "size(c0, true)", makeRowVector({vector})); for (vector_size_t i = 0; i < numRows; ++i) { if (vector->isNullAt(i)) { EXPECT_EQ(result->valueAt(i), -1) << "at " << i; @@ -40,18 +43,19 @@ class SizeTest : public SparkFunctionBaseTest { } } - void testSizeLegacyNull(VectorPtr vector, vector_size_t numRows) { - auto result = - evaluate>("size(c0)", makeRowVector({vector})); + void testSize(VectorPtr vector, vector_size_t numRows) { + auto result = evaluate>( + "size(c0, false)", makeRowVector({vector})); for (vector_size_t i = 0; i < numRows; ++i) { EXPECT_EQ(result->isNullAt(i), vector->isNullAt(i)) << "at " << i; } } - void setConfig(std::string configStr, bool value) { - execCtx_.queryCtx()->testingOverrideConfigUnsafe({ - {configStr, std::to_string(value)}, - }); + template + int32_t testArraySize(const std::vector>& input) { + auto row = makeRowVector({makeNullableArrayVector( + std::vector>>{input})}); + return evaluateOnce("size(c0, false)", row).value(); } static inline vector_size_t valueAt(vector_size_t idx) { @@ -59,32 +63,73 @@ class SizeTest : public SparkFunctionBaseTest { } }; -TEST_F(SizeTest, sizetest) { +// Ensure that out is set to -1 for null input if legacySizeOfNull = true. +TEST_F(SizeTest, legacySizeOfNull) { vector_size_t numRows = 100; auto arrayVector = makeArrayVector(numRows, sizeAt, valueAt, nullptr); - testSize(arrayVector, numRows); + testLegacySizeOfNull(arrayVector, numRows); arrayVector = makeArrayVector(numRows, sizeAt, valueAt, nullEvery(5)); - testSize(arrayVector, numRows); + testLegacySizeOfNull(arrayVector, numRows); auto mapVector = makeMapVector( numRows, sizeAt, valueAt, valueAt, nullptr); - testSize(mapVector, numRows); + testLegacySizeOfNull(mapVector, numRows); mapVector = makeMapVector( numRows, sizeAt, valueAt, valueAt, nullEvery(5)); - testSize(mapVector, numRows); + testLegacySizeOfNull(mapVector, numRows); } -// Ensure that out if set to -1 if SparkLegacySizeOfNull is specified. -TEST_F(SizeTest, legacySizeOfNull) { +// Ensure that out is set to null for null input if legacySizeOfNull = false. +TEST_F(SizeTest, size) { vector_size_t numRows = 100; - setConfig(core::QueryConfig::kSparkLegacySizeOfNull, false); auto arrayVector = makeArrayVector(numRows, sizeAt, valueAt, nullEvery(1)); - testSizeLegacyNull(arrayVector, numRows); + testSize(arrayVector, numRows); auto mapVector = makeMapVector( numRows, sizeAt, valueAt, valueAt, nullEvery(1)); - testSizeLegacyNull(mapVector, numRows); + testSize(mapVector, numRows); +} + +TEST_F(SizeTest, boolean) { + EXPECT_EQ(testArraySize({true, false}), 2); + EXPECT_EQ(testArraySize({true}), 1); + EXPECT_EQ(testArraySize({}), 0); + EXPECT_EQ(testArraySize({true, false, true, std::nullopt}), 4); +} + +TEST_F(SizeTest, smallint) { + EXPECT_EQ(testArraySize({}), 0); + EXPECT_EQ(testArraySize({1}), 1); + EXPECT_EQ(testArraySize({std::nullopt}), 1); + EXPECT_EQ(testArraySize({std::nullopt, 1}), 2); +} + +TEST_F(SizeTest, real) { + EXPECT_EQ(testArraySize({}), 0); + EXPECT_EQ(testArraySize({1.1}), 1); + EXPECT_EQ(testArraySize({std::nullopt}), 1); + EXPECT_EQ(testArraySize({std::nullopt, 1.1}), 2); +} + +TEST_F(SizeTest, varchar) { + EXPECT_EQ(testArraySize({"red", "blue"}), 2); + EXPECT_EQ( + testArraySize({std::nullopt, "blue", "yellow", "orange"}), + 4); + EXPECT_EQ(testArraySize({}), 0); + EXPECT_EQ(testArraySize({std::nullopt}), 1); +} + +TEST_F(SizeTest, integer) { + EXPECT_EQ(testArraySize({1, 2}), 2); +} + +TEST_F(SizeTest, timestamp) { + auto ts = [](int64_t micros) { return Timestamp::fromMicros(micros); }; + EXPECT_EQ(testArraySize({}), 0); + EXPECT_EQ(testArraySize({std::nullopt}), 1); + EXPECT_EQ(testArraySize({ts(0), ts(1)}), 2); } } // namespace facebook::velox::functions::sparksql::test