From 86622a08e14c606d07cc757b03916897843ace7c Mon Sep 17 00:00:00 2001 From: "Ma, Rong" Date: Wed, 3 Apr 2024 13:42:20 +0800 Subject: [PATCH 1/5] support hash complex type --- velox/docs/functions/spark/binary.rst | 8 +- velox/functions/sparksql/Hash.cpp | 406 ++++++++++++++---- velox/functions/sparksql/tests/HashTest.cpp | 106 +++++ .../functions/sparksql/tests/XxHash64Test.cpp | 113 +++++ 4 files changed, 551 insertions(+), 82 deletions(-) diff --git a/velox/docs/functions/spark/binary.rst b/velox/docs/functions/spark/binary.rst index 4f76b29c1148..9ec544a8e810 100644 --- a/velox/docs/functions/spark/binary.rst +++ b/velox/docs/functions/spark/binary.rst @@ -11,7 +11,7 @@ Binary Functions Computes the hash of one or more input values using seed value of 42. For multiple arguments, their types can be different. Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR, - VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP. + VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW. .. spark:function:: hash_with_seed(seed, x, ...) -> integer @@ -19,21 +19,21 @@ Binary Functions Computes the hash of one or more input values using specified seed. For multiple arguments, their types can be different. Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR, - VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP. + VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW. .. spark:function:: xxhash64(x, ...) -> bigint Computes the xxhash64 of one or more input values using seed value of 42. For multiple arguments, their types can be different. Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR, - VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP. + VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW. .. spark:function:: xxhash64_with_seed(seed, x, ...) -> bigint Computes the xxhash64 of one or more input values using specified seed. For multiple arguments, their types can be different. 
Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR, - VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP. + VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW. .. spark:function:: md5(x) -> varbinary diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp index 8ad8b4e7f083..20e1284b68c4 100644 --- a/velox/functions/sparksql/Hash.cpp +++ b/velox/functions/sparksql/Hash.cpp @@ -26,21 +26,273 @@ namespace { const int32_t kDefaultSeed = 42; +struct Murmur3Hash; +struct XxHash64; + +// A template struct that contains the seed and return type of the hash +// function. +template +struct HashTraits {}; + +template <> +struct HashTraits { + using SeedType = int32_t; + using ReturnType = int32_t; +}; + +template <> +struct HashTraits { + using SeedType = int64_t; + using ReturnType = int64_t; +}; + +// Computes the hash value of input using the hash function in HashClass. +template +ReturnType hashOne(int32_t input, SeedType seed) { + return HashClass::hashInt32(input, seed); +} + +template +ReturnType hashOne(int64_t input, SeedType seed) { + return HashClass::hashInt64(input, seed); +} + +template +ReturnType hashOne(float input, SeedType seed) { + return HashClass::hashFloat(input, seed); +} + +template +ReturnType hashOne(double input, SeedType seed) { + return HashClass::hashDouble(input, seed); +} + +template +ReturnType hashOne(int128_t input, SeedType seed) { + return HashClass::hashLongDecimal(input, seed); +} + +template +ReturnType hashOne(Timestamp input, SeedType seed) { + return HashClass::hashTimestamp(input, seed); +} + +template +ReturnType hashOne(StringView input, SeedType seed) { + return HashClass::hashBytes(input, seed); +} + +// Class to compute hashes identical to one produced by Spark. +// Hashes are computed using the algorithm implemented in HashClass. 
+template < + typename HashClass, + typename SeedType = typename HashTraits::SeedType, + typename ReturnType = typename HashTraits::ReturnType> +class SparkVectorHasher { + public: + SparkVectorHasher(DecodedVector& decoded) : decoded_(decoded) {} + + virtual ~SparkVectorHasher() = default; + + // Compute the hash value of input vector at index. + ReturnType hashAt(vector_size_t index, SeedType seed) { + if (decoded_.isNullAt(index)) { + return seed; + } + return hashNotNullAt(index, seed); + } + + // Compute the hash value of input vector at index for non-null values. + virtual ReturnType hashNotNullAt(vector_size_t index, SeedType seed) = 0; + + protected: + const DecodedVector& decoded_; +}; + +template < + typename HashClass, + TypeKind kind, + typename SeedType = typename HashTraits::SeedType, + typename ReturnType = typename HashTraits::ReturnType> +class PrimitiveVectorHasher; + +template < + typename HashClass, + typename SeedType = typename HashTraits::SeedType, + typename ReturnType = typename HashTraits::ReturnType> +class ArrayVectorHasher; + +template < + typename HashClass, + typename SeedType = typename HashTraits::SeedType, + typename ReturnType = typename HashTraits::ReturnType> +class MapVectorHasher; + +template < + typename HashClass, + typename SeedType = typename HashTraits::SeedType, + typename ReturnType = typename HashTraits::ReturnType> +class RowVectorHasher; + +template +std::shared_ptr> createPrimitiveVectorHasher( + DecodedVector& decoded) { + return std::make_shared>(decoded); +} + +template +std::shared_ptr> createVectorHasher( + DecodedVector& decoded) { + switch (decoded.base()->typeKind()) { + case TypeKind::ARRAY: + return std::make_shared>(decoded); + case TypeKind::MAP: + return std::make_shared>(decoded); + case TypeKind::ROW: + return std::make_shared>(decoded); + default: + return VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH( + createPrimitiveVectorHasher, + HashClass, + decoded.base()->typeKind(), + decoded); + } +} + 
+template < + typename HashClass, + TypeKind kind, + typename SeedType, + typename ReturnType> +class PrimitiveVectorHasher : public SparkVectorHasher { + public: + PrimitiveVectorHasher(DecodedVector& decoded) + : SparkVectorHasher(decoded) {} + + ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { + return hashOne( + this->decoded_.template valueAt::NativeType>( + index), + seed); + } +}; + +template +class ArrayVectorHasher : public SparkVectorHasher { + public: + ArrayVectorHasher(DecodedVector& decoded) + : SparkVectorHasher(decoded) { + base_ = decoded.base()->as(); + indices_ = decoded.indices(); + + SelectivityVector rows(base_->elements()->size()); + decodedElements_.decode(*base_->elements(), rows); + elementHasher_ = createVectorHasher(decodedElements_); + } + + ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { + auto size = base_->sizeAt(indices_[index]); + auto offset = base_->offsetAt(indices_[index]); + + ReturnType result = seed; + for (auto i = 0; i < size; ++i) { + result = elementHasher_->hashAt(i + offset, result); + } + return result; + } + + private: + const ArrayVector* base_; + const int32_t* indices_; + DecodedVector decodedElements_; + std::shared_ptr> elementHasher_; +}; + +template +class MapVectorHasher : public SparkVectorHasher { + public: + MapVectorHasher(DecodedVector& decoded) + : SparkVectorHasher(decoded) { + base_ = decoded.base()->as(); + indices_ = decoded.indices(); + + SelectivityVector rows(base_->mapKeys()->size()); + decodedKeys_.decode(*base_->mapKeys(), rows); + decodedValues_.decode(*base_->mapValues(), rows); + keyHasher_ = createVectorHasher(decodedKeys_); + valueHasher_ = createVectorHasher(decodedValues_); + } + + ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { + auto size = base_->sizeAt(indices_[index]); + auto offset = base_->offsetAt(indices_[index]); + + ReturnType result = seed; + for (auto i = 0; i < size; ++i) { + result = 
keyHasher_->hashAt(i + offset, result); + result = valueHasher_->hashAt(i + offset, result); + } + return result; + } + + private: + const MapVector* base_; + const int32_t* indices_; + DecodedVector decodedKeys_; + DecodedVector decodedValues_; + std::shared_ptr> keyHasher_; + std::shared_ptr> valueHasher_; +}; + +template +class RowVectorHasher : public SparkVectorHasher { + public: + RowVectorHasher(DecodedVector& decoded) + : SparkVectorHasher(decoded) { + base_ = decoded.base()->as(); + indices_ = decoded.indices(); + + SelectivityVector rows(base_->size()); + decodedChildren_.resize(base_->childrenSize()); + hashers_.resize(base_->childrenSize()); + for (auto i = 0; i < base_->childrenSize(); ++i) { + decodedChildren_[i].decode(*base_->childAt(i), rows); + hashers_[i] = createVectorHasher(decodedChildren_[i]); + } + } + + ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { + ReturnType result = seed; + for (auto i = 0; i < base_->childrenSize(); ++i) { + result = hashers_[i]->hashAt(indices_[index], result); + } + return result; + } + + private: + const RowVector* base_; + const int32_t* indices_; + std::vector decodedChildren_; + std::vector>> hashers_; +}; + // ReturnType can be either int32_t or int64_t // HashClass contains the function like hashInt32 -template +template < + typename HashClass, + typename SeedType = typename HashTraits::SeedType, + typename ReturnType = typename HashTraits::ReturnType> void applyWithType( const SelectivityVector& rows, std::vector& args, // Not using const ref so we can reuse args std::optional seed, exec::EvalCtx& context, VectorPtr& resultRef) { - HashClass hash; size_t hashIdx = seed ? 1 : 0; SeedType hashSeed = seed ? 
*seed : kDefaultSeed; auto& result = *resultRef->as>(); - rows.applyToSelected([&](int row) { result.set(row, hashSeed); }); + rows.applyToSelected([&](auto row) { result.set(row, hashSeed); }); exec::LocalSelectivityVector selectedMinusNulls(context); @@ -54,36 +306,16 @@ void applyWithType( decoded->nulls(&rows), rows.begin(), rows.end()); selected = selectedMinusNulls.get(); } - switch (args[i]->type()->kind()) { -// Derived from InterpretedHashFunction.hash: -// https://github.com/apache/spark/blob/382b66e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L532 -#define CASE(typeEnum, hashFn, inputType) \ - case TypeKind::typeEnum: \ - selected->applyToSelected([&](int row) { \ - result.set( \ - row, hashFn(decoded->valueAt(row), result.valueAt(row))); \ - }); \ - break; - CASE(BOOLEAN, hash.hashInt32, bool); - CASE(TINYINT, hash.hashInt32, int8_t); - CASE(SMALLINT, hash.hashInt32, int16_t); - CASE(INTEGER, hash.hashInt32, int32_t); - CASE(BIGINT, hash.hashInt64, int64_t); - CASE(VARCHAR, hash.hashBytes, StringView); - CASE(VARBINARY, hash.hashBytes, StringView); - CASE(REAL, hash.hashFloat, float); - CASE(DOUBLE, hash.hashDouble, double); - CASE(HUGEINT, hash.hashLongDecimal, int128_t); - CASE(TIMESTAMP, hash.hashTimestamp, Timestamp); -#undef CASE - default: - VELOX_NYI( - "Unsupported type for HASH(): {}", args[i]->type()->toString()); - } + + auto hasher = createVectorHasher(*decoded); + selected->applyToSelected([&](auto row) { + result.set(row, hasher->hashNotNullAt(row, result.valueAt(row))); + }); } } -// Derived from src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java. +// Derived from +// src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java. // // Spark's Murmur3 seems slightly different from the original from Austin // Appleby: in particular the fmix function's first line is different. 
The @@ -95,13 +327,13 @@ void applyWithType( class Murmur3Hash final { public: - uint32_t hashInt32(int32_t input, uint32_t seed) { + static uint32_t hashInt32(int32_t input, uint32_t seed) { uint32_t k1 = mixK1(input); uint32_t h1 = mixH1(seed, k1); return fmix(h1, 4); } - uint32_t hashInt64(uint64_t input, uint32_t seed) { + static uint32_t hashInt64(uint64_t input, uint32_t seed) { uint32_t low = input; uint32_t high = input >> 32; @@ -116,19 +348,19 @@ class Murmur3Hash final { // Floating point numbers are hashed as if they are integers, with // -0f defined to have the same output as +0f. - uint32_t hashFloat(float input, uint32_t seed) { + static uint32_t hashFloat(float input, uint32_t seed) { return hashInt32( input == -0.f ? 0 : *reinterpret_cast(&input), seed); } - uint32_t hashDouble(double input, uint32_t seed) { + static uint32_t hashDouble(double input, uint32_t seed) { return hashInt64( input == -0. ? 0 : *reinterpret_cast(&input), seed); } // Spark also has an hashUnsafeBytes2 function, but it was not used at the // time of implementation. 
- uint32_t hashBytes(const StringView& input, uint32_t seed) { + static uint32_t hashBytes(const StringView& input, uint32_t seed) { const char* i = input.data(); const char* const end = input.data() + input.size(); uint32_t h1 = seed; @@ -141,25 +373,25 @@ class Murmur3Hash final { return fmix(h1, input.size()); } - uint32_t hashLongDecimal(int128_t input, uint32_t seed) { + static uint32_t hashLongDecimal(int128_t input, uint32_t seed) { char out[sizeof(int128_t)]; int32_t length = DecimalUtil::toByteArray(input, out); return hashBytes(StringView(out, length), seed); } - uint32_t hashTimestamp(Timestamp input, uint32_t seed) { + static uint32_t hashTimestamp(Timestamp input, uint32_t seed) { return hashInt64(input.toMicros(), seed); } private: - uint32_t mixK1(uint32_t k1) { + static uint32_t mixK1(uint32_t k1) { k1 *= 0xcc9e2d51; k1 = bits::rotateLeft(k1, 15); k1 *= 0x1b873593; return k1; } - uint32_t mixH1(uint32_t h1, uint32_t k1) { + static uint32_t mixH1(uint32_t h1, uint32_t k1) { h1 ^= k1; h1 = bits::rotateLeft(h1, 13); h1 = h1 * 5 + 0xe6546b64; @@ -167,7 +399,7 @@ class Murmur3Hash final { } // Finalization mix - force all bits of a hash block to avalanche - uint32_t fmix(uint32_t h1, uint32_t length) { + static uint32_t fmix(uint32_t h1, uint32_t length) { h1 ^= length; h1 ^= h1 >> 16; h1 *= 0x85ebca6b; @@ -190,7 +422,7 @@ class Murmur3HashFunction final : public exec::VectorFunction { exec::EvalCtx& context, VectorPtr& resultRef) const final { context.ensureWritable(rows, INTEGER(), resultRef); - applyWithType(rows, args, seed_, context, resultRef); + applyWithType(rows, args, seed_, context, resultRef); } private: @@ -198,21 +430,15 @@ class Murmur3HashFunction final : public exec::VectorFunction { }; class XxHash64 final { - const uint64_t PRIME64_1 = 0x9E3779B185EBCA87L; - const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FL; - const uint64_t PRIME64_3 = 0x165667B19E3779F9L; - const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63L; - const uint64_t PRIME64_5 = 
0x27D4EB2F165667C5L; - public: - int64_t hashInt32(const int32_t input, uint64_t seed) { + static uint64_t hashInt32(const int32_t input, uint64_t seed) { int64_t hash = seed + PRIME64_5 + 4L; hash ^= static_cast((input & 0xFFFFFFFFL) * PRIME64_1); hash = bits::rotateLeft64(hash, 23) * PRIME64_2 + PRIME64_3; return fmix(hash); } - int64_t hashInt64(int64_t input, uint64_t seed) { + static uint64_t hashInt64(int64_t input, uint64_t seed) { int64_t hash = seed + PRIME64_5 + 8L; hash ^= bits::rotateLeft64(input * PRIME64_2, 31) * PRIME64_1; hash = bits::rotateLeft64(hash, 27) * PRIME64_1 + PRIME64_4; @@ -221,17 +447,17 @@ class XxHash64 final { // Floating point numbers are hashed as if they are integers, with // -0f defined to have the same output as +0f. - int64_t hashFloat(float input, uint64_t seed) { + static uint64_t hashFloat(float input, uint64_t seed) { return hashInt32( input == -0.f ? 0 : *reinterpret_cast(&input), seed); } - int64_t hashDouble(double input, uint64_t seed) { + static uint64_t hashDouble(double input, uint64_t seed) { return hashInt64( input == -0. ? 
0 : *reinterpret_cast(&input), seed); } - uint64_t hashBytes(const StringView& input, uint64_t seed) { + static uint64_t hashBytes(const StringView& input, uint64_t seed) { const char* i = input.data(); const char* const end = input.data() + input.size(); @@ -253,18 +479,24 @@ class XxHash64 final { return fmix(hash); } - int64_t hashLongDecimal(int128_t input, uint32_t seed) { + static uint64_t hashLongDecimal(int128_t input, uint64_t seed) { char out[sizeof(int128_t)]; int32_t length = DecimalUtil::toByteArray(input, out); return hashBytes(StringView(out, length), seed); } - int64_t hashTimestamp(Timestamp input, uint32_t seed) { + static uint64_t hashTimestamp(Timestamp input, uint64_t seed) { return hashInt64(input.toMicros(), seed); } private: - uint64_t fmix(uint64_t hash) { + static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87L; + static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FL; + static const uint64_t PRIME64_3 = 0x165667B19E3779F9L; + static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63L; + static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5L; + + static uint64_t fmix(uint64_t hash) { hash ^= hash >> 33; hash *= PRIME64_2; hash ^= hash >> 29; @@ -273,7 +505,7 @@ class XxHash64 final { return hash; } - uint64_t hashBytesByWords(const StringView& input, uint64_t seed) { + static uint64_t hashBytesByWords(const StringView& input, uint64_t seed) { const char* i = input.data(); const char* const end = input.data() + input.size(); uint32_t length = input.size(); @@ -353,13 +585,52 @@ class XxHash64Function final : public exec::VectorFunction { exec::EvalCtx& context, VectorPtr& resultRef) const final { context.ensureWritable(rows, BIGINT(), resultRef); - applyWithType(rows, args, seed_, context, resultRef); + applyWithType(rows, args, seed_, context, resultRef); } private: const std::optional seed_; }; +bool checkHashElementType(const TypePtr& type) { + switch (type->kind()) { + case TypeKind::BOOLEAN: + case TypeKind::TINYINT: + case TypeKind::SMALLINT: 
+ case TypeKind::INTEGER: + case TypeKind::BIGINT: + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + case TypeKind::REAL: + case TypeKind::DOUBLE: + case TypeKind::HUGEINT: + case TypeKind::TIMESTAMP: + return true; + case TypeKind::ARRAY: + return checkHashElementType(type->asArray().elementType()); + case TypeKind::MAP: + return checkHashElementType(type->asMap().keyType()) && + checkHashElementType(type->asMap().valueType()); + case TypeKind::ROW: { + const auto& children = type->asRow().children(); + return std::all_of( + children.begin(), children.end(), [](const auto& child) { + return checkHashElementType(child); + }); + } + default: + return false; + } +} + +void checkArgTypes(const std::vector& args) { + for (const auto& arg : args) { + if (!checkHashElementType(arg.type)) { + VELOX_USER_FAIL("Unsupported type for hash: {}", arg.type->toString()) + } + } +} + } // namespace // Not all types are supported by now. Check types when making hash function. @@ -372,27 +643,6 @@ std::vector> hashSignatures() { .build()}; } -void checkArgTypes(const std::vector& args) { - for (const auto& arg : args) { - switch (arg.type->kind()) { - case TypeKind::BOOLEAN: - case TypeKind::TINYINT: - case TypeKind::SMALLINT: - case TypeKind::INTEGER: - case TypeKind::BIGINT: - case TypeKind::VARCHAR: - case TypeKind::VARBINARY: - case TypeKind::REAL: - case TypeKind::DOUBLE: - case TypeKind::HUGEINT: - case TypeKind::TIMESTAMP: - break; - default: - VELOX_USER_FAIL("Unsupported type for hash: {}", arg.type->toString()) - } - } -} - std::shared_ptr makeHash( const std::string& name, const std::vector& inputArgs, diff --git a/velox/functions/sparksql/tests/HashTest.cpp b/velox/functions/sparksql/tests/HashTest.cpp index 422d63643e56..1c6569dc15bf 100644 --- a/velox/functions/sparksql/tests/HashTest.cpp +++ b/velox/functions/sparksql/tests/HashTest.cpp @@ -18,6 +18,8 @@ #include +using facebook::velox::test::assertEqualVectors; + namespace 
facebook::velox::functions::sparksql::test { namespace { @@ -27,6 +29,10 @@ class HashTest : public SparkFunctionBaseTest { std::optional hash(std::optional arg) { return evaluateOnce("hash(c0)", arg); } + + VectorPtr hash(VectorPtr vector) { + return evaluate("hash(c0)", makeRowVector({vector})); + } }; TEST_F(HashTest, String) { @@ -128,5 +134,105 @@ TEST_F(HashTest, Float) { EXPECT_EQ(hash(-limits::infinity()), 427440766); } +TEST_F(HashTest, Array) { + assertEqualVectors( + makeFlatVector({2101165938, 42, 1045631400}), + hash(makeArrayVector({{1, 2, 3, 4, 5}, {}, {1, 2, 3}}))); + + assertEqualVectors( + makeFlatVector({-559580957, 1765031574, 42}), + hash(makeNullableArrayVector( + {{1, std::nullopt}, {std::nullopt, 2}, {std::nullopt}}))); + + // Nested array. + { + using innerArrayType = std::vector>; + using outerArrayType = + std::vector>>>; + + innerArrayType a{1, std::nullopt, 2, 3}; + innerArrayType b{4, 5}; + innerArrayType c{6, 7, 8}; + outerArrayType row1{{a}, {b}}; + outerArrayType row2{{a}, {c}}; + outerArrayType row3{{{}}}; + outerArrayType row4{{{std::nullopt}}}; + auto arrayVector = makeNullableNestedArrayVector( + {{row1}, {row2}, {row3}, {row4}, std::nullopt}); + assertEqualVectors( + makeFlatVector({2101165938, -992561130, 42, 42, 42}), + hash(arrayVector)); + } + + // Array of map. + { + using S = StringView; + using P = std::pair>; + std::vector

<P> a{P{1, S{"a"}}, P{2, std::nullopt}}; + std::vector<P>
b{P{3, S{"c"}}}; + std::vector>> data = {{a, b}}; + auto arrayVector = makeArrayOfMapVector(data); + assertEqualVectors( + makeFlatVector(std::vector{-718462205}), + hash(arrayVector)); + } + + // Array of row. + { + std::vector>>> + data = { + {{{1, "red"}}, {{2, "blue"}}, {{3, "green"}}}, + {{{1, "red"}}, std::nullopt, {{3, "green"}}}, + {std::nullopt}, + }; + auto arrayVector = makeArrayOfRowVector(data, ROW({INTEGER(), VARCHAR()})); + assertEqualVectors( + makeFlatVector({-1458343314, 551500425, 42}), + hash(arrayVector)); + } +} + +TEST_F(HashTest, Map) { + auto mapVector = makeMapVector( + {{{1, 17.0}, {2, 36.0}, {3, 8.0}, {4, 28.0}, {5, 24.0}, {6, 32.0}}}); + assertEqualVectors( + makeFlatVector(std::vector{1263683448}), + hash(mapVector)); + + auto mapOfArrays = createMapOfArraysVector( + {{{1, {{1, 2, 3}}}}, {{2, {{4, 5, 6}}}}, {{3, {{7, 8, 9}}}}}); + assertEqualVectors( + makeFlatVector({-1818148947, 529298908, 825098912}), + hash(mapOfArrays)); + + auto mapWithNullArrays = createMapOfArraysVector( + {{{1, std::nullopt}}, {{2, {{4, 5, std::nullopt}}}}, {{3, {{}}}}}); + assertEqualVectors( + makeFlatVector({-1712319331, 2060637564, 519220707}), + hash(mapWithNullArrays)); +} + +TEST_F(HashTest, Row) { + auto row = makeRowVector({ + makeFlatVector({1, 3}), + makeFlatVector({2, 4}), + }); + assertEqualVectors( + makeFlatVector({-1181176833, 1717636039}), hash(row)); + + row = makeRowVector({ + makeNullableFlatVector({1, std::nullopt}), + makeNullableFlatVector({std::nullopt, 4}), + }); + assertEqualVectors( + makeFlatVector({-1712319331, 1344313940}), hash(row)); + + row->setNull(0, true); + assertEqualVectors(makeFlatVector({42, 1344313940}), hash(row)); + + row->setNull(1, true); + assertEqualVectors(makeFlatVector({42, 42}), hash(row)); +} + } // namespace } // namespace facebook::velox::functions::sparksql::test diff --git a/velox/functions/sparksql/tests/XxHash64Test.cpp b/velox/functions/sparksql/tests/XxHash64Test.cpp index 09162f4a0279..d1508f3681fb 
100644 --- a/velox/functions/sparksql/tests/XxHash64Test.cpp +++ b/velox/functions/sparksql/tests/XxHash64Test.cpp @@ -18,6 +18,8 @@ #include +using facebook::velox::test::assertEqualVectors; + namespace facebook::velox::functions::sparksql::test { namespace { class XxHash64Test : public SparkFunctionBaseTest { @@ -26,6 +28,10 @@ class XxHash64Test : public SparkFunctionBaseTest { std::optional xxhash64(std::optional arg) { return evaluateOnce("xxhash64(c0)", arg); } + + VectorPtr xxhash64(VectorPtr vector) { + return evaluate("xxhash64(c0)", makeRowVector({vector})); + } }; // The expected result was obtained by running SELECT xxhash64("Spark") query @@ -138,6 +144,113 @@ TEST_F(XxHash64Test, float) { EXPECT_EQ(xxhash64(-limits::infinity()), -7580553461823983095); } +TEST_F(XxHash64Test, array) { + assertEqualVectors( + makeFlatVector({-6041664978295882827, 42, 4904562767517797033}), + xxhash64(makeArrayVector({{1, 2, 3, 4, 5}, {}, {1, 2, 3}}))); + + assertEqualVectors( + makeFlatVector({-6698625589789238999, 8420071140774656230, 42}), + xxhash64(makeNullableArrayVector( + {{1, std::nullopt}, {std::nullopt, 2}, {std::nullopt}}))); + + // Nested array. + { + using innerArrayType = std::vector>; + using outerArrayType = + std::vector>>>; + + innerArrayType a{1, std::nullopt, 2, 3}; + innerArrayType b{4, 5}; + innerArrayType c{6, 7, 8}; + outerArrayType row1{{a}, {b}}; + outerArrayType row2{{a}, {c}}; + outerArrayType row3{{{}}}; + outerArrayType row4{{{std::nullopt}}}; + auto arrayVector = makeNullableNestedArrayVector( + {{row1}, {row2}, {row3}, {row4}, std::nullopt}); + assertEqualVectors( + makeFlatVector( + {-6041664978295882827, -1052942565807509112, 42, 42, 42}), + xxhash64(arrayVector)); + } + + // Array of map. + { + using S = StringView; + using P = std::pair>; + std::vector

<P> a{P{1, S{"a"}}, P{2, std::nullopt}}; + std::vector<P>
b{P{3, S{"c"}}}; + std::vector>> data = {{a, b}}; + auto arrayVector = makeArrayOfMapVector(data); + assertEqualVectors( + makeFlatVector(std::vector{2880747995994395223}), + xxhash64(arrayVector)); + } + + // Array of row. + { + std::vector>>> + data = { + {{{1, "red"}}, {{2, "blue"}}, {{3, "green"}}}, + {{{1, "red"}}, std::nullopt, {{3, "green"}}}, + {std::nullopt}, + }; + auto arrayVector = makeArrayOfRowVector(data, ROW({INTEGER(), VARCHAR()})); + assertEqualVectors( + makeFlatVector( + {-4096178443626566478, -8973283971856715104, 42}), + xxhash64(arrayVector)); + } +} + +TEST_F(XxHash64Test, map) { + auto mapVector = makeMapVector( + {{{1, 17.0}, {2, 36.0}, {3, 8.0}, {4, 28.0}, {5, 24.0}, {6, 32.0}}}); + assertEqualVectors( + makeFlatVector(std::vector{-6303587702533348160}), + xxhash64(mapVector)); + + auto mapOfArrays = createMapOfArraysVector( + {{{1, {{1, 2, 3}}}}, {{2, {{4, 5, 6}}}}, {{3, {{7, 8, 9}}}}}); + assertEqualVectors( + makeFlatVector( + {-2103781794412908874, 1112887818746642853, 5787852566364222439}), + xxhash64(mapOfArrays)); + + auto mapWithNullArrays = createMapOfArraysVector( + {{{1, std::nullopt}}, {{2, {{4, 5, std::nullopt}}}}, {{3, {{}}}}}); + assertEqualVectors( + makeFlatVector( + {-7001672635703045582, 7217681953522744649, 3188756510806108107}), + xxhash64(mapWithNullArrays)); +} + +TEST_F(XxHash64Test, row) { + auto row = makeRowVector({ + makeFlatVector({1, 3}), + makeFlatVector({2, 4}), + }); + assertEqualVectors( + makeFlatVector({-8198029865082835910, 351067884137457704}), + xxhash64(row)); + + row = makeRowVector({ + makeNullableFlatVector({1, std::nullopt}), + makeNullableFlatVector({std::nullopt, 4}), + }); + assertEqualVectors( + makeFlatVector({-7001672635703045582, 404280023041566627}), + xxhash64(row)); + + row->setNull(0, true); + assertEqualVectors( + makeFlatVector({42, 404280023041566627}), xxhash64(row)); + + row->setNull(1, true); + assertEqualVectors(makeFlatVector({42, 42}), xxhash64(row)); +} + 
TEST_F(XxHash64Test, hashSeed) { auto xxhash64WithSeed = [&](int64_t seed, const std::optional& arg) { return evaluateOnce( From 86a2d26e4376966295bf615c0c491b8d5e51b863 Mon Sep 17 00:00:00 2001 From: "Ma, Rong" Date: Fri, 26 Apr 2024 11:37:17 +0800 Subject: [PATCH 2/5] remove virtual function call --- velox/functions/sparksql/Hash.cpp | 175 ++++++++++++++++-------------- 1 file changed, 96 insertions(+), 79 deletions(-) diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp index 20e1284b68c4..a7ec95e2aa96 100644 --- a/velox/functions/sparksql/Hash.cpp +++ b/velox/functions/sparksql/Hash.cpp @@ -26,70 +26,76 @@ namespace { const int32_t kDefaultSeed = 42; -struct Murmur3Hash; -struct XxHash64; - -// A template struct that contains the seed and return type of the hash -// function. -template -struct HashTraits {}; - -template <> -struct HashTraits { - using SeedType = int32_t; - using ReturnType = int32_t; -}; - -template <> -struct HashTraits { - using SeedType = int64_t; - using ReturnType = int64_t; -}; - // Computes the hash value of input using the hash function in HashClass. 
-template -ReturnType hashOne(int32_t input, SeedType seed) { +template +typename HashClass::ReturnType hashOne( + int32_t input, + typename HashClass::SeedType seed) { return HashClass::hashInt32(input, seed); } -template -ReturnType hashOne(int64_t input, SeedType seed) { +template +typename HashClass::ReturnType hashOne( + int64_t input, + typename HashClass::SeedType seed) { return HashClass::hashInt64(input, seed); } -template -ReturnType hashOne(float input, SeedType seed) { +template +typename HashClass::ReturnType hashOne( + float input, + typename HashClass::SeedType seed) { return HashClass::hashFloat(input, seed); } -template -ReturnType hashOne(double input, SeedType seed) { +template +typename HashClass::ReturnType hashOne( + double input, + typename HashClass::SeedType seed) { return HashClass::hashDouble(input, seed); } -template -ReturnType hashOne(int128_t input, SeedType seed) { +template +typename HashClass::ReturnType hashOne( + int128_t input, + typename HashClass::SeedType seed) { return HashClass::hashLongDecimal(input, seed); } -template -ReturnType hashOne(Timestamp input, SeedType seed) { +template +typename HashClass::ReturnType hashOne( + Timestamp input, + typename HashClass::SeedType seed) { return HashClass::hashTimestamp(input, seed); } -template -ReturnType hashOne(StringView input, SeedType seed) { +template +typename HashClass::ReturnType hashOne( + StringView input, + typename HashClass::SeedType seed) { return HashClass::hashBytes(input, seed); } +template +class PrimitiveVectorHasher; + +template +class ArrayVectorHasher; + +template +class MapVectorHasher; + +template +class RowVectorHasher; + // Class to compute hashes identical to one produced by Spark. // Hashes are computed using the algorithm implemented in HashClass. 
-template < - typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> +template class SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + SparkVectorHasher(DecodedVector& decoded) : decoded_(decoded) {} virtual ~SparkVectorHasher() = default; @@ -103,36 +109,33 @@ class SparkVectorHasher { } // Compute the hash value of input vector at index for non-null values. - virtual ReturnType hashNotNullAt(vector_size_t index, SeedType seed) = 0; + ReturnType hashNotNullAt(vector_size_t index, SeedType seed) { + switch (decoded_.base()->typeKind()) { + case TypeKind::ARRAY: + return static_cast*>(this)->hashValueAt( + index, seed); + case TypeKind::MAP: + return static_cast*>(this)->hashValueAt( + index, seed); + case TypeKind::ROW: + return static_cast*>(this)->hashValueAt( + index, seed); + default: + return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( + hashPrimitive, decoded_.base()->typeKind(), index, seed); + } + } protected: const DecodedVector& decoded_; -}; - -template < - typename HashClass, - TypeKind kind, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> -class PrimitiveVectorHasher; - -template < - typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> -class ArrayVectorHasher; -template < - typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> -class MapVectorHasher; - -template < - typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> -class RowVectorHasher; + private: + template + ReturnType hashPrimitive(vector_size_t index, SeedType seed) { + return static_cast*>(this) + ->hashValueAt(index, seed); + } +}; template std::shared_ptr> 
createPrimitiveVectorHasher( @@ -159,27 +162,29 @@ std::shared_ptr> createVectorHasher( } } -template < - typename HashClass, - TypeKind kind, - typename SeedType, - typename ReturnType> +template class PrimitiveVectorHasher : public SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + PrimitiveVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) {} - ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { - return hashOne( + ReturnType hashValueAt(vector_size_t index, SeedType seed) { + return hashOne( this->decoded_.template valueAt::NativeType>( index), seed); } }; -template +template class ArrayVectorHasher : public SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + ArrayVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) { base_ = decoded.base()->as(); @@ -190,7 +195,7 @@ class ArrayVectorHasher : public SparkVectorHasher { elementHasher_ = createVectorHasher(decodedElements_); } - ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { + ReturnType hashValueAt(vector_size_t index, SeedType seed) { auto size = base_->sizeAt(indices_[index]); auto offset = base_->offsetAt(indices_[index]); @@ -208,9 +213,12 @@ class ArrayVectorHasher : public SparkVectorHasher { std::shared_ptr> elementHasher_; }; -template +template class MapVectorHasher : public SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + MapVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) { base_ = decoded.base()->as(); @@ -223,7 +231,7 @@ class MapVectorHasher : public SparkVectorHasher { valueHasher_ = createVectorHasher(decodedValues_); } - ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { + ReturnType hashValueAt(vector_size_t index, SeedType seed) { auto size = 
base_->sizeAt(indices_[index]); auto offset = base_->offsetAt(indices_[index]); @@ -244,9 +252,12 @@ class MapVectorHasher : public SparkVectorHasher { std::shared_ptr> valueHasher_; }; -template +template class RowVectorHasher : public SparkVectorHasher { public: + using SeedType = typename HashClass::SeedType; + using ReturnType = typename HashClass::ReturnType; + RowVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) { base_ = decoded.base()->as(); @@ -261,7 +272,7 @@ class RowVectorHasher : public SparkVectorHasher { } } - ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { + ReturnType hashValueAt(vector_size_t index, SeedType seed) { ReturnType result = seed; for (auto i = 0; i < base_->childrenSize(); ++i) { result = hashers_[i]->hashAt(indices_[index], result); @@ -280,8 +291,8 @@ class RowVectorHasher : public SparkVectorHasher { // HashClass contains the function like hashInt32 template < typename HashClass, - typename SeedType = typename HashTraits::SeedType, - typename ReturnType = typename HashTraits::ReturnType> + typename SeedType = typename HashClass::SeedType, + typename ReturnType = typename HashClass::ReturnType> void applyWithType( const SelectivityVector& rows, std::vector& args, // Not using const ref so we can reuse args @@ -327,6 +338,9 @@ void applyWithType( class Murmur3Hash final { public: + using SeedType = int32_t; + using ReturnType = int32_t; + static uint32_t hashInt32(int32_t input, uint32_t seed) { uint32_t k1 = mixK1(input); uint32_t h1 = mixH1(seed, k1); @@ -431,6 +445,9 @@ class Murmur3HashFunction final : public exec::VectorFunction { class XxHash64 final { public: + using SeedType = int64_t; + using ReturnType = int64_t; + static uint64_t hashInt32(const int32_t input, uint64_t seed) { int64_t hash = seed + PRIME64_5 + 4L; hash ^= static_cast((input & 0xFFFFFFFFL) * PRIME64_1); From e6a49d145225e41364a71906118b68c456dac919 Mon Sep 17 00:00:00 2001 From: "Ma, Rong" Date: Wed, 1 May 2024 
11:41:52 +0800 Subject: [PATCH 3/5] address comments --- velox/docs/functions/spark/binary.rst | 9 ------- velox/functions/sparksql/tests/HashTest.cpp | 25 +++++++------------ .../functions/sparksql/tests/XxHash64Test.cpp | 19 +++++--------- 3 files changed, 15 insertions(+), 38 deletions(-) diff --git a/velox/docs/functions/spark/binary.rst b/velox/docs/functions/spark/binary.rst index 9ec544a8e810..249c7f4ee24f 100644 --- a/velox/docs/functions/spark/binary.rst +++ b/velox/docs/functions/spark/binary.rst @@ -10,30 +10,21 @@ Binary Functions Computes the hash of one or more input values using seed value of 42. For multiple arguments, their types can be different. - Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR, - VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW. - .. spark:function:: hash_with_seed(seed, x, ...) -> integer Computes the hash of one or more input values using specified seed. For multiple arguments, their types can be different. - Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR, - VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW. .. spark:function:: xxhash64(x, ...) -> bigint Computes the xxhash64 of one or more input values using seed value of 42. For multiple arguments, their types can be different. - Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR, - VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW. .. spark:function:: xxhash64_with_seed(seed, x, ...) -> bigint Computes the xxhash64 of one or more input values using specified seed. For multiple arguments, their types can be different. - Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR, - VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW. .. 
spark:function:: md5(x) -> varbinary diff --git a/velox/functions/sparksql/tests/HashTest.cpp b/velox/functions/sparksql/tests/HashTest.cpp index 1c6569dc15bf..58a9d4565c0d 100644 --- a/velox/functions/sparksql/tests/HashTest.cpp +++ b/velox/functions/sparksql/tests/HashTest.cpp @@ -134,7 +134,7 @@ TEST_F(HashTest, Float) { EXPECT_EQ(hash(-limits::infinity()), 427440766); } -TEST_F(HashTest, Array) { +TEST_F(HashTest, array) { assertEqualVectors( makeFlatVector({2101165938, 42, 1045631400}), hash(makeArrayVector({{1, 2, 3, 4, 5}, {}, {1, 2, 3}}))); @@ -146,19 +146,12 @@ TEST_F(HashTest, Array) { // Nested array. { - using innerArrayType = std::vector>; - using outerArrayType = - std::vector>>>; - - innerArrayType a{1, std::nullopt, 2, 3}; - innerArrayType b{4, 5}; - innerArrayType c{6, 7, 8}; - outerArrayType row1{{a}, {b}}; - outerArrayType row2{{a}, {c}}; - outerArrayType row3{{{}}}; - outerArrayType row4{{{std::nullopt}}}; - auto arrayVector = makeNullableNestedArrayVector( - {{row1}, {row2}, {row3}, {row4}, std::nullopt}); + auto arrayVector = makeNestedArrayVectorFromJson( + {"[[1, null, 2, 3], [4, 5]]", + "[[1, null, 2, 3], [6, 7, 8]]", + "[[]]", + "[[null]]", + "[null]"}); assertEqualVectors( makeFlatVector({2101165938, -992561130, 42, 42, 42}), hash(arrayVector)); @@ -192,7 +185,7 @@ TEST_F(HashTest, Array) { } } -TEST_F(HashTest, Map) { +TEST_F(HashTest, map) { auto mapVector = makeMapVector( {{{1, 17.0}, {2, 36.0}, {3, 8.0}, {4, 28.0}, {5, 24.0}, {6, 32.0}}}); assertEqualVectors( @@ -212,7 +205,7 @@ TEST_F(HashTest, Map) { hash(mapWithNullArrays)); } -TEST_F(HashTest, Row) { +TEST_F(HashTest, row) { auto row = makeRowVector({ makeFlatVector({1, 3}), makeFlatVector({2, 4}), diff --git a/velox/functions/sparksql/tests/XxHash64Test.cpp b/velox/functions/sparksql/tests/XxHash64Test.cpp index d1508f3681fb..6e086ffd918f 100644 --- a/velox/functions/sparksql/tests/XxHash64Test.cpp +++ b/velox/functions/sparksql/tests/XxHash64Test.cpp @@ -156,19 +156,12 @@ 
TEST_F(XxHash64Test, array) { // Nested array. { - using innerArrayType = std::vector>; - using outerArrayType = - std::vector>>>; - - innerArrayType a{1, std::nullopt, 2, 3}; - innerArrayType b{4, 5}; - innerArrayType c{6, 7, 8}; - outerArrayType row1{{a}, {b}}; - outerArrayType row2{{a}, {c}}; - outerArrayType row3{{{}}}; - outerArrayType row4{{{std::nullopt}}}; - auto arrayVector = makeNullableNestedArrayVector( - {{row1}, {row2}, {row3}, {row4}, std::nullopt}); + auto arrayVector = makeNestedArrayVectorFromJson( + {"[[1, null, 2, 3], [4, 5]]", + "[[1, null, 2, 3], [6, 7, 8]]", + "[[]]", + "[[null]]", + "[null]"}); assertEqualVectors( makeFlatVector( {-6041664978295882827, -1052942565807509112, 42, 42, 42}), From e2f4423455308c71b8c56887ac3dd35578d665da Mon Sep 17 00:00:00 2001 From: "Ma, Rong" Date: Thu, 9 May 2024 16:48:20 +0800 Subject: [PATCH 4/5] add benchmark --- .../sparksql/benchmarks/CMakeLists.txt | 4 ++ .../sparksql/benchmarks/CompareBenchmark.cpp | 1 + .../sparksql/benchmarks/HashBenchmark.cpp | 53 +++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100644 velox/functions/sparksql/benchmarks/HashBenchmark.cpp diff --git a/velox/functions/sparksql/benchmarks/CMakeLists.txt b/velox/functions/sparksql/benchmarks/CMakeLists.txt index 5ccf7e88745a..837c983b7df8 100644 --- a/velox/functions/sparksql/benchmarks/CMakeLists.txt +++ b/velox/functions/sparksql/benchmarks/CMakeLists.txt @@ -27,3 +27,7 @@ target_link_libraries( add_executable(velox_sparksql_benchmarks_compare CompareBenchmark.cpp) target_link_libraries(velox_sparksql_benchmarks_compare velox_functions_spark velox_benchmark_builder velox_vector_test_lib) + +add_executable(velox_sparksql_benchmarks_hash HashBenchmark.cpp) +target_link_libraries(velox_sparksql_benchmarks_hash velox_functions_spark + velox_benchmark_builder velox_vector_test_lib) diff --git a/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp b/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp index 
19b6f1262512..e4d3c9f3f608 100644 --- a/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp +++ b/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp @@ -26,6 +26,7 @@ using namespace facebook::velox; int main(int argc, char** argv) { folly::Init init(&argc, &argv); + memory::MemoryManager::initialize({}); functions::sparksql::registerFunctions(""); ExpressionBenchmarkBuilder benchmarkBuilder; diff --git a/velox/functions/sparksql/benchmarks/HashBenchmark.cpp b/velox/functions/sparksql/benchmarks/HashBenchmark.cpp new file mode 100644 index 000000000000..97cf592e3b81 --- /dev/null +++ b/velox/functions/sparksql/benchmarks/HashBenchmark.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "velox/benchmarks/ExpressionBenchmarkBuilder.h" +#include "velox/functions/sparksql/Register.h" + +using namespace facebook; + +using namespace facebook::velox; + +int main(int argc, char** argv) { + folly::Init init(&argc, &argv); + memory::MemoryManager::initialize({}); + functions::sparksql::registerFunctions(""); + + ExpressionBenchmarkBuilder benchmarkBuilder; + + std::vector inputTypes = { + ARRAY(MAP(INTEGER(), VARCHAR())), + ROW({"f_map", "f_array"}, {MAP(INTEGER(), VARCHAR()), ARRAY(INTEGER())}), + }; + + for (auto& inputType : inputTypes) { + benchmarkBuilder + .addBenchmarkSet( + fmt::format("hash_{}", inputType->toString()), + ROW({"c0"}, {inputType})) + .withFuzzerOptions({.vectorSize = 1000, .nullRatio = 0.1}) + .addExpression("hash", "hash(c0)") + .addExpression("xxhash64", "xxhash64(c0)") + .withIterations(100); + } + + benchmarkBuilder.registerBenchmarks(); + folly::runBenchmarks(); + return 0; +} From 6a270b3c8ffc088b48f6110008215f822e8af5a9 Mon Sep 17 00:00:00 2001 From: "Ma, Rong" Date: Sat, 11 May 2024 10:39:26 +0800 Subject: [PATCH 5/5] revert switch --- velox/functions/sparksql/Hash.cpp | 32 +++++-------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp index a7ec95e2aa96..d20cf5d4c8b5 100644 --- a/velox/functions/sparksql/Hash.cpp +++ b/velox/functions/sparksql/Hash.cpp @@ -109,32 +109,10 @@ class SparkVectorHasher { } // Compute the hash value of input vector at index for non-null values. 
- ReturnType hashNotNullAt(vector_size_t index, SeedType seed) { - switch (decoded_.base()->typeKind()) { - case TypeKind::ARRAY: - return static_cast*>(this)->hashValueAt( - index, seed); - case TypeKind::MAP: - return static_cast*>(this)->hashValueAt( - index, seed); - case TypeKind::ROW: - return static_cast*>(this)->hashValueAt( - index, seed); - default: - return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( - hashPrimitive, decoded_.base()->typeKind(), index, seed); - } - } + virtual ReturnType hashNotNullAt(vector_size_t index, SeedType seed) = 0; protected: const DecodedVector& decoded_; - - private: - template - ReturnType hashPrimitive(vector_size_t index, SeedType seed) { - return static_cast*>(this) - ->hashValueAt(index, seed); - } }; template @@ -171,7 +149,7 @@ class PrimitiveVectorHasher : public SparkVectorHasher { PrimitiveVectorHasher(DecodedVector& decoded) : SparkVectorHasher(decoded) {} - ReturnType hashValueAt(vector_size_t index, SeedType seed) { + ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { return hashOne( this->decoded_.template valueAt::NativeType>( index), @@ -195,7 +173,7 @@ class ArrayVectorHasher : public SparkVectorHasher { elementHasher_ = createVectorHasher(decodedElements_); } - ReturnType hashValueAt(vector_size_t index, SeedType seed) { + ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { auto size = base_->sizeAt(indices_[index]); auto offset = base_->offsetAt(indices_[index]); @@ -231,7 +209,7 @@ class MapVectorHasher : public SparkVectorHasher { valueHasher_ = createVectorHasher(decodedValues_); } - ReturnType hashValueAt(vector_size_t index, SeedType seed) { + ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override { auto size = base_->sizeAt(indices_[index]); auto offset = base_->offsetAt(indices_[index]); @@ -272,7 +250,7 @@ class RowVectorHasher : public SparkVectorHasher { } } - ReturnType hashValueAt(vector_size_t index, SeedType seed) { + ReturnType 
hashNotNullAt(vector_size_t index, SeedType seed) override { ReturnType result = seed; for (auto i = 0; i < base_->childrenSize(); ++i) { result = hashers_[i]->hashAt(indices_[index], result);