From 86622a08e14c606d07cc757b03916897843ace7c Mon Sep 17 00:00:00 2001
From: "Ma, Rong" <rong.ma@intel.com>
Date: Wed, 3 Apr 2024 13:42:20 +0800
Subject: [PATCH 1/5] Support hashing of complex types (ARRAY, MAP, ROW)

---
 velox/docs/functions/spark/binary.rst         |   8 +-
 velox/functions/sparksql/Hash.cpp             | 406 ++++++++++++++----
 velox/functions/sparksql/tests/HashTest.cpp   | 106 +++++
 .../functions/sparksql/tests/XxHash64Test.cpp | 113 +++++
 4 files changed, 551 insertions(+), 82 deletions(-)
diff --git a/velox/docs/functions/spark/binary.rst b/velox/docs/functions/spark/binary.rst
index 4f76b29c1148..9ec544a8e810 100644
--- a/velox/docs/functions/spark/binary.rst
+++ b/velox/docs/functions/spark/binary.rst
@@ -11,7 +11,7 @@ Binary Functions
     Computes the hash of one or more input values using seed value of 42. For
     multiple arguments, their types can be different.
     Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP.
+    VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW.
 
 
 .. spark:function:: hash_with_seed(seed, x, ...) -> integer
@@ -19,21 +19,21 @@ Binary Functions
     Computes the hash of one or more input values using specified seed. For
     multiple arguments, their types can be different.
     Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP.
+    VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW.
 
 .. spark:function:: xxhash64(x, ...) -> bigint
 
     Computes the xxhash64 of one or more input values using seed value of 42.
     For multiple arguments, their types can be different.
     Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP.
+    VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW.
 
 .. spark:function:: xxhash64_with_seed(seed, x, ...) -> bigint
 
     Computes the xxhash64 of one or more input values using specified seed. For
     multiple arguments, their types can be different.
     Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT and TIMESTAMP.
+    VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW.
 
 .. spark:function:: md5(x) -> varbinary
 
diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp
index 8ad8b4e7f083..20e1284b68c4 100644
--- a/velox/functions/sparksql/Hash.cpp
+++ b/velox/functions/sparksql/Hash.cpp
@@ -26,21 +26,273 @@ namespace {
 
 const int32_t kDefaultSeed = 42;
 
+struct Murmur3Hash;
+struct XxHash64;
+
+// A template struct that contains the seed and return type of the hash
+// function.
+template <typename HashClass>
+struct HashTraits {};
+
+template <>
+struct HashTraits<Murmur3Hash> {
+  using SeedType = int32_t;
+  using ReturnType = int32_t;
+};
+
+template <>
+struct HashTraits<XxHash64> {
+  using SeedType = int64_t;
+  using ReturnType = int64_t;
+};
+
+// Computes the hash value of input using the hash function in HashClass.
+template <typename HashClass, typename SeedType, typename ReturnType>
+ReturnType hashOne(int32_t input, SeedType seed) {
+  return HashClass::hashInt32(input, seed);
+}
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+ReturnType hashOne(int64_t input, SeedType seed) {
+  return HashClass::hashInt64(input, seed);
+}
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+ReturnType hashOne(float input, SeedType seed) {
+  return HashClass::hashFloat(input, seed);
+}
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+ReturnType hashOne(double input, SeedType seed) {
+  return HashClass::hashDouble(input, seed);
+}
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+ReturnType hashOne(int128_t input, SeedType seed) {
+  return HashClass::hashLongDecimal(input, seed);
+}
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+ReturnType hashOne(Timestamp input, SeedType seed) {
+  return HashClass::hashTimestamp(input, seed);
+}
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+ReturnType hashOne(StringView input, SeedType seed) {
+  return HashClass::hashBytes(input, seed);
+}
+
+// Class to compute hashes identical to one produced by Spark.
+// Hashes are computed using the algorithm implemented in HashClass.
+template <
+    typename HashClass,
+    typename SeedType = typename HashTraits<HashClass>::SeedType,
+    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
+class SparkVectorHasher {
+ public:
+  SparkVectorHasher(DecodedVector& decoded) : decoded_(decoded) {}
+
+  virtual ~SparkVectorHasher() = default;
+
+  // Compute the hash value of input vector at index.
+  ReturnType hashAt(vector_size_t index, SeedType seed) {
+    if (decoded_.isNullAt(index)) {
+      return seed;
+    }
+    return hashNotNullAt(index, seed);
+  }
+
+  // Compute the hash value of input vector at index for non-null values.
+  virtual ReturnType hashNotNullAt(vector_size_t index, SeedType seed) = 0;
+
+ protected:
+  const DecodedVector& decoded_;
+};
+
+template <
+    typename HashClass,
+    TypeKind kind,
+    typename SeedType = typename HashTraits<HashClass>::SeedType,
+    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
+class PrimitiveVectorHasher;
+
+template <
+    typename HashClass,
+    typename SeedType = typename HashTraits<HashClass>::SeedType,
+    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
+class ArrayVectorHasher;
+
+template <
+    typename HashClass,
+    typename SeedType = typename HashTraits<HashClass>::SeedType,
+    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
+class MapVectorHasher;
+
+template <
+    typename HashClass,
+    typename SeedType = typename HashTraits<HashClass>::SeedType,
+    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
+class RowVectorHasher;
+
+template <typename HashClass, TypeKind kind>
+std::shared_ptr<SparkVectorHasher<HashClass>> createPrimitiveVectorHasher(
+    DecodedVector& decoded) {
+  return std::make_shared<PrimitiveVectorHasher<HashClass, kind>>(decoded);
+}
+
+template <typename HashClass>
+std::shared_ptr<SparkVectorHasher<HashClass>> createVectorHasher(
+    DecodedVector& decoded) {
+  switch (decoded.base()->typeKind()) {
+    case TypeKind::ARRAY:
+      return std::make_shared<ArrayVectorHasher<HashClass>>(decoded);
+    case TypeKind::MAP:
+      return std::make_shared<MapVectorHasher<HashClass>>(decoded);
+    case TypeKind::ROW:
+      return std::make_shared<RowVectorHasher<HashClass>>(decoded);
+    default:
+      return VELOX_DYNAMIC_SCALAR_TEMPLATE_TYPE_DISPATCH(
+          createPrimitiveVectorHasher,
+          HashClass,
+          decoded.base()->typeKind(),
+          decoded);
+  }
+}
+
+template <
+    typename HashClass,
+    TypeKind kind,
+    typename SeedType,
+    typename ReturnType>
+class PrimitiveVectorHasher : public SparkVectorHasher<HashClass> {
+ public:
+  PrimitiveVectorHasher(DecodedVector& decoded)
+      : SparkVectorHasher<HashClass>(decoded) {}
+
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+    return hashOne<HashClass, SeedType, ReturnType>(
+        this->decoded_.template valueAt<typename TypeTraits<kind>::NativeType>(
+            index),
+        seed);
+  }
+};
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+class ArrayVectorHasher : public SparkVectorHasher<HashClass> {
+ public:
+  ArrayVectorHasher(DecodedVector& decoded)
+      : SparkVectorHasher<HashClass>(decoded) {
+    base_ = decoded.base()->as<ArrayVector>();
+    indices_ = decoded.indices();
+
+    SelectivityVector rows(base_->elements()->size());
+    decodedElements_.decode(*base_->elements(), rows);
+    elementHasher_ = createVectorHasher<HashClass>(decodedElements_);
+  }
+
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+    auto size = base_->sizeAt(indices_[index]);
+    auto offset = base_->offsetAt(indices_[index]);
+
+    ReturnType result = seed;
+    for (auto i = 0; i < size; ++i) {
+      result = elementHasher_->hashAt(i + offset, result);
+    }
+    return result;
+  }
+
+ private:
+  const ArrayVector* base_;
+  const int32_t* indices_;
+  DecodedVector decodedElements_;
+  std::shared_ptr<SparkVectorHasher<HashClass>> elementHasher_;
+};
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+class MapVectorHasher : public SparkVectorHasher<HashClass> {
+ public:
+  MapVectorHasher(DecodedVector& decoded)
+      : SparkVectorHasher<HashClass>(decoded) {
+    base_ = decoded.base()->as<MapVector>();
+    indices_ = decoded.indices();
+
+    SelectivityVector rows(base_->mapKeys()->size());
+    decodedKeys_.decode(*base_->mapKeys(), rows);
+    decodedValues_.decode(*base_->mapValues(), rows);
+    keyHasher_ = createVectorHasher<HashClass>(decodedKeys_);
+    valueHasher_ = createVectorHasher<HashClass>(decodedValues_);
+  }
+
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+    auto size = base_->sizeAt(indices_[index]);
+    auto offset = base_->offsetAt(indices_[index]);
+
+    ReturnType result = seed;
+    for (auto i = 0; i < size; ++i) {
+      result = keyHasher_->hashAt(i + offset, result);
+      result = valueHasher_->hashAt(i + offset, result);
+    }
+    return result;
+  }
+
+ private:
+  const MapVector* base_;
+  const int32_t* indices_;
+  DecodedVector decodedKeys_;
+  DecodedVector decodedValues_;
+  std::shared_ptr<SparkVectorHasher<HashClass>> keyHasher_;
+  std::shared_ptr<SparkVectorHasher<HashClass>> valueHasher_;
+};
+
+template <typename HashClass, typename SeedType, typename ReturnType>
+class RowVectorHasher : public SparkVectorHasher<HashClass> {
+ public:
+  RowVectorHasher(DecodedVector& decoded)
+      : SparkVectorHasher<HashClass>(decoded) {
+    base_ = decoded.base()->as<RowVector>();
+    indices_ = decoded.indices();
+
+    SelectivityVector rows(base_->size());
+    decodedChildren_.resize(base_->childrenSize());
+    hashers_.resize(base_->childrenSize());
+    for (auto i = 0; i < base_->childrenSize(); ++i) {
+      decodedChildren_[i].decode(*base_->childAt(i), rows);
+      hashers_[i] = createVectorHasher<HashClass>(decodedChildren_[i]);
+    }
+  }
+
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+    ReturnType result = seed;
+    for (auto i = 0; i < base_->childrenSize(); ++i) {
+      result = hashers_[i]->hashAt(indices_[index], result);
+    }
+    return result;
+  }
+
+ private:
+  const RowVector* base_;
+  const int32_t* indices_;
+  std::vector<DecodedVector> decodedChildren_;
+  std::vector<std::shared_ptr<SparkVectorHasher<HashClass>>> hashers_;
+};
+
 // ReturnType can be either int32_t or int64_t
 // HashClass contains the function like hashInt32
-template <typename ReturnType, typename HashClass, typename SeedType>
+template <
+    typename HashClass,
+    typename SeedType = typename HashTraits<HashClass>::SeedType,
+    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
 void applyWithType(
     const SelectivityVector& rows,
     std::vector<VectorPtr>& args, // Not using const ref so we can reuse args
     std::optional<SeedType> seed,
     exec::EvalCtx& context,
     VectorPtr& resultRef) {
-  HashClass hash;
   size_t hashIdx = seed ? 1 : 0;
   SeedType hashSeed = seed ? *seed : kDefaultSeed;
 
   auto& result = *resultRef->as<FlatVector<ReturnType>>();
-  rows.applyToSelected([&](int row) { result.set(row, hashSeed); });
+  rows.applyToSelected([&](auto row) { result.set(row, hashSeed); });
 
   exec::LocalSelectivityVector selectedMinusNulls(context);
 
@@ -54,36 +306,16 @@ void applyWithType(
           decoded->nulls(&rows), rows.begin(), rows.end());
       selected = selectedMinusNulls.get();
     }
-    switch (args[i]->type()->kind()) {
-// Derived from InterpretedHashFunction.hash:
-// https://github.com/apache/spark/blob/382b66e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L532
-#define CASE(typeEnum, hashFn, inputType)                                      \
-  case TypeKind::typeEnum:                                                     \
-    selected->applyToSelected([&](int row) {                                   \
-      result.set(                                                              \
-          row, hashFn(decoded->valueAt<inputType>(row), result.valueAt(row))); \
-    });                                                                        \
-    break;
-      CASE(BOOLEAN, hash.hashInt32, bool);
-      CASE(TINYINT, hash.hashInt32, int8_t);
-      CASE(SMALLINT, hash.hashInt32, int16_t);
-      CASE(INTEGER, hash.hashInt32, int32_t);
-      CASE(BIGINT, hash.hashInt64, int64_t);
-      CASE(VARCHAR, hash.hashBytes, StringView);
-      CASE(VARBINARY, hash.hashBytes, StringView);
-      CASE(REAL, hash.hashFloat, float);
-      CASE(DOUBLE, hash.hashDouble, double);
-      CASE(HUGEINT, hash.hashLongDecimal, int128_t);
-      CASE(TIMESTAMP, hash.hashTimestamp, Timestamp);
-#undef CASE
-      default:
-        VELOX_NYI(
-            "Unsupported type for HASH(): {}", args[i]->type()->toString());
-    }
+
+    auto hasher = createVectorHasher<HashClass>(*decoded);
+    selected->applyToSelected([&](auto row) {
+      result.set(row, hasher->hashNotNullAt(row, result.valueAt(row)));
+    });
   }
 }
 
-// Derived from src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java.
+// Derived from
+// src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java.
 //
 // Spark's Murmur3 seems slightly different from the original from Austin
 // Appleby: in particular the fmix function's first line is different. The
@@ -95,13 +327,13 @@ void applyWithType(
 
 class Murmur3Hash final {
  public:
-  uint32_t hashInt32(int32_t input, uint32_t seed) {
+  static uint32_t hashInt32(int32_t input, uint32_t seed) {
     uint32_t k1 = mixK1(input);
     uint32_t h1 = mixH1(seed, k1);
     return fmix(h1, 4);
   }
 
-  uint32_t hashInt64(uint64_t input, uint32_t seed) {
+  static uint32_t hashInt64(uint64_t input, uint32_t seed) {
     uint32_t low = input;
     uint32_t high = input >> 32;
 
@@ -116,19 +348,19 @@ class Murmur3Hash final {
 
   // Floating point numbers are hashed as if they are integers, with
   // -0f defined to have the same output as +0f.
-  uint32_t hashFloat(float input, uint32_t seed) {
+  static uint32_t hashFloat(float input, uint32_t seed) {
     return hashInt32(
         input == -0.f ? 0 : *reinterpret_cast<uint32_t*>(&input), seed);
   }
 
-  uint32_t hashDouble(double input, uint32_t seed) {
+  static uint32_t hashDouble(double input, uint32_t seed) {
     return hashInt64(
         input == -0. ? 0 : *reinterpret_cast<uint64_t*>(&input), seed);
   }
 
   // Spark also has an hashUnsafeBytes2 function, but it was not used at the
   // time of implementation.
-  uint32_t hashBytes(const StringView& input, uint32_t seed) {
+  static uint32_t hashBytes(const StringView& input, uint32_t seed) {
     const char* i = input.data();
     const char* const end = input.data() + input.size();
     uint32_t h1 = seed;
@@ -141,25 +373,25 @@ class Murmur3Hash final {
     return fmix(h1, input.size());
   }
 
-  uint32_t hashLongDecimal(int128_t input, uint32_t seed) {
+  static uint32_t hashLongDecimal(int128_t input, uint32_t seed) {
     char out[sizeof(int128_t)];
     int32_t length = DecimalUtil::toByteArray(input, out);
     return hashBytes(StringView(out, length), seed);
   }
 
-  uint32_t hashTimestamp(Timestamp input, uint32_t seed) {
+  static uint32_t hashTimestamp(Timestamp input, uint32_t seed) {
     return hashInt64(input.toMicros(), seed);
   }
 
  private:
-  uint32_t mixK1(uint32_t k1) {
+  static uint32_t mixK1(uint32_t k1) {
     k1 *= 0xcc9e2d51;
     k1 = bits::rotateLeft(k1, 15);
     k1 *= 0x1b873593;
     return k1;
   }
 
-  uint32_t mixH1(uint32_t h1, uint32_t k1) {
+  static uint32_t mixH1(uint32_t h1, uint32_t k1) {
     h1 ^= k1;
     h1 = bits::rotateLeft(h1, 13);
     h1 = h1 * 5 + 0xe6546b64;
@@ -167,7 +399,7 @@ class Murmur3Hash final {
   }
 
   // Finalization mix - force all bits of a hash block to avalanche
-  uint32_t fmix(uint32_t h1, uint32_t length) {
+  static uint32_t fmix(uint32_t h1, uint32_t length) {
     h1 ^= length;
     h1 ^= h1 >> 16;
     h1 *= 0x85ebca6b;
@@ -190,7 +422,7 @@ class Murmur3HashFunction final : public exec::VectorFunction {
       exec::EvalCtx& context,
       VectorPtr& resultRef) const final {
     context.ensureWritable(rows, INTEGER(), resultRef);
-    applyWithType<int32_t, Murmur3Hash>(rows, args, seed_, context, resultRef);
+    applyWithType<Murmur3Hash>(rows, args, seed_, context, resultRef);
   }
 
  private:
@@ -198,21 +430,15 @@ class Murmur3HashFunction final : public exec::VectorFunction {
 };
 
 class XxHash64 final {
-  const uint64_t PRIME64_1 = 0x9E3779B185EBCA87L;
-  const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FL;
-  const uint64_t PRIME64_3 = 0x165667B19E3779F9L;
-  const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63L;
-  const uint64_t PRIME64_5 = 0x27D4EB2F165667C5L;
-
  public:
-  int64_t hashInt32(const int32_t input, uint64_t seed) {
+  static uint64_t hashInt32(const int32_t input, uint64_t seed) {
     int64_t hash = seed + PRIME64_5 + 4L;
     hash ^= static_cast<int64_t>((input & 0xFFFFFFFFL) * PRIME64_1);
     hash = bits::rotateLeft64(hash, 23) * PRIME64_2 + PRIME64_3;
     return fmix(hash);
   }
 
-  int64_t hashInt64(int64_t input, uint64_t seed) {
+  static uint64_t hashInt64(int64_t input, uint64_t seed) {
     int64_t hash = seed + PRIME64_5 + 8L;
     hash ^= bits::rotateLeft64(input * PRIME64_2, 31) * PRIME64_1;
     hash = bits::rotateLeft64(hash, 27) * PRIME64_1 + PRIME64_4;
@@ -221,17 +447,17 @@ class XxHash64 final {
 
   // Floating point numbers are hashed as if they are integers, with
   // -0f defined to have the same output as +0f.
-  int64_t hashFloat(float input, uint64_t seed) {
+  static uint64_t hashFloat(float input, uint64_t seed) {
     return hashInt32(
         input == -0.f ? 0 : *reinterpret_cast<uint32_t*>(&input), seed);
   }
 
-  int64_t hashDouble(double input, uint64_t seed) {
+  static uint64_t hashDouble(double input, uint64_t seed) {
     return hashInt64(
         input == -0. ? 0 : *reinterpret_cast<uint64_t*>(&input), seed);
   }
 
-  uint64_t hashBytes(const StringView& input, uint64_t seed) {
+  static uint64_t hashBytes(const StringView& input, uint64_t seed) {
     const char* i = input.data();
     const char* const end = input.data() + input.size();
 
@@ -253,18 +479,24 @@ class XxHash64 final {
     return fmix(hash);
   }
 
-  int64_t hashLongDecimal(int128_t input, uint32_t seed) {
+  static uint64_t hashLongDecimal(int128_t input, uint64_t seed) {
     char out[sizeof(int128_t)];
     int32_t length = DecimalUtil::toByteArray(input, out);
     return hashBytes(StringView(out, length), seed);
   }
 
-  int64_t hashTimestamp(Timestamp input, uint32_t seed) {
+  static uint64_t hashTimestamp(Timestamp input, uint64_t seed) {
     return hashInt64(input.toMicros(), seed);
   }
 
  private:
-  uint64_t fmix(uint64_t hash) {
+  static const uint64_t PRIME64_1 = 0x9E3779B185EBCA87L;
+  static const uint64_t PRIME64_2 = 0xC2B2AE3D27D4EB4FL;
+  static const uint64_t PRIME64_3 = 0x165667B19E3779F9L;
+  static const uint64_t PRIME64_4 = 0x85EBCA77C2B2AE63L;
+  static const uint64_t PRIME64_5 = 0x27D4EB2F165667C5L;
+
+  static uint64_t fmix(uint64_t hash) {
     hash ^= hash >> 33;
     hash *= PRIME64_2;
     hash ^= hash >> 29;
@@ -273,7 +505,7 @@ class XxHash64 final {
     return hash;
   }
 
-  uint64_t hashBytesByWords(const StringView& input, uint64_t seed) {
+  static uint64_t hashBytesByWords(const StringView& input, uint64_t seed) {
     const char* i = input.data();
     const char* const end = input.data() + input.size();
     uint32_t length = input.size();
@@ -353,13 +585,52 @@ class XxHash64Function final : public exec::VectorFunction {
       exec::EvalCtx& context,
       VectorPtr& resultRef) const final {
     context.ensureWritable(rows, BIGINT(), resultRef);
-    applyWithType<int64_t, XxHash64>(rows, args, seed_, context, resultRef);
+    applyWithType<XxHash64>(rows, args, seed_, context, resultRef);
   }
 
  private:
   const std::optional<int64_t> seed_;
 };
 
+bool checkHashElementType(const TypePtr& type) {
+  switch (type->kind()) {
+    case TypeKind::BOOLEAN:
+    case TypeKind::TINYINT:
+    case TypeKind::SMALLINT:
+    case TypeKind::INTEGER:
+    case TypeKind::BIGINT:
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+    case TypeKind::REAL:
+    case TypeKind::DOUBLE:
+    case TypeKind::HUGEINT:
+    case TypeKind::TIMESTAMP:
+      return true;
+    case TypeKind::ARRAY:
+      return checkHashElementType(type->asArray().elementType());
+    case TypeKind::MAP:
+      return checkHashElementType(type->asMap().keyType()) &&
+          checkHashElementType(type->asMap().valueType());
+    case TypeKind::ROW: {
+      const auto& children = type->asRow().children();
+      return std::all_of(
+          children.begin(), children.end(), [](const auto& child) {
+            return checkHashElementType(child);
+          });
+    }
+    default:
+      return false;
+  }
+}
+
+void checkArgTypes(const std::vector<exec::VectorFunctionArg>& args) {
+  for (const auto& arg : args) {
+    if (!checkHashElementType(arg.type)) {
+      VELOX_USER_FAIL("Unsupported type for hash: {}", arg.type->toString())
+    }
+  }
+}
+
 } // namespace
 
 // Not all types are supported by now. Check types when making hash function.
@@ -372,27 +643,6 @@ std::vector<std::shared_ptr<exec::FunctionSignature>> hashSignatures() {
               .build()};
 }
 
-void checkArgTypes(const std::vector<exec::VectorFunctionArg>& args) {
-  for (const auto& arg : args) {
-    switch (arg.type->kind()) {
-      case TypeKind::BOOLEAN:
-      case TypeKind::TINYINT:
-      case TypeKind::SMALLINT:
-      case TypeKind::INTEGER:
-      case TypeKind::BIGINT:
-      case TypeKind::VARCHAR:
-      case TypeKind::VARBINARY:
-      case TypeKind::REAL:
-      case TypeKind::DOUBLE:
-      case TypeKind::HUGEINT:
-      case TypeKind::TIMESTAMP:
-        break;
-      default:
-        VELOX_USER_FAIL("Unsupported type for hash: {}", arg.type->toString())
-    }
-  }
-}
-
 std::shared_ptr<exec::VectorFunction> makeHash(
     const std::string& name,
     const std::vector<exec::VectorFunctionArg>& inputArgs,
diff --git a/velox/functions/sparksql/tests/HashTest.cpp b/velox/functions/sparksql/tests/HashTest.cpp
index 422d63643e56..1c6569dc15bf 100644
--- a/velox/functions/sparksql/tests/HashTest.cpp
+++ b/velox/functions/sparksql/tests/HashTest.cpp
@@ -18,6 +18,8 @@
 
 #include <stdint.h>
 
+using facebook::velox::test::assertEqualVectors;
+
 namespace facebook::velox::functions::sparksql::test {
 namespace {
 
@@ -27,6 +29,10 @@ class HashTest : public SparkFunctionBaseTest {
   std::optional<int32_t> hash(std::optional<T> arg) {
     return evaluateOnce<int32_t>("hash(c0)", arg);
   }
+
+  VectorPtr hash(VectorPtr vector) {
+    return evaluate("hash(c0)", makeRowVector({vector}));
+  }
 };
 
 TEST_F(HashTest, String) {
@@ -128,5 +134,105 @@ TEST_F(HashTest, Float) {
   EXPECT_EQ(hash<float>(-limits::infinity()), 427440766);
 }
 
+TEST_F(HashTest, Array) {
+  assertEqualVectors(
+      makeFlatVector<int32_t>({2101165938, 42, 1045631400}),
+      hash(makeArrayVector<int64_t>({{1, 2, 3, 4, 5}, {}, {1, 2, 3}})));
+
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-559580957, 1765031574, 42}),
+      hash(makeNullableArrayVector<int32_t>(
+          {{1, std::nullopt}, {std::nullopt, 2}, {std::nullopt}})));
+
+  // Nested array.
+  {
+    using innerArrayType = std::vector<std::optional<int64_t>>;
+    using outerArrayType =
+        std::vector<std::optional<std::vector<std::optional<int64_t>>>>;
+
+    innerArrayType a{1, std::nullopt, 2, 3};
+    innerArrayType b{4, 5};
+    innerArrayType c{6, 7, 8};
+    outerArrayType row1{{a}, {b}};
+    outerArrayType row2{{a}, {c}};
+    outerArrayType row3{{{}}};
+    outerArrayType row4{{{std::nullopt}}};
+    auto arrayVector = makeNullableNestedArrayVector<int64_t>(
+        {{row1}, {row2}, {row3}, {row4}, std::nullopt});
+    assertEqualVectors(
+        makeFlatVector<int32_t>({2101165938, -992561130, 42, 42, 42}),
+        hash(arrayVector));
+  }
+
+  // Array of map.
+  {
+    using S = StringView;
+    using P = std::pair<int64_t, std::optional<S>>;
+    std::vector<P> a{P{1, S{"a"}}, P{2, std::nullopt}};
+    std::vector<P> b{P{3, S{"c"}}};
+    std::vector<std::vector<std::vector<P>>> data = {{a, b}};
+    auto arrayVector = makeArrayOfMapVector<int64_t, S>(data);
+    assertEqualVectors(
+        makeFlatVector<int32_t>(std::vector<int32_t>{-718462205}),
+        hash(arrayVector));
+  }
+
+  // Array of row.
+  {
+    std::vector<std::vector<std::optional<std::tuple<int32_t, std::string>>>>
+        data = {
+            {{{1, "red"}}, {{2, "blue"}}, {{3, "green"}}},
+            {{{1, "red"}}, std::nullopt, {{3, "green"}}},
+            {std::nullopt},
+        };
+    auto arrayVector = makeArrayOfRowVector(data, ROW({INTEGER(), VARCHAR()}));
+    assertEqualVectors(
+        makeFlatVector<int32_t>({-1458343314, 551500425, 42}),
+        hash(arrayVector));
+  }
+}
+
+TEST_F(HashTest, Map) {
+  auto mapVector = makeMapVector<int64_t, double>(
+      {{{1, 17.0}, {2, 36.0}, {3, 8.0}, {4, 28.0}, {5, 24.0}, {6, 32.0}}});
+  assertEqualVectors(
+      makeFlatVector<int32_t>(std::vector<int32_t>{1263683448}),
+      hash(mapVector));
+
+  auto mapOfArrays = createMapOfArraysVector<int32_t, int32_t>(
+      {{{1, {{1, 2, 3}}}}, {{2, {{4, 5, 6}}}}, {{3, {{7, 8, 9}}}}});
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-1818148947, 529298908, 825098912}),
+      hash(mapOfArrays));
+
+  auto mapWithNullArrays = createMapOfArraysVector<int64_t, int64_t>(
+      {{{1, std::nullopt}}, {{2, {{4, 5, std::nullopt}}}}, {{3, {{}}}}});
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-1712319331, 2060637564, 519220707}),
+      hash(mapWithNullArrays));
+}
+
+TEST_F(HashTest, Row) {
+  auto row = makeRowVector({
+      makeFlatVector<int64_t>({1, 3}),
+      makeFlatVector<int64_t>({2, 4}),
+  });
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-1181176833, 1717636039}), hash(row));
+
+  row = makeRowVector({
+      makeNullableFlatVector<int64_t>({1, std::nullopt}),
+      makeNullableFlatVector<int64_t>({std::nullopt, 4}),
+  });
+  assertEqualVectors(
+      makeFlatVector<int32_t>({-1712319331, 1344313940}), hash(row));
+
+  row->setNull(0, true);
+  assertEqualVectors(makeFlatVector<int32_t>({42, 1344313940}), hash(row));
+
+  row->setNull(1, true);
+  assertEqualVectors(makeFlatVector<int32_t>({42, 42}), hash(row));
+}
+
 } // namespace
 } // namespace facebook::velox::functions::sparksql::test
diff --git a/velox/functions/sparksql/tests/XxHash64Test.cpp b/velox/functions/sparksql/tests/XxHash64Test.cpp
index 09162f4a0279..d1508f3681fb 100644
--- a/velox/functions/sparksql/tests/XxHash64Test.cpp
+++ b/velox/functions/sparksql/tests/XxHash64Test.cpp
@@ -18,6 +18,8 @@
 
 #include <stdint.h>
 
+using facebook::velox::test::assertEqualVectors;
+
 namespace facebook::velox::functions::sparksql::test {
 namespace {
 class XxHash64Test : public SparkFunctionBaseTest {
@@ -26,6 +28,10 @@ class XxHash64Test : public SparkFunctionBaseTest {
   std::optional<int64_t> xxhash64(std::optional<T> arg) {
     return evaluateOnce<int64_t>("xxhash64(c0)", arg);
   }
+
+  VectorPtr xxhash64(VectorPtr vector) {
+    return evaluate("xxhash64(c0)", makeRowVector({vector}));
+  }
 };
 
 // The expected result was obtained by running SELECT xxhash64("Spark") query
@@ -138,6 +144,113 @@ TEST_F(XxHash64Test, float) {
   EXPECT_EQ(xxhash64<float>(-limits::infinity()), -7580553461823983095);
 }
 
+TEST_F(XxHash64Test, array) {
+  assertEqualVectors(
+      makeFlatVector<int64_t>({-6041664978295882827, 42, 4904562767517797033}),
+      xxhash64(makeArrayVector<int64_t>({{1, 2, 3, 4, 5}, {}, {1, 2, 3}})));
+
+  assertEqualVectors(
+      makeFlatVector<int64_t>({-6698625589789238999, 8420071140774656230, 42}),
+      xxhash64(makeNullableArrayVector<int32_t>(
+          {{1, std::nullopt}, {std::nullopt, 2}, {std::nullopt}})));
+
+  // Nested array.
+  {
+    using innerArrayType = std::vector<std::optional<int64_t>>;
+    using outerArrayType =
+        std::vector<std::optional<std::vector<std::optional<int64_t>>>>;
+
+    innerArrayType a{1, std::nullopt, 2, 3};
+    innerArrayType b{4, 5};
+    innerArrayType c{6, 7, 8};
+    outerArrayType row1{{a}, {b}};
+    outerArrayType row2{{a}, {c}};
+    outerArrayType row3{{{}}};
+    outerArrayType row4{{{std::nullopt}}};
+    auto arrayVector = makeNullableNestedArrayVector<int64_t>(
+        {{row1}, {row2}, {row3}, {row4}, std::nullopt});
+    assertEqualVectors(
+        makeFlatVector<int64_t>(
+            {-6041664978295882827, -1052942565807509112, 42, 42, 42}),
+        xxhash64(arrayVector));
+  }
+
+  // Array of map.
+  {
+    using S = StringView;
+    using P = std::pair<int64_t, std::optional<S>>;
+    std::vector<P> a{P{1, S{"a"}}, P{2, std::nullopt}};
+    std::vector<P> b{P{3, S{"c"}}};
+    std::vector<std::vector<std::vector<P>>> data = {{a, b}};
+    auto arrayVector = makeArrayOfMapVector<int64_t, S>(data);
+    assertEqualVectors(
+        makeFlatVector<int64_t>(std::vector<int64_t>{2880747995994395223}),
+        xxhash64(arrayVector));
+  }
+
+  // Array of row.
+  {
+    std::vector<std::vector<std::optional<std::tuple<int32_t, std::string>>>>
+        data = {
+            {{{1, "red"}}, {{2, "blue"}}, {{3, "green"}}},
+            {{{1, "red"}}, std::nullopt, {{3, "green"}}},
+            {std::nullopt},
+        };
+    auto arrayVector = makeArrayOfRowVector(data, ROW({INTEGER(), VARCHAR()}));
+    assertEqualVectors(
+        makeFlatVector<int64_t>(
+            {-4096178443626566478, -8973283971856715104, 42}),
+        xxhash64(arrayVector));
+  }
+}
+
+TEST_F(XxHash64Test, map) {
+  auto mapVector = makeMapVector<int64_t, double>(
+      {{{1, 17.0}, {2, 36.0}, {3, 8.0}, {4, 28.0}, {5, 24.0}, {6, 32.0}}});
+  assertEqualVectors(
+      makeFlatVector<int64_t>(std::vector<int64_t>{-6303587702533348160}),
+      xxhash64(mapVector));
+
+  auto mapOfArrays = createMapOfArraysVector<int32_t, int32_t>(
+      {{{1, {{1, 2, 3}}}}, {{2, {{4, 5, 6}}}}, {{3, {{7, 8, 9}}}}});
+  assertEqualVectors(
+      makeFlatVector<int64_t>(
+          {-2103781794412908874, 1112887818746642853, 5787852566364222439}),
+      xxhash64(mapOfArrays));
+
+  auto mapWithNullArrays = createMapOfArraysVector<int64_t, int64_t>(
+      {{{1, std::nullopt}}, {{2, {{4, 5, std::nullopt}}}}, {{3, {{}}}}});
+  assertEqualVectors(
+      makeFlatVector<int64_t>(
+          {-7001672635703045582, 7217681953522744649, 3188756510806108107}),
+      xxhash64(mapWithNullArrays));
+}
+
+TEST_F(XxHash64Test, row) {
+  auto row = makeRowVector({
+      makeFlatVector<int64_t>({1, 3}),
+      makeFlatVector<int64_t>({2, 4}),
+  });
+  assertEqualVectors(
+      makeFlatVector<int64_t>({-8198029865082835910, 351067884137457704}),
+      xxhash64(row));
+
+  row = makeRowVector({
+      makeNullableFlatVector<int64_t>({1, std::nullopt}),
+      makeNullableFlatVector<int64_t>({std::nullopt, 4}),
+  });
+  assertEqualVectors(
+      makeFlatVector<int64_t>({-7001672635703045582, 404280023041566627}),
+      xxhash64(row));
+
+  row->setNull(0, true);
+  assertEqualVectors(
+      makeFlatVector<int64_t>({42, 404280023041566627}), xxhash64(row));
+
+  row->setNull(1, true);
+  assertEqualVectors(makeFlatVector<int64_t>({42, 42}), xxhash64(row));
+}
+
 TEST_F(XxHash64Test, hashSeed) {
   auto xxhash64WithSeed = [&](int64_t seed, const std::optional<int64_t>& arg) {
     return evaluateOnce<int64_t>(

From 86a2d26e4376966295bf615c0c491b8d5e51b863 Mon Sep 17 00:00:00 2001
From: "Ma, Rong" <rong.ma@intel.com>
Date: Fri, 26 Apr 2024 11:37:17 +0800
Subject: [PATCH 2/5] remove virtual function call

---
 velox/functions/sparksql/Hash.cpp | 175 ++++++++++++++++--------------
 1 file changed, 96 insertions(+), 79 deletions(-)

diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp
index 20e1284b68c4..a7ec95e2aa96 100644
--- a/velox/functions/sparksql/Hash.cpp
+++ b/velox/functions/sparksql/Hash.cpp
@@ -26,70 +26,76 @@ namespace {
 
 const int32_t kDefaultSeed = 42;
 
-struct Murmur3Hash;
-struct XxHash64;
-
-// A template struct that contains the seed and return type of the hash
-// function.
-template <typename HashClass>
-struct HashTraits {};
-
-template <>
-struct HashTraits<Murmur3Hash> {
-  using SeedType = int32_t;
-  using ReturnType = int32_t;
-};
-
-template <>
-struct HashTraits<XxHash64> {
-  using SeedType = int64_t;
-  using ReturnType = int64_t;
-};
-
 // Computes the hash value of input using the hash function in HashClass.
-template <typename HashClass, typename SeedType, typename ReturnType>
-ReturnType hashOne(int32_t input, SeedType seed) {
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    int32_t input,
+    typename HashClass::SeedType seed) {
   return HashClass::hashInt32(input, seed);
 }
 
-template <typename HashClass, typename SeedType, typename ReturnType>
-ReturnType hashOne(int64_t input, SeedType seed) {
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    int64_t input,
+    typename HashClass::SeedType seed) {
   return HashClass::hashInt64(input, seed);
 }
 
-template <typename HashClass, typename SeedType, typename ReturnType>
-ReturnType hashOne(float input, SeedType seed) {
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    float input,
+    typename HashClass::SeedType seed) {
   return HashClass::hashFloat(input, seed);
 }
 
-template <typename HashClass, typename SeedType, typename ReturnType>
-ReturnType hashOne(double input, SeedType seed) {
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    double input,
+    typename HashClass::SeedType seed) {
   return HashClass::hashDouble(input, seed);
 }
 
-template <typename HashClass, typename SeedType, typename ReturnType>
-ReturnType hashOne(int128_t input, SeedType seed) {
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    int128_t input,
+    typename HashClass::SeedType seed) {
   return HashClass::hashLongDecimal(input, seed);
 }
 
-template <typename HashClass, typename SeedType, typename ReturnType>
-ReturnType hashOne(Timestamp input, SeedType seed) {
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    Timestamp input,
+    typename HashClass::SeedType seed) {
   return HashClass::hashTimestamp(input, seed);
 }
 
-template <typename HashClass, typename SeedType, typename ReturnType>
-ReturnType hashOne(StringView input, SeedType seed) {
+template <typename HashClass>
+typename HashClass::ReturnType hashOne(
+    StringView input,
+    typename HashClass::SeedType seed) {
   return HashClass::hashBytes(input, seed);
 }
 
+template <typename HashClass, TypeKind kind>
+class PrimitiveVectorHasher;
+
+template <typename HashClass>
+class ArrayVectorHasher;
+
+template <typename HashClass>
+class MapVectorHasher;
+
+template <typename HashClass>
+class RowVectorHasher;
+
 // Class to compute hashes identical to one produced by Spark.
 // Hashes are computed using the algorithm implemented in HashClass.
-template <
-    typename HashClass,
-    typename SeedType = typename HashTraits<HashClass>::SeedType,
-    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
+template <typename HashClass>
 class SparkVectorHasher {
  public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
   SparkVectorHasher(DecodedVector& decoded) : decoded_(decoded) {}
 
   virtual ~SparkVectorHasher() = default;
@@ -103,36 +109,33 @@ class SparkVectorHasher {
   }
 
   // Compute the hash value of input vector at index for non-null values.
-  virtual ReturnType hashNotNullAt(vector_size_t index, SeedType seed) = 0;
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) {
+    switch (decoded_.base()->typeKind()) {
+      case TypeKind::ARRAY:
+        return static_cast<ArrayVectorHasher<HashClass>*>(this)->hashValueAt(
+            index, seed);
+      case TypeKind::MAP:
+        return static_cast<MapVectorHasher<HashClass>*>(this)->hashValueAt(
+            index, seed);
+      case TypeKind::ROW:
+        return static_cast<RowVectorHasher<HashClass>*>(this)->hashValueAt(
+            index, seed);
+      default:
+        return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
+            hashPrimitive, decoded_.base()->typeKind(), index, seed);
+    }
+  }
 
  protected:
   const DecodedVector& decoded_;
-};
-
-template <
-    typename HashClass,
-    TypeKind kind,
-    typename SeedType = typename HashTraits<HashClass>::SeedType,
-    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
-class PrimitiveVectorHasher;
-
-template <
-    typename HashClass,
-    typename SeedType = typename HashTraits<HashClass>::SeedType,
-    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
-class ArrayVectorHasher;
 
-template <
-    typename HashClass,
-    typename SeedType = typename HashTraits<HashClass>::SeedType,
-    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
-class MapVectorHasher;
-
-template <
-    typename HashClass,
-    typename SeedType = typename HashTraits<HashClass>::SeedType,
-    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
-class RowVectorHasher;
+ private:
+  template <TypeKind kind>
+  ReturnType hashPrimitive(vector_size_t index, SeedType seed) {
+    return static_cast<PrimitiveVectorHasher<HashClass, kind>*>(this)
+        ->hashValueAt(index, seed);
+  }
+};
 
 template <typename HashClass, TypeKind kind>
 std::shared_ptr<SparkVectorHasher<HashClass>> createPrimitiveVectorHasher(
@@ -159,27 +162,29 @@ std::shared_ptr<SparkVectorHasher<HashClass>> createVectorHasher(
   }
 }
 
-template <
-    typename HashClass,
-    TypeKind kind,
-    typename SeedType,
-    typename ReturnType>
+template <typename HashClass, TypeKind kind>
 class PrimitiveVectorHasher : public SparkVectorHasher<HashClass> {
  public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
   PrimitiveVectorHasher(DecodedVector& decoded)
       : SparkVectorHasher<HashClass>(decoded) {}
 
-  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
-    return hashOne<HashClass, SeedType, ReturnType>(
+  ReturnType hashValueAt(vector_size_t index, SeedType seed) {
+    return hashOne<HashClass>(
         this->decoded_.template valueAt<typename TypeTraits<kind>::NativeType>(
             index),
         seed);
   }
 };
 
-template <typename HashClass, typename SeedType, typename ReturnType>
+template <typename HashClass>
 class ArrayVectorHasher : public SparkVectorHasher<HashClass> {
  public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
   ArrayVectorHasher(DecodedVector& decoded)
       : SparkVectorHasher<HashClass>(decoded) {
     base_ = decoded.base()->as<ArrayVector>();
@@ -190,7 +195,7 @@ class ArrayVectorHasher : public SparkVectorHasher<HashClass> {
     elementHasher_ = createVectorHasher<HashClass>(decodedElements_);
   }
 
-  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+  ReturnType hashValueAt(vector_size_t index, SeedType seed) {
     auto size = base_->sizeAt(indices_[index]);
     auto offset = base_->offsetAt(indices_[index]);
 
@@ -208,9 +213,12 @@ class ArrayVectorHasher : public SparkVectorHasher<HashClass> {
   std::shared_ptr<SparkVectorHasher<HashClass>> elementHasher_;
 };
 
-template <typename HashClass, typename SeedType, typename ReturnType>
+template <typename HashClass>
 class MapVectorHasher : public SparkVectorHasher<HashClass> {
  public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
   MapVectorHasher(DecodedVector& decoded)
       : SparkVectorHasher<HashClass>(decoded) {
     base_ = decoded.base()->as<MapVector>();
@@ -223,7 +231,7 @@ class MapVectorHasher : public SparkVectorHasher<HashClass> {
     valueHasher_ = createVectorHasher<HashClass>(decodedValues_);
   }
 
-  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+  ReturnType hashValueAt(vector_size_t index, SeedType seed) {
     auto size = base_->sizeAt(indices_[index]);
     auto offset = base_->offsetAt(indices_[index]);
 
@@ -244,9 +252,12 @@ class MapVectorHasher : public SparkVectorHasher<HashClass> {
   std::shared_ptr<SparkVectorHasher<HashClass>> valueHasher_;
 };
 
-template <typename HashClass, typename SeedType, typename ReturnType>
+template <typename HashClass>
 class RowVectorHasher : public SparkVectorHasher<HashClass> {
  public:
+  using SeedType = typename HashClass::SeedType;
+  using ReturnType = typename HashClass::ReturnType;
+
   RowVectorHasher(DecodedVector& decoded)
       : SparkVectorHasher<HashClass>(decoded) {
     base_ = decoded.base()->as<RowVector>();
@@ -261,7 +272,7 @@ class RowVectorHasher : public SparkVectorHasher<HashClass> {
     }
   }
 
-  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
+  ReturnType hashValueAt(vector_size_t index, SeedType seed) {
     ReturnType result = seed;
     for (auto i = 0; i < base_->childrenSize(); ++i) {
       result = hashers_[i]->hashAt(indices_[index], result);
@@ -280,8 +291,8 @@ class RowVectorHasher : public SparkVectorHasher<HashClass> {
 // HashClass contains the function like hashInt32
 template <
     typename HashClass,
-    typename SeedType = typename HashTraits<HashClass>::SeedType,
-    typename ReturnType = typename HashTraits<HashClass>::ReturnType>
+    typename SeedType = typename HashClass::SeedType,
+    typename ReturnType = typename HashClass::ReturnType>
 void applyWithType(
     const SelectivityVector& rows,
     std::vector<VectorPtr>& args, // Not using const ref so we can reuse args
@@ -327,6 +338,9 @@ void applyWithType(
 
 class Murmur3Hash final {
  public:
+  using SeedType = int32_t;
+  using ReturnType = int32_t;
+
   static uint32_t hashInt32(int32_t input, uint32_t seed) {
     uint32_t k1 = mixK1(input);
     uint32_t h1 = mixH1(seed, k1);
@@ -431,6 +445,9 @@ class Murmur3HashFunction final : public exec::VectorFunction {
 
 class XxHash64 final {
  public:
+  using SeedType = int64_t;
+  using ReturnType = int64_t;
+
   static uint64_t hashInt32(const int32_t input, uint64_t seed) {
     int64_t hash = seed + PRIME64_5 + 4L;
     hash ^= static_cast<int64_t>((input & 0xFFFFFFFFL) * PRIME64_1);

From e6a49d145225e41364a71906118b68c456dac919 Mon Sep 17 00:00:00 2001
From: "Ma, Rong" <rong.ma@intel.com>
Date: Wed, 1 May 2024 11:41:52 +0800
Subject: [PATCH 3/5] address comments

---
 velox/docs/functions/spark/binary.rst         |  9 -------
 velox/functions/sparksql/tests/HashTest.cpp   | 25 +++++++------------
 .../functions/sparksql/tests/XxHash64Test.cpp | 19 +++++---------
 3 files changed, 15 insertions(+), 38 deletions(-)

diff --git a/velox/docs/functions/spark/binary.rst b/velox/docs/functions/spark/binary.rst
index 9ec544a8e810..249c7f4ee24f 100644
--- a/velox/docs/functions/spark/binary.rst
+++ b/velox/docs/functions/spark/binary.rst
@@ -10,30 +10,21 @@ Binary Functions
 
     Computes the hash of one or more input values using seed value of 42. For
     multiple arguments, their types can be different.
-    Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW.
-
 
 .. spark:function:: hash_with_seed(seed, x, ...) -> integer
 
     Computes the hash of one or more input values using specified seed. For
     multiple arguments, their types can be different.
-    Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW.
 
 .. spark:function:: xxhash64(x, ...) -> bigint
 
     Computes the xxhash64 of one or more input values using seed value of 42.
     For multiple arguments, their types can be different.
-    Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW.
 
 .. spark:function:: xxhash64_with_seed(seed, x, ...) -> bigint
 
     Computes the xxhash64 of one or more input values using specified seed. For
     multiple arguments, their types can be different.
-    Supported types are: BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, VARCHAR,
-    VARBINARY, REAL, DOUBLE, HUGEINT, TIMESTAMP, ARRAY, MAP and ROW.
 
 .. spark:function:: md5(x) -> varbinary
 
diff --git a/velox/functions/sparksql/tests/HashTest.cpp b/velox/functions/sparksql/tests/HashTest.cpp
index 1c6569dc15bf..58a9d4565c0d 100644
--- a/velox/functions/sparksql/tests/HashTest.cpp
+++ b/velox/functions/sparksql/tests/HashTest.cpp
@@ -134,7 +134,7 @@ TEST_F(HashTest, Float) {
   EXPECT_EQ(hash<float>(-limits::infinity()), 427440766);
 }
 
-TEST_F(HashTest, Array) {
+TEST_F(HashTest, array) {
   assertEqualVectors(
       makeFlatVector<int32_t>({2101165938, 42, 1045631400}),
       hash(makeArrayVector<int64_t>({{1, 2, 3, 4, 5}, {}, {1, 2, 3}})));
@@ -146,19 +146,12 @@ TEST_F(HashTest, Array) {
 
   // Nested array.
   {
-    using innerArrayType = std::vector<std::optional<int64_t>>;
-    using outerArrayType =
-        std::vector<std::optional<std::vector<std::optional<int64_t>>>>;
-
-    innerArrayType a{1, std::nullopt, 2, 3};
-    innerArrayType b{4, 5};
-    innerArrayType c{6, 7, 8};
-    outerArrayType row1{{a}, {b}};
-    outerArrayType row2{{a}, {c}};
-    outerArrayType row3{{{}}};
-    outerArrayType row4{{{std::nullopt}}};
-    auto arrayVector = makeNullableNestedArrayVector<int64_t>(
-        {{row1}, {row2}, {row3}, {row4}, std::nullopt});
+    auto arrayVector = makeNestedArrayVectorFromJson<int64_t>(
+        {"[[1, null, 2, 3], [4, 5]]",
+         "[[1, null, 2, 3], [6, 7, 8]]",
+         "[[]]",
+         "[[null]]",
+         "[null]"});
     assertEqualVectors(
         makeFlatVector<int32_t>({2101165938, -992561130, 42, 42, 42}),
         hash(arrayVector));
@@ -192,7 +185,7 @@ TEST_F(HashTest, Array) {
   }
 }
 
-TEST_F(HashTest, Map) {
+TEST_F(HashTest, map) {
   auto mapVector = makeMapVector<int64_t, double>(
       {{{1, 17.0}, {2, 36.0}, {3, 8.0}, {4, 28.0}, {5, 24.0}, {6, 32.0}}});
   assertEqualVectors(
@@ -212,7 +205,7 @@ TEST_F(HashTest, Map) {
       hash(mapWithNullArrays));
 }
 
-TEST_F(HashTest, Row) {
+TEST_F(HashTest, row) {
   auto row = makeRowVector({
       makeFlatVector<int64_t>({1, 3}),
       makeFlatVector<int64_t>({2, 4}),
diff --git a/velox/functions/sparksql/tests/XxHash64Test.cpp b/velox/functions/sparksql/tests/XxHash64Test.cpp
index d1508f3681fb..6e086ffd918f 100644
--- a/velox/functions/sparksql/tests/XxHash64Test.cpp
+++ b/velox/functions/sparksql/tests/XxHash64Test.cpp
@@ -156,19 +156,12 @@ TEST_F(XxHash64Test, array) {
 
   // Nested array.
   {
-    using innerArrayType = std::vector<std::optional<int64_t>>;
-    using outerArrayType =
-        std::vector<std::optional<std::vector<std::optional<int64_t>>>>;
-
-    innerArrayType a{1, std::nullopt, 2, 3};
-    innerArrayType b{4, 5};
-    innerArrayType c{6, 7, 8};
-    outerArrayType row1{{a}, {b}};
-    outerArrayType row2{{a}, {c}};
-    outerArrayType row3{{{}}};
-    outerArrayType row4{{{std::nullopt}}};
-    auto arrayVector = makeNullableNestedArrayVector<int64_t>(
-        {{row1}, {row2}, {row3}, {row4}, std::nullopt});
+    auto arrayVector = makeNestedArrayVectorFromJson<int64_t>(
+        {"[[1, null, 2, 3], [4, 5]]",
+         "[[1, null, 2, 3], [6, 7, 8]]",
+         "[[]]",
+         "[[null]]",
+         "[null]"});
     assertEqualVectors(
         makeFlatVector<int64_t>(
             {-6041664978295882827, -1052942565807509112, 42, 42, 42}),

From e2f4423455308c71b8c56887ac3dd35578d665da Mon Sep 17 00:00:00 2001
From: "Ma, Rong" <rong.ma@intel.com>
Date: Thu, 9 May 2024 16:48:20 +0800
Subject: [PATCH 4/5] add benchmark

---
 .../sparksql/benchmarks/CMakeLists.txt        |  4 ++
 .../sparksql/benchmarks/CompareBenchmark.cpp  |  1 +
 .../sparksql/benchmarks/HashBenchmark.cpp     | 53 +++++++++++++++++++
 3 files changed, 58 insertions(+)
 create mode 100644 velox/functions/sparksql/benchmarks/HashBenchmark.cpp

diff --git a/velox/functions/sparksql/benchmarks/CMakeLists.txt b/velox/functions/sparksql/benchmarks/CMakeLists.txt
index 5ccf7e88745a..837c983b7df8 100644
--- a/velox/functions/sparksql/benchmarks/CMakeLists.txt
+++ b/velox/functions/sparksql/benchmarks/CMakeLists.txt
@@ -27,3 +27,7 @@ target_link_libraries(
 add_executable(velox_sparksql_benchmarks_compare CompareBenchmark.cpp)
 target_link_libraries(velox_sparksql_benchmarks_compare velox_functions_spark
                       velox_benchmark_builder velox_vector_test_lib)
+
+add_executable(velox_sparksql_benchmarks_hash HashBenchmark.cpp)
+target_link_libraries(velox_sparksql_benchmarks_hash velox_functions_spark
+                      velox_benchmark_builder velox_vector_test_lib)
diff --git a/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp b/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp
index 19b6f1262512..e4d3c9f3f608 100644
--- a/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp
+++ b/velox/functions/sparksql/benchmarks/CompareBenchmark.cpp
@@ -26,6 +26,7 @@ using namespace facebook::velox;
 
 int main(int argc, char** argv) {
   folly::Init init(&argc, &argv);
+  memory::MemoryManager::initialize({});
   functions::sparksql::registerFunctions("");
 
   ExpressionBenchmarkBuilder benchmarkBuilder;
diff --git a/velox/functions/sparksql/benchmarks/HashBenchmark.cpp b/velox/functions/sparksql/benchmarks/HashBenchmark.cpp
new file mode 100644
index 000000000000..97cf592e3b81
--- /dev/null
+++ b/velox/functions/sparksql/benchmarks/HashBenchmark.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include "velox/benchmarks/ExpressionBenchmarkBuilder.h"
+#include "velox/functions/sparksql/Register.h"
+
+using namespace facebook;
+
+using namespace facebook::velox;
+
+int main(int argc, char** argv) {
+  folly::Init init(&argc, &argv);
+  memory::MemoryManager::initialize({});
+  functions::sparksql::registerFunctions("");
+
+  ExpressionBenchmarkBuilder benchmarkBuilder;
+  // Nested complex input types to benchmark; the list is never modified.
+  const std::vector<TypePtr> inputTypes = {
+      ARRAY(MAP(INTEGER(), VARCHAR())),
+      ROW({"f_map", "f_array"}, {MAP(INTEGER(), VARCHAR()), ARRAY(INTEGER())}),
+  };
+  // One benchmark set per type, running hash and xxhash64 over column c0.
+  for (const auto& inputType : inputTypes) {
+    benchmarkBuilder
+        .addBenchmarkSet(
+            fmt::format("hash_{}", inputType->toString()),
+            ROW({"c0"}, {inputType}))
+        .withFuzzerOptions({.vectorSize = 1000, .nullRatio = 0.1})
+        .addExpression("hash", "hash(c0)")
+        .addExpression("xxhash64", "xxhash64(c0)")
+        .withIterations(100);
+  }
+
+  benchmarkBuilder.registerBenchmarks();
+  folly::runBenchmarks();
+  return 0;
+}

From 6a270b3c8ffc088b48f6110008215f822e8af5a9 Mon Sep 17 00:00:00 2001
From: "Ma, Rong" <rong.ma@intel.com>
Date: Sat, 11 May 2024 10:39:26 +0800
Subject: [PATCH 5/5] revert switch

---
 velox/functions/sparksql/Hash.cpp | 32 +++++--------------------------
 1 file changed, 5 insertions(+), 27 deletions(-)

diff --git a/velox/functions/sparksql/Hash.cpp b/velox/functions/sparksql/Hash.cpp
index a7ec95e2aa96..d20cf5d4c8b5 100644
--- a/velox/functions/sparksql/Hash.cpp
+++ b/velox/functions/sparksql/Hash.cpp
@@ -109,32 +109,10 @@ class SparkVectorHasher {
   }
 
   // Compute the hash value of input vector at index for non-null values.
-  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) {
-    switch (decoded_.base()->typeKind()) {
-      case TypeKind::ARRAY:
-        return static_cast<ArrayVectorHasher<HashClass>*>(this)->hashValueAt(
-            index, seed);
-      case TypeKind::MAP:
-        return static_cast<MapVectorHasher<HashClass>*>(this)->hashValueAt(
-            index, seed);
-      case TypeKind::ROW:
-        return static_cast<RowVectorHasher<HashClass>*>(this)->hashValueAt(
-            index, seed);
-      default:
-        return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(
-            hashPrimitive, decoded_.base()->typeKind(), index, seed);
-    }
-  }
+  virtual ReturnType hashNotNullAt(vector_size_t index, SeedType seed) = 0;
 
  protected:
   const DecodedVector& decoded_;
-
- private:
-  template <TypeKind kind>
-  ReturnType hashPrimitive(vector_size_t index, SeedType seed) {
-    return static_cast<PrimitiveVectorHasher<HashClass, kind>*>(this)
-        ->hashValueAt(index, seed);
-  }
 };
 
 template <typename HashClass, TypeKind kind>
@@ -171,7 +149,7 @@ class PrimitiveVectorHasher : public SparkVectorHasher<HashClass> {
   PrimitiveVectorHasher(DecodedVector& decoded)
       : SparkVectorHasher<HashClass>(decoded) {}
 
-  ReturnType hashValueAt(vector_size_t index, SeedType seed) {
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
     return hashOne<HashClass>(
         this->decoded_.template valueAt<typename TypeTraits<kind>::NativeType>(
             index),
@@ -195,7 +173,7 @@ class ArrayVectorHasher : public SparkVectorHasher<HashClass> {
     elementHasher_ = createVectorHasher<HashClass>(decodedElements_);
   }
 
-  ReturnType hashValueAt(vector_size_t index, SeedType seed) {
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
     auto size = base_->sizeAt(indices_[index]);
     auto offset = base_->offsetAt(indices_[index]);
 
@@ -231,7 +209,7 @@ class MapVectorHasher : public SparkVectorHasher<HashClass> {
     valueHasher_ = createVectorHasher<HashClass>(decodedValues_);
   }
 
-  ReturnType hashValueAt(vector_size_t index, SeedType seed) {
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
     auto size = base_->sizeAt(indices_[index]);
     auto offset = base_->offsetAt(indices_[index]);
 
@@ -272,7 +250,7 @@ class RowVectorHasher : public SparkVectorHasher<HashClass> {
     }
   }
 
-  ReturnType hashValueAt(vector_size_t index, SeedType seed) {
+  ReturnType hashNotNullAt(vector_size_t index, SeedType seed) override {
     ReturnType result = seed;
     for (auto i = 0; i < base_->childrenSize(); ++i) {
       result = hashers_[i]->hashAt(indices_[index], result);