Extend assertEqualResults to compare floating-point columns in comple…

…x-type columns with epsilon (#10927) Summary: Pull Request resolved: #10927 This diff addresses the limitation in #4481. One limitation is still left that floating-point in map keys are currently not compared with epsilon. This is because fuzzers can generate floating-point keys that are very close to each other (with in epsilon), making one key incorrectly match another during comparison. Reviewed By: xiaoxmeng Differential Revision: D62222471 fbshipit-source-id: 4bba6ff01c61dc6f9c1a719cc3543b37b1ab84e8
facebookincubator · Sep 14, 2024 · 079ed31 · 079ed31
1 parent 167dcf2
commit 079ed31
Show file tree

Hide file tree

Showing 3 changed files with 170 additions and 24 deletions.
diff --git a/velox/exec/tests/QueryAssertionsTest.cpp b/velox/exec/tests/QueryAssertionsTest.cpp
@@ -359,6 +359,81 @@ TEST_F(QueryAssertionsTest, multiFloatColumnWithNonUniqueKeys) {
       "971 extra rows, 971 missing rows");
 }
 
+TEST_F(QueryAssertionsTest, complexTypesContainingFloat) {
+  auto size = 1000;
+  // Test array.
+  auto expected = makeRowVector(
+      {makeArrayVector<float>(
+           size,
+           [](auto /*row*/) { return 2; },
+           [](auto row) { return row % 2 + 0.01; },
+           [](auto row) { return row % 7 == 0; }),
+       makeFlatVector<int64_t>(size, [](auto row) { return row; })});
+  auto actual = makeRowVector(
+      {makeArrayVector<float>(
+           size,
+           [&](auto /*row*/) { return 2; },
+           [&](auto row) {
+             auto value = row % 2 + 0.01;
+             return value + value * FLT_EPSILON;
+           },
+           [&](auto row) { return (size - row - 1) % 7 == 0; }),
+       makeFlatVector<int64_t>(
+           size, [&](auto row) { return size - row - 1; })});
+  EXPECT_TRUE(assertEqualResults({expected}, {actual}));
+
+  // Test map and row.
+  expected = makeRowVector(
+      {makeMapVector<int64_t, float>(
+           size,
+           [](auto /*row*/) { return 2; },
+           [](auto row) { return row; },
+           [](auto row) { return row % 5 + 0.01; },
+           [](auto /*row*/) { return false; },
+           [](auto row) { return row % 6 == 0; }),
+       makeRowVector(
+           {makeFlatVector<float>(
+                size, [](auto row) { return row % 5 + 0.01; }),
+            makeFlatVector<int64_t>(size, [](auto row) { return row % 5; }),
+            makeArrayVector<float>(
+                size,
+                [](auto /*row*/) { return 2; },
+                [](auto row) { return row % 2 + 0.01; },
+                [](auto row) { return row % 7 == 0; })}),
+       makeFlatVector<int64_t>(size, [](auto row) { return row; })});
+  actual = makeRowVector(
+      {makeMapVector<int64_t, float>(
+           size,
+           [](auto /*row*/) { return 2; },
+           [&](auto row) { return (size * 2 - row - 1); },
+           [&](auto row) {
+             auto value = (size * 2 - row - 1) % 5 + 0.01;
+             return value + value * FLT_EPSILON;
+           },
+           [&](auto /*row*/) { return false; },
+           [&](auto row) { return (size * 2 - row - 1) % 6 == 0; }),
+       makeRowVector(
+           {makeFlatVector<float>(
+                size,
+                [&](auto row) {
+                  auto value = (size - row - 1) % 5 + 0.01;
+                  return value + value * FLT_EPSILON;
+                }),
+            makeFlatVector<int64_t>(
+                size, [&](auto row) { return (size - row - 1) % 5; }),
+            makeArrayVector<float>(
+                size,
+                [](auto /*row*/) { return 2; },
+                [&](auto row) {
+                  auto value = row % 2 + 0.01;
+                  return value + value * FLT_EPSILON;
+                },
+                [&](auto row) { return (size - row - 1) % 7 == 0; })}),
+       makeFlatVector<int64_t>(
+           size, [&](auto row) { return (size - row - 1); })});
+  EXPECT_TRUE(assertEqualResults({expected}, {actual}));
+}
+
 TEST_F(QueryAssertionsTest, nullDecimalValue) {
   auto shortDecimal = makeRowVector(
       {makeNullableFlatVector<int64_t>({std::nullopt}, DECIMAL(5, 2))});

diff --git a/velox/exec/tests/utils/QueryAssertions.cpp b/velox/exec/tests/utils/QueryAssertions.cpp
@@ -1068,26 +1068,44 @@ void assertEqualTypeAndNumRows(
   EXPECT_EQ(expectedNumRows, actualNumRows);
 }
 
+bool containsFloatingPoint(const TypePtr& type) {
+  if (type->isPrimitiveType()) {
+    return type->isReal() || type->isDouble();
+  } else if (type->isArray()) {
+    return containsFloatingPoint(type->as<TypeKind::ARRAY>().elementType());
+  } else if (type->isMap()) {
+    // We currently don't support comparing maps with floating-point keys with
+    // epsilon. This is because fuzzer can generate floating-point keys that are
+    // very close, causing one key incorrectly match another during the
+    // comparison.
+    return containsFloatingPoint(type->as<TypeKind::MAP>().valueType());
+  } else if (type->isRow()) {
+    for (auto& child : type->as<TypeKind::ROW>().children()) {
+      if (containsFloatingPoint(child)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 /// Returns the number of floating-point columns and a list of columns indices
 /// with floating-point columns placed at the end.
 std::tuple<uint32_t, std::vector<velox::column_index_t>>
-findFloatingPointColumns(const MaterializedRow& row) {
-  auto isFloatingPointColumn = [&](size_t i) {
-    return row[i].kind() == TypeKind::REAL || row[i].kind() == TypeKind::DOUBLE;
-  };
-
+findFloatingPointColumns(const TypePtr& type) {
+  const auto rowType = asRowType(type);
   uint32_t numFloatingPointColumns = 0;
   std::vector<velox::column_index_t> indices;
-  for (auto i = 0; i < row.size(); ++i) {
-    if (isFloatingPointColumn(i)) {
+  for (auto i = 0; i < rowType->children().size(); ++i) {
+    if (containsFloatingPoint(rowType->childAt(i))) {
       ++numFloatingPointColumns;
     } else {
       indices.push_back(i);
     }
   }
 
-  for (auto i = 0; i < row.size(); ++i) {
-    if (isFloatingPointColumn(i)) {
+  for (auto i = 0; i < rowType->children().size(); ++i) {
+    if (containsFloatingPoint(rowType->childAt(i))) {
       indices.push_back(i);
     }
   }
@@ -1107,8 +1125,8 @@ bool assertEqualResults(
     const MaterializedRowMultiset& actualRows,
     const TypePtr& actualType,
     const std::string& message) {
+  const auto& type = (!expectedRows.empty()) ? expectedType : actualType;
   if (expectedRows.empty() != actualRows.empty()) {
-    const auto& type = (!expectedRows.empty()) ? expectedType : actualType;
     ADD_FAILURE() << generateUserFriendlyDiff(expectedRows, actualRows, type)
                   << message;
     return false;
@@ -1125,8 +1143,7 @@ bool assertEqualResults(
     return false;
   }
 
-  auto [numFloatingPointColumns, columns] =
-      findFloatingPointColumns(*expectedRows.begin());
+  auto [numFloatingPointColumns, columns] = findFloatingPointColumns(type);
   if (numFloatingPointColumns) {
     MaterializedRowEpsilonComparator comparator{
         numFloatingPointColumns, columns};
@@ -1214,10 +1231,11 @@ using OrderedPartition = std::pair<MaterializedRow, MaterializedRowMultiset>;
 
 // Special function to compare ordered partitions in a way that
 // we compare all floating point values inside using 'epsilon' constant.
-// Returns true if equal.
+// Returns true if equal. valueType is the type of expected.second.
 static bool compareOrderedPartitions(
     const OrderedPartition& expected,
-    const OrderedPartition& actual) {
+    const OrderedPartition& actual,
+    const RowTypePtr& valueType) {
   if (expected.first.size() != actual.first.size() or
       expected.second.size() != actual.second.size()) {
     return false;
@@ -1238,8 +1256,9 @@ static bool compareOrderedPartitions(
     return false;
   }
 
-  auto [numFloatingPointColumns, columns] =
-      findFloatingPointColumns(*expected.second.begin());
+  // valueType is needed by findFloatingPointColumns() to avoid having to infer
+  // the type from expected.second.
+  auto [numFloatingPointColumns, columns] = findFloatingPointColumns(valueType);
   if (numFloatingPointColumns) {
     MaterializedRowEpsilonComparator comparator{
         numFloatingPointColumns, columns};
@@ -1255,16 +1274,17 @@ static bool compareOrderedPartitions(
 
 // Special function to compare vectors of ordered partitions in a way that
 // we compare all floating point values inside using 'epsilon' constant.
-// Returns true if equal.
+// Returns true if equal. valueType is the type of expected[i].second.
 static bool compareOrderedPartitionsVectors(
     const std::vector<OrderedPartition>& expected,
-    const std::vector<OrderedPartition>& actual) {
+    const std::vector<OrderedPartition>& actual,
+    const RowTypePtr& valueType) {
   if (expected.size() != actual.size()) {
     return false;
   }
 
   for (size_t i = 0; i < expected.size(); ++i) {
-    if (not compareOrderedPartitions(expected[i], actual[i])) {
+    if (not compareOrderedPartitions(expected[i], actual[i], valueType)) {
       return false;
     }
   }
@@ -1302,12 +1322,13 @@ void assertResultsOrdered(
   }
 
   if (not compareOrderedPartitionsVectors(
-          expectedPartitions, actualPartitions)) {
+          expectedPartitions, actualPartitions, resultType)) {
     auto actualPartIter = actualPartitions.begin();
     auto expectedPartIter = expectedPartitions.begin();
     while (expectedPartIter != expectedPartitions.end() &&
            actualPartIter != actualPartitions.end()) {
-      if (not compareOrderedPartitions(*expectedPartIter, *actualPartIter)) {
+      if (not compareOrderedPartitions(
+              *expectedPartIter, *actualPartIter, resultType)) {
         break;
       }
       ++expectedPartIter;

diff --git a/velox/type/Variant.cpp b/velox/type/Variant.cpp
@@ -858,20 +858,70 @@ bool variant::lessThanWithEpsilon(const variant& other) const {
   return VELOX_DYNAMIC_TYPE_DISPATCH_ALL(lessThan, kind_, *this, other);
 }
 
+namespace {
+
+// Compare variants of Array or Row type.
+template <TypeKind KIND>
+bool compareComplexTypeWithEpsilon(const variant& left, const variant& right) {
+  auto& leftContainer = left.value<KIND>();
+  auto& rightContainer = right.value<KIND>();
+  if (leftContainer.size() != rightContainer.size()) {
+    return false;
+  }
+  for (int32_t i = 0; i < leftContainer.size(); i++) {
+    if (!leftContainer[i].equalsWithEpsilon(rightContainer[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Compare variants of Map type.
+template <>
+bool compareComplexTypeWithEpsilon<TypeKind::MAP>(
+    const variant& left,
+    const variant& right) {
+  auto& leftMap = left.value<TypeKind::MAP>();
+  auto& rightMap = right.value<TypeKind::MAP>();
+  if (leftMap.size() != rightMap.size()) {
+    return false;
+  }
+  for (auto it = leftMap.begin(); it != leftMap.end(); ++it) {
+    auto otherIt = rightMap.find(it->first);
+    if (otherIt == rightMap.end()) {
+      return false;
+    }
+    if (!it->second.equalsWithEpsilon(otherIt->second)) {
+      return false;
+    }
+  }
+  return true;
+}
+} // namespace
+
 // Uses kEpsilon to compare floating point types (REAL and DOUBLE).
 // For testing purposes.
 bool variant::equalsWithEpsilon(const variant& other) const {
   if (other.kind_ != this->kind_) {
     return false;
   }
-  if (other.isNull()) {
-    return this->isNull();
+  if (other.isNull() || this->isNull()) {
+    return other.isNull() && this->isNull();
   }
   if ((kind_ == TypeKind::REAL) or (kind_ == TypeKind::DOUBLE)) {
     return equalsFloatingPointWithEpsilon(*this, other);
   }
 
-  return VELOX_DYNAMIC_TYPE_DISPATCH_ALL(equals, kind_, *this, other);
+  switch (kind_) {
+    case TypeKind::ARRAY:
+      return compareComplexTypeWithEpsilon<TypeKind::ARRAY>(*this, other);
+    case TypeKind::MAP:
+      return compareComplexTypeWithEpsilon<TypeKind::MAP>(*this, other);
+    case TypeKind::ROW:
+      return compareComplexTypeWithEpsilon<TypeKind::ROW>(*this, other);
+    default:
+      return VELOX_DYNAMIC_TYPE_DISPATCH_ALL(equals, kind_, *this, other);
+  }
 }
 
 void variant::verifyArrayElements(const std::vector<variant>& inputs) {