From 2d2425686b92d13999c05248fd9c53b1544bdf8b Mon Sep 17 00:00:00 2001 From: HolyLow Date: Tue, 28 Jan 2025 13:36:27 -0800 Subject: [PATCH] fix: regexp_extract_all to not return a match in mismatched group (#12143) Summary: Fix regexp_extract_all to avoid returning a match in mismatched group. For example, ``` regexp_extract_all("rat cat\nbat dog", "ra(.)|blah(.)(.)", 2) ``` is expected to return {NULL} because the first group `ra(.)` has a match while the second group `blah(.)` has no match. The current buggy implementation returns {""}. This test case and the expected result is from presto's existing UT, see https://github.com/prestodb/presto/blob/48f2fb1cd0244e8ae1230d27445fcd15d6520597/presto-main/src/test/java/com/facebook/presto/operator/scalar/AbstractTestRegexpFunctions.java#L214. Fixes https://github.com/facebookincubator/velox/issues/12119. Pull Request resolved: https://github.com/facebookincubator/velox/pull/12143 Reviewed By: kagamiori Differential Revision: D68746581 Pulled By: xiaoxmeng fbshipit-source-id: 10e6c919fde1c4419a404592b7e7c6941fe983a1 --- velox/functions/lib/Re2Functions.cpp | 9 +++++++-- velox/functions/lib/tests/Re2FunctionsTest.cpp | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp index 78ce3e2f29b5..ace6d0a6fa3f 100644 --- a/velox/functions/lib/Re2Functions.cpp +++ b/velox/functions/lib/Re2Functions.cpp @@ -1070,8 +1070,13 @@ void re2ExtractAll( const re2::StringPiece fullMatch = groups[0]; const re2::StringPiece subMatch = groups[groupId]; - arrayWriter.add_item().setNoCopy( - StringView(subMatch.data(), subMatch.size())); + // Check if the subMatch is null. + if (subMatch.data()) { + arrayWriter.add_item().setNoCopy( + StringView(subMatch.data(), subMatch.size())); + } else { + arrayWriter.add_null(); + } pos = fullMatch.data() + fullMatch.size() - input.data(); if (UNLIKELY(fullMatch.size() == 0)) { ++pos; diff --git a/velox/functions/lib/tests/Re2FunctionsTest.cpp b/velox/functions/lib/tests/Re2FunctionsTest.cpp index f27d9cb46453..a7d349243db9 100644 --- a/velox/functions/lib/tests/Re2FunctionsTest.cpp +++ b/velox/functions/lib/tests/Re2FunctionsTest.cpp @@ -1103,6 +1103,20 @@ TEST_F(Re2FunctionsTest, regexExtractAllConstantPatternConstantGroupId) { testRe2ExtractAll(inputs, constantPattern, bigGroupIds, expectedOutputs); } +TEST_F(Re2FunctionsTest, regexExtractAllMismatchedGroup) { + auto sourceVector = + makeFlatVector({"rat cat\nbat dog"}, VARCHAR()); + auto patternVector = + makeFlatVector({"ra(.)|blah(.)(.)"}, VARCHAR()); + auto groupIdVector = makeFlatVector(std::vector{2}, INTEGER()); + auto expectedVector = makeNullableArrayVector( + std::vector>>{{std::nullopt}}); + auto result = evaluate( + "regexp_extract_all(c0, c1, c2)", + makeRowVector({sourceVector, patternVector, groupIdVector})); + assertEqualVectors(expectedVector, result); +} + TEST_F(Re2FunctionsTest, regexExtractAllConstantPatternVariableGroupId) { const std::vector> inputs = { " 123a 2b 14m ",