From 5933f3bf2671f2091eaf6ecfe08323b8468ef4b0 Mon Sep 17 00:00:00 2001 From: HolyLow Date: Tue, 11 Feb 2025 19:04:32 +0800 Subject: [PATCH] fix: regexp_split fails in empty match pattern --- velox/functions/lib/Re2Functions.h | 18 ++++++++++++------ velox/functions/lib/tests/Re2FunctionsTest.cpp | 11 +++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/velox/functions/lib/Re2Functions.h b/velox/functions/lib/Re2Functions.h index ef6712a73401..510b9ac09597 100644 --- a/velox/functions/lib/Re2Functions.h +++ b/velox/functions/lib/Re2Functions.h @@ -408,13 +408,14 @@ struct Re2RegexpSplit { const auto re2String = re2::StringPiece(string.data(), string.size()); - size_t pos = 0; + size_t currPos = 0; + size_t lastPos = 0; const char* start = string.data(); re2::StringPiece subMatches[1]; while (re->Match( re2String, - pos, + currPos, string.size(), RE2::Anchor::UNANCHORED, subMatches, @@ -423,16 +424,21 @@ struct Re2RegexpSplit { const auto offset = fullMatch.data() - start; const auto size = fullMatch.size(); - out.add_item().setNoCopy(StringView(string.data() + pos, offset - pos)); + out.add_item().setNoCopy( + StringView(string.data() + lastPos, offset - lastPos)); - pos = offset + size; + currPos = offset + size; + lastPos = currPos; + // On an empty match, advance only currPos so that the next search makes + // progress. lastPos stays at the end of this match so that the next + // emitted piece still starts at the correct position. 
if (UNLIKELY(size == 0)) { - ++pos; + ++currPos; } } out.add_item().setNoCopy( - StringView(string.data() + pos, string.size() - pos)); + StringView(string.data() + lastPos, string.size() - lastPos)); } private: diff --git a/velox/functions/lib/tests/Re2FunctionsTest.cpp b/velox/functions/lib/tests/Re2FunctionsTest.cpp index a7d349243db9..6512bc6acb56 100644 --- a/velox/functions/lib/tests/Re2FunctionsTest.cpp +++ b/velox/functions/lib/tests/Re2FunctionsTest.cpp @@ -1529,6 +1529,17 @@ TEST_F(Re2FunctionsTest, split) { {"a", "b"}, }); assertEqualVectors(expected, result); + + input = makeRowVector({ + makeFlatVector({ + "abcd", + }), + }); + result = evaluate("regexp_split(c0, '')", input); + expected = makeArrayVector({ + {"", "a", "b", "c", "d", ""}, + }); + assertEqualVectors(expected, result); } TEST_F(Re2FunctionsTest, parseSubstrings) {