From 75fc8fdcb6151d3b791de1695ad0effedf12b62f Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Wed, 22 Nov 2023 14:47:38 -0800 Subject: [PATCH] Use simdjson for json_parse (#7658) Summary: In `json_parse`, when the input is invalid, we throw an exception, which is slow (both creating and throwing it). To avoid creating or throwing the exception, we switch the implementation to `simdjson` and set a pre-canned exception when the input is invalid. This reduces the CPU time in some queries (with JSON validity check in filter) by more than 20 times (from 2.34 days to 2.68 hours). Reviewed By: mbasmanova Differential Revision: D51469435 --- velox/functions/prestosql/JsonFunctions.cpp | 40 +++++++++++++++---- .../prestosql/tests/JsonFunctionsTest.cpp | 16 ++++++-- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/velox/functions/prestosql/JsonFunctions.cpp b/velox/functions/prestosql/JsonFunctions.cpp index 625060886804..8e9cb0e9d4c4 100644 --- a/velox/functions/prestosql/JsonFunctions.cpp +++ b/velox/functions/prestosql/JsonFunctions.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "velox/expression/VectorFunction.h" +#include "velox/functions/prestosql/json/SIMDJsonWrapper.h" #include "velox/functions/prestosql/types/JsonType.h" namespace facebook::velox::functions { @@ -70,6 +71,14 @@ class JsonParseFunction : public exec::VectorFunction { const TypePtr& /* outputType */, exec::EvalCtx& context, VectorPtr& result) const override { + folly::call_once(initializeErrors_, [this] { + // Initialize errors here so that we get the proper exception context. + for (int i = 1; i < simdjson::NUM_ERROR_CODES; ++i) { + simdjson::simdjson_error e(static_cast(i)); + errors_[i] = toVeloxException(std::make_exception_ptr(e)); + } + }); + VectorPtr localResult; // Input can be constant or flat. @@ -77,12 +86,12 @@ class JsonParseFunction : public exec::VectorFunction { // validation of JSON syntax that doesn't allocate memory or copy data. 
assert(args.size() > 0); const auto& arg = args[0]; + static_assert(simdjson::SIMDJSON_PADDING <= AlignedBuffer::kPaddedSize); if (arg->isConstantEncoding()) { auto value = arg->as>()->valueAt(0); - try { - folly::parseJson(value); - } catch (const std::exception& e) { - context.setErrors(rows, std::current_exception()); + auto parsed = parser_.parse(value.data(), value.size(), false); + if (parsed.error() != simdjson::SUCCESS) { + context.setErrors(rows, errors_[parsed.error()]); return; } localResult = std::make_shared>( @@ -93,8 +102,13 @@ class JsonParseFunction : public exec::VectorFunction { auto stringBuffers = flatInput->stringBuffers(); VELOX_CHECK_LE(rows.end(), flatInput->size()); - context.applyToSelectedNoThrow( - rows, [&](auto row) { folly::parseJson(flatInput->valueAt(row)); }); + rows.applyToSelected([&](auto row) { + auto value = flatInput->valueAt(row); + auto parsed = parser_.parse(value.data(), value.size(), false); + if (parsed.error() != simdjson::SUCCESS) { + context.setVeloxExceptionError(row, errors_[parsed.error()]); + } + }); localResult = std::make_shared>( context.pool(), JSON(), @@ -114,6 +128,11 @@ class JsonParseFunction : public exec::VectorFunction { .argumentType("varchar") .build()}; } + + private: + mutable folly::once_flag initializeErrors_; + mutable std::exception_ptr errors_[simdjson::NUM_ERROR_CODES]; + mutable simdjson::dom::parser parser_; }; } // namespace @@ -123,8 +142,13 @@ VELOX_DECLARE_VECTOR_FUNCTION( JsonFormatFunction::signatures(), std::make_unique()); -VELOX_DECLARE_VECTOR_FUNCTION( +VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION( udf_json_parse, JsonParseFunction::signatures(), - std::make_unique()); + [](const std::string& /*name*/, + const std::vector&, + const velox::core::QueryConfig&) { + return std::make_shared(); + }); + } // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp index 
ce8b4e1bda01..fad75e584a01 100644 --- a/velox/functions/prestosql/tests/JsonFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/JsonFunctionsTest.cpp @@ -197,10 +197,11 @@ TEST_F(JsonFunctionsTest, jsonParse) { EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})"); EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])"); - VELOX_ASSERT_THROW(jsonParse(R"({"k1":})"), "expected json value"); VELOX_ASSERT_THROW( - jsonParse(R"({:"k1"})"), "json parse error on line 0 near `:\"k1\"}"); - VELOX_ASSERT_THROW(jsonParse(R"(not_json)"), "expected json value"); + jsonParse(R"({"k1":})"), "The JSON document has an improper structure"); + VELOX_ASSERT_THROW( + jsonParse(R"({:"k1"})"), "The JSON document has an improper structure"); + VELOX_ASSERT_THROW(jsonParse(R"(not_json)"), "Problem while parsing an atom"); EXPECT_EQ(jsonParseWithTry(R"(not_json)"), std::nullopt); EXPECT_EQ(jsonParseWithTry(R"({"k1":})"), std::nullopt); @@ -223,7 +224,7 @@ TEST_F(JsonFunctionsTest, jsonParse) { VELOX_ASSERT_THROW( evaluate("json_parse(c0)", data), - "json parse error on line 0 near `:': parsing didn't consume all input"); + "missing or superfluous commas, braces, missing keys, etc."); data = makeRowVector({makeFlatVector( {R"("This is a long sentence")", R"("This is some other sentence")"})}); @@ -251,6 +252,13 @@ TEST_F(JsonFunctionsTest, jsonParse) { {R"("This is a long sentence")", R"("This is some other sentence")"}, JSON()); velox::test::assertEqualVectors(expected, result); + + try { + jsonParse(R"({"k1":})"); + FAIL() << "Error expected"; + } catch (const VeloxUserError& e) { + ASSERT_EQ(e.context(), "json_parse(c0)"); + } } TEST_F(JsonFunctionsTest, isJsonScalarSignatures) {