Skip to content

Commit

Permalink
Use simdjson for json_parse (#7658)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #7658

In `json_parse`, when the input is invalid we throw an exception, which is slow (both creating and throwing it).  To avoid creating or throwing an exception for each invalid input, we switch the implementation to `simdjson` and set a pre-canned exception when the input is invalid.  This reduces the CPU time of some queries (with a JSON validity check in the filter) by more than 20 times (from 2.34 days to 2.68 hours).

Reviewed By: mbasmanova

Differential Revision: D51469435

fbshipit-source-id: de126a2b35a51c210a6d2ddadc31f6fb5df22d75
  • Loading branch information
Yuhta authored and facebook-github-bot committed Nov 23, 2023
1 parent 2b97fed commit 0460044
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 12 deletions.
40 changes: 32 additions & 8 deletions velox/functions/prestosql/JsonFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "velox/expression/VectorFunction.h"
#include "velox/functions/prestosql/json/SIMDJsonWrapper.h"
#include "velox/functions/prestosql/types/JsonType.h"

namespace facebook::velox::functions {
Expand Down Expand Up @@ -70,19 +71,27 @@ class JsonParseFunction : public exec::VectorFunction {
const TypePtr& /* outputType */,
exec::EvalCtx& context,
VectorPtr& result) const override {
// Build the table of pre-canned exceptions exactly once (guarded by
// initializeErrors_), so that invalid inputs can later be reported by looking
// up an existing exception_ptr instead of creating and throwing a new one.
folly::call_once(initializeErrors_, [this] {
// Initialize errors here so that we get the proper exception context.
// The loop starts at 1, so errors_[0] is left default-constructed
// (presumably code 0 is simdjson::SUCCESS, which never needs an exception --
// TODO confirm against the simdjson error_code enum).
for (int i = 1; i < simdjson::NUM_ERROR_CODES; ++i) {
// Wrap the raw error code in a simdjson_error and convert it to a Velox
// exception; the result is stored at the index equal to the error code.
simdjson::simdjson_error e(static_cast<simdjson::error_code>(i));
errors_[i] = toVeloxException(std::make_exception_ptr(e));
}
});

VectorPtr localResult;

// Input can be constant or flat.
// TODO(arpitporwal2293) Replace folly::parseJson with a lightweight
// validation of JSON syntax that doesn't allocate memory or copy data.
assert(args.size() > 0);
const auto& arg = args[0];
static_assert(simdjson::SIMDJSON_PADDING <= AlignedBuffer::kPaddedSize);
if (arg->isConstantEncoding()) {
auto value = arg->as<ConstantVector<StringView>>()->valueAt(0);
try {
folly::parseJson(value);
} catch (const std::exception& e) {
context.setErrors(rows, std::current_exception());
auto parsed = parser_.parse(value.data(), value.size(), false);
if (parsed.error() != simdjson::SUCCESS) {
context.setErrors(rows, errors_[parsed.error()]);
return;
}
localResult = std::make_shared<ConstantVector<StringView>>(
Expand All @@ -93,8 +102,13 @@ class JsonParseFunction : public exec::VectorFunction {
auto stringBuffers = flatInput->stringBuffers();
VELOX_CHECK_LE(rows.end(), flatInput->size());

context.applyToSelectedNoThrow(
rows, [&](auto row) { folly::parseJson(flatInput->valueAt(row)); });
// Validate every selected row of the flat input. A row that fails to parse is
// marked with the pre-built exception for its simdjson error code; nothing is
// thrown from inside this loop.
rows.applyToSelected([&](auto row) {
auto value = flatInput->valueAt(row);
// NOTE(review): the trailing `false` is presumably simdjson's
// realloc_if_needed flag, i.e. the parser reads the input in place and relies
// on it being padded (see the static_assert on SIMDJSON_PADDING) -- confirm.
auto parsed = parser_.parse(value.data(), value.size(), false);
if (parsed.error() != simdjson::SUCCESS) {
// Reuse the exception created during one-time initialization, indexed by
// the simdjson error code.
context.setVeloxExceptionError(row, errors_[parsed.error()]);
}
});
localResult = std::make_shared<FlatVector<StringView>>(
context.pool(),
JSON(),
Expand All @@ -114,6 +128,11 @@ class JsonParseFunction : public exec::VectorFunction {
.argumentType("varchar")
.build()};
}

private:
// All members are mutable because they are modified from the const-qualified
// method that performs the parsing.

// Guards the one-time population of errors_ via folly::call_once.
mutable folly::once_flag initializeErrors_;
// Pre-built exceptions indexed by simdjson error code. Slot 0 is never
// assigned by the initialization loop, which starts at 1.
mutable std::exception_ptr errors_[simdjson::NUM_ERROR_CODES];
// Parser instance reused for every input value validated by this function
// object.
mutable simdjson::dom::parser parser_;
};

} // namespace
Expand All @@ -123,8 +142,13 @@ VELOX_DECLARE_VECTOR_FUNCTION(
JsonFormatFunction::signatures(),
std::make_unique<JsonFormatFunction>());

VELOX_DECLARE_VECTOR_FUNCTION(
VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION(
udf_json_parse,
JsonParseFunction::signatures(),
std::make_unique<JsonParseFunction>());
[](const std::string& /*name*/,
const std::vector<exec::VectorFunctionArg>&,
const velox::core::QueryConfig&) {
return std::make_shared<JsonParseFunction>();
});

} // namespace facebook::velox::functions
16 changes: 12 additions & 4 deletions velox/functions/prestosql/tests/JsonFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,11 @@ TEST_F(JsonFunctionsTest, jsonParse) {
EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})");
EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])");

VELOX_ASSERT_THROW(jsonParse(R"({"k1":})"), "expected json value");
VELOX_ASSERT_THROW(
jsonParse(R"({:"k1"})"), "json parse error on line 0 near `:\"k1\"}");
VELOX_ASSERT_THROW(jsonParse(R"(not_json)"), "expected json value");
jsonParse(R"({"k1":})"), "The JSON document has an improper structure");
VELOX_ASSERT_THROW(
jsonParse(R"({:"k1"})"), "The JSON document has an improper structure");
VELOX_ASSERT_THROW(jsonParse(R"(not_json)"), "Problem while parsing an atom");

EXPECT_EQ(jsonParseWithTry(R"(not_json)"), std::nullopt);
EXPECT_EQ(jsonParseWithTry(R"({"k1":})"), std::nullopt);
Expand All @@ -223,7 +224,7 @@ TEST_F(JsonFunctionsTest, jsonParse) {

VELOX_ASSERT_THROW(
evaluate("json_parse(c0)", data),
"json parse error on line 0 near `:': parsing didn't consume all input");
"missing or superfluous commas, braces, missing keys, etc.");

data = makeRowVector({makeFlatVector<StringView>(
{R"("This is a long sentence")", R"("This is some other sentence")"})});
Expand Down Expand Up @@ -251,6 +252,13 @@ TEST_F(JsonFunctionsTest, jsonParse) {
{R"("This is a long sentence")", R"("This is some other sentence")"},
JSON());
velox::test::assertEqualVectors(expected, result);

try {
jsonParse(R"({"k1":})");
FAIL() << "Error expected";
} catch (const VeloxUserError& e) {
ASSERT_EQ(e.context(), "json_parse(c0)");
}
}

TEST_F(JsonFunctionsTest, isJsonScalarSignatures) {
Expand Down

0 comments on commit 0460044

Please sign in to comment.