Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use simdjson for json_parse #7658

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 32 additions & 8 deletions velox/functions/prestosql/JsonFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "velox/expression/VectorFunction.h"
#include "velox/functions/prestosql/json/SIMDJsonWrapper.h"
#include "velox/functions/prestosql/types/JsonType.h"

namespace facebook::velox::functions {
Expand Down Expand Up @@ -70,19 +71,27 @@ class JsonParseFunction : public exec::VectorFunction {
const TypePtr& /* outputType */,
exec::EvalCtx& context,
VectorPtr& result) const override {
folly::call_once(initializeErrors_, [this] {
// Initialize errors here so that we get the proper exception context.
for (int i = 1; i < simdjson::NUM_ERROR_CODES; ++i) {
simdjson::simdjson_error e(static_cast<simdjson::error_code>(i));
errors_[i] = toVeloxException(std::make_exception_ptr(e));
}
});

VectorPtr localResult;

// Input can be constant or flat.
// TODO(arpitporwal2293) Replace folly::parseJson with a lightweight
// validation of JSON syntax that doesn't allocate memory or copy data.
assert(args.size() > 0);
const auto& arg = args[0];
static_assert(simdjson::SIMDJSON_PADDING <= AlignedBuffer::kPaddedSize);
if (arg->isConstantEncoding()) {
auto value = arg->as<ConstantVector<StringView>>()->valueAt(0);
try {
folly::parseJson(value);
} catch (const std::exception& e) {
context.setErrors(rows, std::current_exception());
auto parsed = parser_.parse(value.data(), value.size(), false);
if (parsed.error() != simdjson::SUCCESS) {
context.setErrors(rows, errors_[parsed.error()]);
return;
}
localResult = std::make_shared<ConstantVector<StringView>>(
Expand All @@ -93,8 +102,13 @@ class JsonParseFunction : public exec::VectorFunction {
auto stringBuffers = flatInput->stringBuffers();
VELOX_CHECK_LE(rows.end(), flatInput->size());

context.applyToSelectedNoThrow(
rows, [&](auto row) { folly::parseJson(flatInput->valueAt(row)); });
rows.applyToSelected([&](auto row) {
auto value = flatInput->valueAt(row);
auto parsed = parser_.parse(value.data(), value.size(), false);
if (parsed.error() != simdjson::SUCCESS) {
context.setVeloxExceptionError(row, errors_[parsed.error()]);
}
});
localResult = std::make_shared<FlatVector<StringView>>(
context.pool(),
JSON(),
Expand All @@ -114,6 +128,11 @@ class JsonParseFunction : public exec::VectorFunction {
.argumentType("varchar")
.build()};
}

private:
mutable folly::once_flag initializeErrors_;
mutable std::exception_ptr errors_[simdjson::NUM_ERROR_CODES];
mutable simdjson::dom::parser parser_;
Copy link
Contributor

@PHILO-HE PHILO-HE Nov 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @Yuhta, I note that simdjson's ondemand parser has better performance but has some limitations, e.g., it will not validate the full input. Is the dom parser intentionally used here? Ref. link.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I tried the ondemand parser first and found that it fails to reject some malformed JSON.

};

} // namespace
Expand All @@ -123,8 +142,13 @@ VELOX_DECLARE_VECTOR_FUNCTION(
JsonFormatFunction::signatures(),
std::make_unique<JsonFormatFunction>());

VELOX_DECLARE_VECTOR_FUNCTION(
VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION(
udf_json_parse,
JsonParseFunction::signatures(),
std::make_unique<JsonParseFunction>());
[](const std::string& /*name*/,
const std::vector<exec::VectorFunctionArg>&,
const velox::core::QueryConfig&) {
return std::make_shared<JsonParseFunction>();
});

} // namespace facebook::velox::functions
16 changes: 12 additions & 4 deletions velox/functions/prestosql/tests/JsonFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,11 @@ TEST_F(JsonFunctionsTest, jsonParse) {
EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})");
EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])");

VELOX_ASSERT_THROW(jsonParse(R"({"k1":})"), "expected json value");
VELOX_ASSERT_THROW(
jsonParse(R"({:"k1"})"), "json parse error on line 0 near `:\"k1\"}");
VELOX_ASSERT_THROW(jsonParse(R"(not_json)"), "expected json value");
jsonParse(R"({"k1":})"), "The JSON document has an improper structure");
VELOX_ASSERT_THROW(
jsonParse(R"({:"k1"})"), "The JSON document has an improper structure");
VELOX_ASSERT_THROW(jsonParse(R"(not_json)"), "Problem while parsing an atom");

EXPECT_EQ(jsonParseWithTry(R"(not_json)"), std::nullopt);
EXPECT_EQ(jsonParseWithTry(R"({"k1":})"), std::nullopt);
Expand All @@ -223,7 +224,7 @@ TEST_F(JsonFunctionsTest, jsonParse) {

VELOX_ASSERT_THROW(
evaluate("json_parse(c0)", data),
"json parse error on line 0 near `:': parsing didn't consume all input");
"missing or superfluous commas, braces, missing keys, etc.");

data = makeRowVector({makeFlatVector<StringView>(
{R"("This is a long sentence")", R"("This is some other sentence")"})});
Expand Down Expand Up @@ -251,6 +252,13 @@ TEST_F(JsonFunctionsTest, jsonParse) {
{R"("This is a long sentence")", R"("This is some other sentence")"},
JSON());
velox::test::assertEqualVectors(expected, result);

try {
jsonParse(R"({"k1":})");
FAIL() << "Error expected";
} catch (const VeloxUserError& e) {
ASSERT_EQ(e.context(), "json_parse(c0)");
}
}

TEST_F(JsonFunctionsTest, isJsonScalarSignatures) {
Expand Down