Skip to content

Commit

Permalink
Use simdjson for json_parse (#7658)
Browse files Browse the repository at this point in the history
Summary:

When the input to `json_parse` is invalid, we throw an exception, which is slow (both creating it and throwing it). To avoid creating or throwing the exception, we switch the implementation to `simdjson` and set a pre-canned exception when the input is invalid. This reduces the CPU time of some queries (those with a JSON validity check in the filter) by more than 20 times (from 2.34 days to 2.68 hours).

Differential Revision: D51469435
  • Loading branch information
Yuhta authored and facebook-github-bot committed Nov 20, 2023
1 parent bca2c2c commit 14aa377
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 12 deletions.
49 changes: 41 additions & 8 deletions velox/functions/prestosql/JsonFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "velox/expression/VectorFunction.h"
#include "velox/functions/prestosql/json/SIMDJsonWrapper.h"
#include "velox/functions/prestosql/types/JsonType.h"

namespace facebook::velox::functions {
Expand Down Expand Up @@ -62,6 +63,25 @@ class JsonFormatFunction : public exec::VectorFunction {
}
};

struct SIMDJsonVeloxErrors {
SIMDJsonVeloxErrors() {
for (int i = 1; i < simdjson::NUM_ERROR_CODES; ++i) {
simdjson::simdjson_error e(static_cast<simdjson::error_code>(i));
errors_[i] = toVeloxException(std::make_exception_ptr(e));
}
}

const std::exception_ptr& operator[](size_t i) const {
return errors_[i];
}

private:
std::exception_ptr errors_[simdjson::NUM_ERROR_CODES];
};

// Shared table of pre-built exceptions, one per simdjson error code. It is
// constructed once so that reporting invalid JSON does not have to create a
// new exception for every failing input.
const SIMDJsonVeloxErrors simdJsonErrors;

class JsonParseFunction : public exec::VectorFunction {
public:
void apply(
Expand All @@ -77,12 +97,12 @@ class JsonParseFunction : public exec::VectorFunction {
// validation of JSON syntax that doesn't allocate memory or copy data.
assert(args.size() > 0);
const auto& arg = args[0];
static_assert(simdjson::SIMDJSON_PADDING <= AlignedBuffer::kPaddedSize);
if (arg->isConstantEncoding()) {
auto value = arg->as<ConstantVector<StringView>>()->valueAt(0);
try {
folly::parseJson(value);
} catch (const std::exception& e) {
context.setErrors(rows, std::current_exception());
auto parsed = parser_.parse(value.data(), value.size(), false);
if (parsed.error() != simdjson::SUCCESS) {
context.setErrors(rows, simdJsonErrors[parsed.error()]);
return;
}
localResult = std::make_shared<ConstantVector<StringView>>(
Expand All @@ -93,8 +113,13 @@ class JsonParseFunction : public exec::VectorFunction {
auto stringBuffers = flatInput->stringBuffers();
VELOX_CHECK_LE(rows.end(), flatInput->size());

context.applyToSelectedNoThrow(
rows, [&](auto row) { folly::parseJson(flatInput->valueAt(row)); });
rows.applyToSelected([&](auto row) {
auto value = flatInput->valueAt(row);
auto parsed = parser_.parse(value.data(), value.size(), false);
if (parsed.error() != simdjson::SUCCESS) {
context.setVeloxExceptionError(row, simdJsonErrors[parsed.error()]);
}
});
localResult = std::make_shared<FlatVector<StringView>>(
context.pool(),
JSON(),
Expand All @@ -114,6 +139,9 @@ class JsonParseFunction : public exec::VectorFunction {
.argumentType("varchar")
.build()};
}

private:
mutable simdjson::dom::parser parser_;
};

} // namespace
Expand All @@ -123,8 +151,13 @@ VELOX_DECLARE_VECTOR_FUNCTION(
JsonFormatFunction::signatures(),
std::make_unique<JsonFormatFunction>());

VELOX_DECLARE_VECTOR_FUNCTION(
VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION(
udf_json_parse,
JsonParseFunction::signatures(),
std::make_unique<JsonParseFunction>());
[](const std::string& /*name*/,
const std::vector<exec::VectorFunctionArg>&,
const velox::core::QueryConfig&) {
return std::make_shared<JsonParseFunction>();
});

} // namespace facebook::velox::functions
9 changes: 5 additions & 4 deletions velox/functions/prestosql/tests/JsonFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,10 +197,11 @@ TEST_F(JsonFunctionsTest, jsonParse) {
EXPECT_EQ(jsonParse(R"({"k1":"v1"})"), R"({"k1":"v1"})");
EXPECT_EQ(jsonParse(R"(["k1", "v1"])"), R"(["k1", "v1"])");

VELOX_ASSERT_THROW(jsonParse(R"({"k1":})"), "expected json value");
VELOX_ASSERT_THROW(
jsonParse(R"({:"k1"})"), "json parse error on line 0 near `:\"k1\"}");
VELOX_ASSERT_THROW(jsonParse(R"(not_json)"), "expected json value");
jsonParse(R"({"k1":})"), "The JSON document has an improper structure");
VELOX_ASSERT_THROW(
jsonParse(R"({:"k1"})"), "The JSON document has an improper structure");
VELOX_ASSERT_THROW(jsonParse(R"(not_json)"), "Problem while parsing an atom");

EXPECT_EQ(jsonParseWithTry(R"(not_json)"), std::nullopt);
EXPECT_EQ(jsonParseWithTry(R"({"k1":})"), std::nullopt);
Expand All @@ -223,7 +224,7 @@ TEST_F(JsonFunctionsTest, jsonParse) {

VELOX_ASSERT_THROW(
evaluate("json_parse(c0)", data),
"json parse error on line 0 near `:': parsing didn't consume all input");
"missing or superfluous commas, braces, missing keys, etc.");

data = makeRowVector({makeFlatVector<StringView>(
{R"("This is a long sentence")", R"("This is some other sentence")"})});
Expand Down

0 comments on commit 14aa377

Please sign in to comment.