Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve path parsing issues in get_json_object #15082

Merged
24 changes: 18 additions & 6 deletions cpp/src/json/json_path.cu
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,14 @@ struct path_operator {
int index{-1}; // index for subscript operator
};

/**
* @brief Enum to specify whether parsing values enclosed within brackets, like `['book']`
*/
enum class bracket_state : bool {
INSIDE, ///< Parsing inside brackets
OUTSIDE ///< Parsing outside brackets
};

/**
* @brief Parsing class that holds the current state of the JSONPath string to be parsed
* and provides functions for navigating through it. This is only called on the host
Expand All @@ -541,7 +549,7 @@ class path_state : private parser {
case '.': {
path_operator op;
string_view term{".[", 2};
if (parse_path_name(op.name, term)) {
if (parse_path_name(op.name, term, bracket_state::OUTSIDE)) {
// this is another potential use case for __SPARK_BEHAVIORS / configurability
// Spark currently only handles the wildcard operator inside [*], it does
// not handle .*
Expand All @@ -564,7 +572,7 @@ class path_state : private parser {
path_operator op;
string_view term{"]", 1};
bool const is_string = *pos == '\'';
if (parse_path_name(op.name, term)) {
if (parse_path_name(op.name, term, bracket_state::INSIDE)) {
pos++;
if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') {
op.type = path_operator_type::CHILD_WILDCARD;
Expand Down Expand Up @@ -600,7 +608,8 @@ class path_state : private parser {
private:
cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'};

bool parse_path_name(string_view& name, string_view const& terminators)
// b_state is set to INSIDE while parsing values enclosed within [ ], otherwise OUTSIDE
bool parse_path_name(string_view& name, string_view const& terminators, bracket_state b_state)
{
switch (*pos) {
case '*':
Expand All @@ -609,8 +618,11 @@ class path_state : private parser {
break;

case '\'':
if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
break;
if (b_state == bracket_state::INSIDE) {
if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
break;
}
// if not inside the [ ] -> go to default

default: {
size_t const chars_left = input_len - (pos - input);
Expand Down Expand Up @@ -656,7 +668,7 @@ std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_comma
do {
op = p_state.get_next_operator();
if (op.type == path_operator_type::ERROR) {
CUDF_FAIL("Encountered invalid JSONPath input string");
CUDF_FAIL("Encountered invalid JSONPath input string", std::invalid_argument);
}
if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
// convert pointer to device pointer
Expand Down
38 changes: 38 additions & 0 deletions cpp/tests/json/json_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,15 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery)
};
EXPECT_THROW(query(), std::invalid_argument);
}

{
auto const input = cudf::test::strings_column_wrapper{R"({"a": "b"})"};
auto const json_path = std::string{"${a}"};
auto const query = [&]() {
auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path);
};
EXPECT_THROW(query(), std::invalid_argument);
}
}

// queries that are legal, but reference invalid parts of the input
Expand Down Expand Up @@ -1018,4 +1027,33 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls)
do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]");
}

TEST_F(JsonPathTests, QueriesContainingQuotes)
{
std::string input_string = R"({"AB": 1, "A.B": 2, "'A": {"B'": 3}, "A": {"B": 4} })";

auto do_test = [&input_string](auto const& json_path_string,
auto const& expected_string,
bool const& expect_null = false) {
auto const input = cudf::test::strings_column_wrapper{input_string};
auto const json_path = std::string{json_path_string};
cudf::get_json_object_options options;
options.set_allow_single_quotes(true);
auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options);
auto const expected =
cudf::test::strings_column_wrapper{std::initializer_list<std::string>{expected_string},
std::initializer_list<bool>{!expect_null}};

CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
};

// Set 1
do_test(R"($.AB)", "1");
do_test(R"($['A.B'])", "2");
do_test(R"($.'A.B')", "3");
do_test(R"($.A.B)", "4");

// Set 2
do_test(R"($.'A)", R"({"B'": 3})");
}

CUDF_TEST_PROGRAM_MAIN()
Loading