From 224e25eb5cc35391e27146fab47bc11d126cc048 Mon Sep 17 00:00:00 2001 From: lihangyu <15605149486@163.com> Date: Mon, 29 Jul 2024 20:45:11 +0800 Subject: [PATCH] [Feature](JsonReader) support specifying `$.` as root column in json path (#38213) ``` curl --location-trusted -u root: -T value.json -H "read_json_by_line:true" -H "format:json" -H "max_filter_ratio:0.5" -H "jsonpaths: [\"$.id\", \"$.entity_id\", \"$.\"]" http://127.0.0.1:8149/api/regression_test/records/_stream_load ``` --- be/src/exprs/json_functions.cpp | 5 ++++ be/src/exprs/json_functions.h | 2 ++ .../vec/exec/format/json/new_json_reader.cpp | 14 +++++++++- .../load_p0/stream_load/test_json_load.out | 6 +++++ .../stream_load/test_read_root_path.json | 4 +++ .../load_p0/stream_load/test_json_load.groovy | 27 +++++++++++++++++++ 6 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 regression-test/data/load_p0/stream_load/test_read_root_path.json diff --git a/be/src/exprs/json_functions.cpp b/be/src/exprs/json_functions.cpp index 205ee5a5d20b92..5e3fb136929595 100644 --- a/be/src/exprs/json_functions.cpp +++ b/be/src/exprs/json_functions.cpp @@ -353,4 +353,9 @@ void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value } } +// root path "$." 
+bool JsonFunctions::is_root_path(const std::vector& json_path) { + return json_path.size() == 2 && json_path[0].key == "$" && json_path[1].key.empty(); +} + } // namespace doris diff --git a/be/src/exprs/json_functions.h b/be/src/exprs/json_functions.h index 72aa522ff374fa..11970eb8c46c56 100644 --- a/be/src/exprs/json_functions.h +++ b/be/src/exprs/json_functions.h @@ -116,6 +116,8 @@ class JsonFunctions { static std::string print_json_value(const rapidjson::Value& value); + static bool is_root_path(const std::vector& json_path); + private: static rapidjson::Value* match_value(const std::vector& parsed_paths, rapidjson::Value* document, diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index 2d105a15880425..f0ea0f4c7f0713 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -1657,7 +1657,19 @@ Status NewJsonReader::_simdjson_write_columns_by_jsonpath( return st; } } - if (i >= _parsed_jsonpaths.size() || st.is()) { + if (i < _parsed_jsonpaths.size() && JsonFunctions::is_root_path(_parsed_jsonpaths[i])) { + // Indicate that the jsonpath is "$.", read the full root json object, insert the original doc directly + ColumnNullable* nullable_column = nullptr; + IColumn* target_column_ptr = nullptr; + if (slot_desc->is_nullable()) { + nullable_column = assert_cast(column_ptr); + target_column_ptr = &nullable_column->get_nested_column(); + } + auto* column_string = assert_cast(target_column_ptr); + column_string->insert_data(_simdjson_ondemand_padding_buffer.data(), + _original_doc_size); + has_valid_value = true; + } else if (i >= _parsed_jsonpaths.size() || st.is()) { // not match in jsondata, filling with default value RETURN_IF_ERROR(_fill_missing_column(slot_desc, column_ptr, valid)); if (!(*valid)) { diff --git a/regression-test/data/load_p0/stream_load/test_json_load.out b/regression-test/data/load_p0/stream_load/test_json_load.out index 
588b6edb00463a..7df15b74b86f62 100644 --- a/regression-test/data/load_p0/stream_load/test_json_load.out +++ b/regression-test/data/load_p0/stream_load/test_json_load.out @@ -250,3 +250,9 @@ test k2_value -- !select29 -- 10 \N + +-- !select30 -- +12345 {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]} {"k1":12345,"k2":"11111","k3":111111,"k4":[11111]} 111111 +12346 {"k1":12346,"k2":"22222","k4":[22222]} {"k1":12346,"k2":"22222","k4":[22222]} \N +12347 {"k1":12347,"k3":"33333","k4":[22222]} {"k1":12347,"k3":"33333","k4":[22222]} 33333 +12348 {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} {"k1":12348,"k3":"33333","k5":{"k51":1024,"xxxx":[11111]}} 33333 \ No newline at end of file diff --git a/regression-test/data/load_p0/stream_load/test_read_root_path.json b/regression-test/data/load_p0/stream_load/test_read_root_path.json new file mode 100644 index 00000000000000..777ccbbfb1f933 --- /dev/null +++ b/regression-test/data/load_p0/stream_load/test_read_root_path.json @@ -0,0 +1,4 @@ +{"k1" : 12345, "k2" : "11111", "k3" : 111111, "k4" : [11111]} +{"k1" : 12346, "k2" : "22222", "k4" : [22222]} +{"k1" : 12347, "k3" : "33333", "k4" : [22222]} +{"k1" : 12348, "k3" : "33333", "k5" : {"k51" : 1024, "xxxx" : [11111]}} \ No newline at end of file diff --git a/regression-test/suites/load_p0/stream_load/test_json_load.groovy b/regression-test/suites/load_p0/stream_load/test_json_load.groovy index e2235c7d5b39e8..1cf2108d48ee77 100644 --- a/regression-test/suites/load_p0/stream_load/test_json_load.groovy +++ b/regression-test/suites/load_p0/stream_load/test_json_load.groovy @@ -878,4 +878,31 @@ suite("test_json_load", "p0,nonConcurrent") { } finally { try_sql("DROP TABLE IF EXISTS ${testTable}") } + + // support read "$." 
as root + try { + sql "DROP TABLE IF EXISTS ${testTable}" + sql """CREATE TABLE IF NOT EXISTS ${testTable} + ( + `k1` varchar(1024) NULL, + `k2` variant NULL, + `k3` variant NULL, + `k4` variant NULL + ) + DUPLICATE KEY(`k1`) + COMMENT '' + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + );""" + + load_json_data.call("${testTable}", "${testTable}_case30", 'false', 'true', 'json', '', '[\"$.k1\",\"$.\", \"$.\", \"$.k3\"]', + '', '', '', 'test_read_root_path.json') + + sql "sync" + qt_select30 "select * from ${testTable} order by k1" + + } finally { + // try_sql("DROP TABLE IF EXISTS ${testTable}") + } }