Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix](simdjson) fix simdjson reader for read json object array when jsonroot set #38500

Merged
merged 1 commit into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion be/src/vec/exec/format/json/new_json_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1251,7 +1251,7 @@ Status NewJsonReader::_simdjson_handle_flat_array_complex_json(
bool valid = true;
cur = (*_array_iter).get_object();
// extract root
if (_parsed_json_root.size() != 0) {
if (!_parsed_from_json_root && _parsed_json_root.size() != 0) {
simdjson::ondemand::value val;
Status st = JsonFunctions::extract_from_object(cur, _parsed_json_root, &val);
if (UNLIKELY(!st.ok())) {
Expand Down Expand Up @@ -1665,6 +1665,7 @@ Status NewJsonReader::_simdjson_parse_json_doc(size_t* size, bool* eof) {
fmt::format_to(error_msg, "{}", st.to_string());
return return_quality_error(error_msg, std::string((char*)json_str, *size));
}
_parsed_from_json_root = true;
} catch (simdjson::simdjson_error& e) {
fmt::memory_buffer error_msg;
fmt::format_to(error_msg, "Encounter error while extract_from_object, error: {}",
Expand Down
1 change: 1 addition & 0 deletions be/src/vec/exec/format/json/new_json_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ class NewJsonReader : public GenericReader {

std::vector<std::vector<JsonPath>> _parsed_jsonpaths;
std::vector<JsonPath> _parsed_json_root;
bool _parsed_from_json_root = false; // to avoid parsing json root multiple times

char _value_buffer[4 * 1024 * 1024]; // 4MB
char _parse_buffer[512 * 1024]; // 512KB
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select --
1021021338780262401 1021021338700570624 0 239.0000 876219500005C31942 0 0.0000 128.0000 1 2024-07-19T11:34:17 239.0000 0 0.0000

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data":[{"id":"1021021338780262401","type":"INSERT","owner_id":"0","amount_tag":"239.0","barcode":"876219500005C31942","retail_order_bill_id":"1021021338700570624","status":"0","amount_retail":"0.0","amount":"128.0","qty":"1","timestamp":"2024-07-19 11:34:17","price_cost":"239.0","is_gift":"0","amount_discount":"0.0"}],"type":"INSERT"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression suite: stream-loads a JSON file with `json_root` set to `$.data`
// together with `strip_outer_array` and explicit `jsonpaths`, then compares the
// table contents against the recorded `select` result.
// NOTE(review): presumably `$.data` in the data file is an array of objects
// (the suite name suggests so) -- confirm against the data file.
suite("load_object_array_json", "p0") {
    // Table that this suite creates, loads, queries and finally drops.
    def testTable = "load_object_array_json"

    // Creates the target table if it does not exist yet.
    // The SQL text is intentionally left flush-left so that the statement sent
    // to the server stays exactly as it was.
    def create_test_table = {
        sql """
CREATE TABLE IF NOT EXISTS ${testTable} (
`id` bigint(20) NOT NULL COMMENT "",
`retail_order_bill_id` bigint(20) NULL COMMENT "",
`owner_id` int(11) NULL COMMENT "",
`amount_tag` decimal(12, 4) NULL COMMENT "",
`barcode` varchar(128) NULL COMMENT "",
`status` int(11) NULL COMMENT "",
`amount_retail` decimal(12, 4) NULL COMMENT "",
`amount` decimal(12, 4) NULL COMMENT "",
`qty` int(11) NULL COMMENT "",
`timestamp` datetime NULL COMMENT "时间戳",
`price_cost` decimal(12, 4) NULL COMMENT "",
`is_gift` int(11) NULL COMMENT "",
`amount_discount` decimal(12, 4) NULL COMMENT "",
) ENGINE=OLAP
UNIQUE KEY(`id`, `retail_order_bill_id`)
DISTRIBUTED BY HASH(`retail_order_bill_id`) BUCKETS 10
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"storage_format" = "V2"
)
"""
    }

    // Stream-loads `file_name` into `table_name` and validates the load result.
    //   table_name: destination table of the stream load
    //   file_name:  data file handed to the stream load
    def load_data = { table_name, file_name ->
        // load the json data
        streamLoad {
            table table_name

            // set http request header params
            set 'strip_outer_array', 'true'
            set 'read_json_by_line', 'true'
            set 'format', 'json'
            set 'columns', 'id,owner_id,amount_tag,barcode,retail_order_bill_id,status,amount_retail,amount,qty,timestamp,price_cost,is_gift,amount_discount'
            set 'jsonpaths', '[\"$.id\",\"$.owner_id\",\"$.amount_tag\",\"$.barcode\",\"$.retail_order_bill_id\",\"$.status\",\"$.amount_retail\",\"$.amount\",\"$.qty\",\"$.timestamp\",\"$.price_cost\",\"$.is_gift\",\"$.amount_discount\"]'
            // The root is set explicitly; the paths above are written relative to it.
            set 'json_root', '$.data'
            set 'fuzzy_parse', 'false'
            set 'max_filter_ratio', '1'
            file file_name // import json file
            time 10000 // limit inflight 10s

            // if declared a check callback, the default check condition will ignore.
            // So you must check all condition
            check { result, exception, startTime, endTime ->
                if (exception != null) {
                    throw exception
                }
                log.info("Stream load result: ${result}".toString())
                def json = parseJson(result)
                assertEquals("success", json.Status.toLowerCase())
                // Every row read must be accounted for as loaded, unselected or filtered.
                assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows
                        + json.NumberFilteredRows)
                // max_filter_ratio is 1, so additionally require that something was really loaded.
                assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
            }
        }
    }

    // Queries the whole table ordered by id through `qt_select`.
    //   table_name: table to read back
    def check_data_correct = { table_name ->
        sql "sync"
        // select the table and check whether the data is correct
        qt_select "select * from ${table_name} order by id"
    }

    // case1: stream load the data file with json_root set and verify the loaded rows
    try {
        sql "DROP TABLE IF EXISTS ${testTable}"

        create_test_table.call()

        load_data.call(testTable, 'test_json_object_array.csv')

        check_data_correct.call(testTable)

    } finally {
        // Always drop the table, even when the load or the check failed.
        try_sql("DROP TABLE IF EXISTS ${testTable}")
    }
}
Loading