From d17699875bd286dd6cd2f654cc2a1ac4162edeb3 Mon Sep 17 00:00:00 2001 From: lihangyu <15605149486@163.com> Date: Fri, 26 Jul 2024 12:19:25 +0800 Subject: [PATCH] [Fix](Variant) implement resize and set num_rows right when `subcolumns` is empty (#38364) Set rows correct when subcolumns is empty, and implement resize method with correct semantic This fix the potential crash with stack bellow ``` 6# doris::vectorized::ColumnObject::get(unsigned long, doris::vectorized::Field&) const at /root/doris/be/src/vec/columns/column_object.cpp:841 7# doris::vectorized::ColumnObject::operator[](unsigned long) const at /root/doris/be/src/vec/columns/column_object.cpp:838 8# doris::vectorized::ColumnObject::insert_from(doris::vectorized::IColumn const&, unsigned long) in /home/work/unlimit_teamcity/TeamCity/Agents/20240725170551agent_172.16.0.48_1/work/60183217f6ee2a9c/output/be/lib/doris_be 9# doris::vectorized::ColumnObject::insert_indices_from(doris::vectorized::IColumn const&, unsigned int const*, unsigned int const*) at /root/doris/be/src/vec/columns/column_object.cpp:1581 ``` --- be/src/vec/columns/column_object.cpp | 61 ++++++++++++++++++- be/src/vec/columns/column_object.h | 2 + .../mv/variant/variant_mv.groovy | 2 +- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index b56c48573345c9..65c0a5dcd897cb 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -509,9 +509,24 @@ MutableColumnPtr ColumnObject::apply_for_subcolumns(Func&& func) const { res->add_sub_column(subcolumn->path, new_subcolumn->assume_mutable(), subcolumn->data.get_least_common_type()); } + check_consistency(); return res; } +void ColumnObject::resize(size_t n) { + if (n == num_rows) { + return; + } + if (n > num_rows) { + insert_many_defaults(n - num_rows); + } else { + for (auto& subcolumn : subcolumns) { + subcolumn->data.pop_back(num_rows - n); + } + } + num_rows = n; +} + bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) { if (num_rows < config::variant_threshold_rows_to_estimate_sparse_column) { return false; @@ -697,8 +712,16 @@ MutableColumnPtr ColumnObject::clone_resized(size_t new_size) const { if (new_size == 0) { return ColumnObject::create(is_nullable); } - return apply_for_subcolumns( + // If subcolumns are empty, then res will be empty but new_size > 0 + if (subcolumns.empty()) { + // Add an emtpy column with new_size rows + auto res = ColumnObject::create(true, false); + res->set_num_rows(new_size); + return res; + } + auto res = apply_for_subcolumns( [&](const auto& subcolumn) { return subcolumn.clone_resized(new_size); }); + return res; } size_t ColumnObject::byte_size() const { @@ -838,7 +861,10 @@ Field ColumnObject::operator[](size_t n) const { } void ColumnObject::get(size_t n, Field& res) const { - assert(n < size()); + if (UNLIKELY(n >= size())) { + throw doris::Exception(ErrorCode::OUT_OF_BOUND, + "Index ({}) for getting field is out of range", n); + } res = VariantMap(); auto& object = res.get(); @@ -886,11 +912,32 @@ void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t le } ColumnPtr ColumnObject::replicate(const Offsets& offsets) const { + if (subcolumns.empty()) { + // Add an emtpy column with offsets.back rows + auto res = ColumnObject::create(true, false); + res->set_num_rows(offsets.back()); + } return apply_for_subcolumns( [&](const auto& subcolumn) { return subcolumn.replicate(offsets); }); } ColumnPtr ColumnObject::permute(const Permutation& perm, size_t limit) const { + if (subcolumns.empty()) { + if (limit == 0) { + limit = num_rows; + } else { + limit = std::min(num_rows, limit); + } + + if (perm.size() < limit) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Size of permutation is less than required."); + } + // Add an emtpy column with limit rows + auto res = ColumnObject::create(true, false); + res->set_num_rows(limit); + return res; + } return apply_for_subcolumns( [&](const auto& subcolumn) { return subcolumn.permute(perm, limit); }); } @@ -1420,6 +1467,12 @@ ColumnPtr ColumnObject::filter(const Filter& filter, ssize_t count) const { return finalized_object.apply_for_subcolumns( [&](const auto& subcolumn) { return subcolumn.filter(filter, count); }); } + if (subcolumns.empty()) { + // Add an emtpy column with filtered rows + auto res = ColumnObject::create(true, false); + res->set_num_rows(count_bytes_in_filter(filter)); + return res; + } auto new_column = ColumnObject::create(true, false); for (auto& entry : subcolumns) { auto subcolumn = entry->data.get_finalized_column().filter(filter, count); @@ -1433,6 +1486,10 @@ Status ColumnObject::filter_by_selector(const uint16_t* sel, size_t sel_size, IC if (!is_finalized()) { finalize(); } + if (subcolumns.empty()) { + assert_cast(col_ptr)->insert_many_defaults(sel_size); + return Status::OK(); + } auto* res = assert_cast(col_ptr); for (const auto& subcolumn : subcolumns) { auto new_subcolumn = subcolumn->data.get_least_common_type()->create_column(); diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h index 441589bdfbb4fa..e9b6eb7dfd854e 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -363,6 +363,8 @@ class ColumnObject final : public COWHelper { void clear() override; + void resize(size_t n) override; + void clear_subcolumns_data(); std::string get_name() const override { diff --git a/regression-test/suites/nereids_rules_p0/mv/variant/variant_mv.groovy b/regression-test/suites/nereids_rules_p0/mv/variant/variant_mv.groovy index b1c4d371dbb101..23f5e889e7842d 100644 --- a/regression-test/suites/nereids_rules_p0/mv/variant/variant_mv.groovy +++ b/regression-test/suites/nereids_rules_p0/mv/variant/variant_mv.groovy @@ -573,7 +573,7 @@ suite("variant_mv") { where g2.actor['id'] > 34259289; """ def query3_6 = """ - SELECT + SELECT /*+SET_VAR(batch_size=4064,broker_load_batch_size=16352,disable_streaming_preaggregations=false,enable_distinct_streaming_aggregation=true,parallel_fragment_exec_instance_num=3,parallel_pipeline_task_num=0,profile_level=1,enable_pipeline_engine=true,enable_parallel_scan=true,parallel_scan_max_scanners_count=32,parallel_scan_min_rows_per_scanner=64,enable_fold_constant_by_be=true,enable_rewrite_element_at_to_slot=true,runtime_filter_type=1,enable_parallel_result_sink=false,enable_nereids_planner=true,rewrite_or_to_in_predicate_threshold=100000,enable_function_pushdown=false,enable_common_expr_pushdown=false,enable_local_exchange=true,partitioned_hash_join_rows_threshold=8,partitioned_hash_agg_rows_threshold=8,partition_pruning_expand_threshold=10,enable_share_hash_table_for_broadcast_join=true,enable_two_phase_read_opt=true,enable_common_expr_pushdown_for_inverted_index=false,enable_delete_sub_predicate_v2=false,min_revocable_mem=33554432,fetch_remote_schema_timeout_seconds=120,max_fetch_remote_schema_tablet_count=512,enable_join_spill=false,enable_sort_spill=false,enable_agg_spill=false,enable_force_spill=false,data_queue_max_blocks=1,spill_streaming_agg_mem_limit=268435456,external_agg_partition_bits=5) */ g1.id, g2.type, floor(cast(g1.actor['id'] as int) + 100.5),