From 7037d3db0a423f5f471d2503c83b5e6343b03183 Mon Sep 17 00:00:00 2001 From: gaoxin Date: Fri, 22 Dec 2023 19:22:17 +0800 Subject: [PATCH] [fix](parquet) the end offset of column chunk may be wrong in parquet metadata --- .../format/parquet/vparquet_column_chunk_reader.cpp | 2 ++ .../format/parquet/vparquet_column_chunk_reader.h | 11 ++++++++++- .../vec/exec/format/parquet/vparquet_page_reader.h | 5 +++++ .../data/external_table_p2/tvf/test_tvf_p2.out | 12 ++++++++++++ .../suites/external_table_p2/tvf/test_tvf_p2.groovy | 6 ++++++ 5 files changed, 35 insertions(+), 1 deletion(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index 928b5ae70bd67d..6feb9bc1025b33 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -97,10 +97,12 @@ Status ColumnChunkReader::next_page() { return next_page(); } else if (_page_reader->get_page_header()->type == tparquet::PageType::DATA_PAGE_V2) { _remaining_num_values = _page_reader->get_page_header()->data_page_header_v2.num_values; + _chunk_parsed_values += _remaining_num_values; _state = HEADER_PARSED; return Status::OK(); } else { _remaining_num_values = _page_reader->get_page_header()->data_page_header.num_values; + _chunk_parsed_values += _remaining_num_values; _state = HEADER_PARSED; return Status::OK(); } diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h index 21ee808e48f6a9..c8a49e098a53e9 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h @@ -91,9 +91,17 @@ class ColumnChunkReader { Status init(); // Whether the chunk reader has a more page to read. - bool has_next_page() { return _page_reader->has_next_page(); } + bool has_next_page() { return _chunk_parsed_values < _metadata.num_values; } + // Deprecated // Seek to the specific page, page_header_offset must be the start offset of the page header. + // _end_offset may exceed the actual data area, so we can only use the number of parsed values + // to determine whether there are remaining pages to read. That's to say we can't use the + // PageLocation in parquet metadata to seek to the specified page. We should call next_page() + // and skip_page() to skip pages one by one. + // todo: change this interface to seek_to_page(int64_t page_header_offset, size_t num_parsed_values) + // and set _chunk_parsed_values = num_parsed_values + // [[deprecated]] void seek_to_page(int64_t page_header_offset) { _remaining_num_values = 0; _page_reader->seek_to_page(page_header_offset); @@ -201,6 +209,7 @@ class ColumnChunkReader { LevelDecoder _rep_level_decoder; LevelDecoder _def_level_decoder; + size_t _chunk_parsed_values = 0; uint32_t _remaining_num_values = 0; Slice _page_data; std::unique_ptr _decompress_buf; diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h b/be/src/vec/exec/format/parquet/vparquet_page_reader.h index 730b9a3001b01d..bdd0a8d0f5ff24 100644 --- a/be/src/vec/exec/format/parquet/vparquet_page_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h @@ -45,6 +45,11 @@ class PageReader { uint64_t length); ~PageReader() = default; + // Deprecated + // Parquet file may not be standardized, + // _end_offset may exceed the actual data area. + // ColumnChunkReader::has_next_page() use the number of parsed values for judgment + // [[deprecated]] bool has_next_page() const { return _offset < _end_offset; } Status next_page_header(); diff --git a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out index 6a44b7322dcd07..53b454df858e2a 100644 --- a/regression-test/data/external_table_p2/tvf/test_tvf_p2.out +++ b/regression-test/data/external_table_p2/tvf/test_tvf_p2.out @@ -50,6 +50,18 @@ 32.024 64.0000 128.901468 32.024 64.0000 128.901468 2023-07-07 2023-07-07 2021-07-07T19:15:31.123456 2023-07-07 2023-07-07 2021-07-07T19:15:31.123456 32.689 64.2580 128.745382 32.689 64.2580 128.745382 2023-11-11 2023-11-11 2022-11-11T16:35:37.123456 2023-11-11 2023-11-11 2022-11-11T16:35:37.123456 +-- !wrong_page_header -- +53587 38687 2689589 99480 2971 1999262 218 386 5265 86 33.14 56.33 40.55 0.00 3487.30 2850.04 4844.38 69.74 0.00 3487.30 3557.04 637.26 +53587 47417 2689589 99480 2971 1999262 218 1216 5265 75 36.98 54.36 46.74 736.15 3505.50 2773.50 4077.00 138.46 736.15 2769.35 2907.81 -4.15 +53587 72713 2689589 99480 2971 1999262 218 1250 5265 15 54.97 85.75 75.46 0.00 1131.90 824.55 1286.25 56.59 0.00 1131.90 1188.49 307.35 +\N 124637 2689589 99480 \N \N 218 196 5265 \N 17.56 33.01 \N \N \N \N 3069.93 17.18 \N 245.52 \N \N +\N 132020 2689589 \N 2971 \N 218 \N 5265 \N \N 132.06 \N \N \N 8965.20 12281.58 255.41 \N \N 3448.10 \N +53587 132623 2689589 99480 2971 1999262 218 31 5265 74 62.58 90.74 77.12 0.00 5706.88 4630.92 6714.76 57.06 0.00 5706.88 5763.94 1075.96 +53587 298010 2689589 99480 2971 1999262 218 802 5265 59 83.41 159.31 70.09 0.00 4135.31 4921.19 9399.29 289.47 0.00 4135.31 4424.78 -785.88 +47821 4657 8720303 1194037 2971 4208705 574 656 5960 55 68.94 91.69 82.52 0.00 4538.60 3791.70 5042.95 408.47 0.00 4538.60 4947.07 746.90 +47821 50869 8720303 1194037 2971 4208705 574 1284 5960 77 2.87 2.92 0.37 0.00 28.49 220.99 224.84 0.00 0.00 28.49 28.49 -192.50 +47821 51494 8720303 1194037 2971 4208705 574 145 5960 18 69.14 94.72 14.20 0.00 255.60 1244.52 1704.96 7.66 0.00 255.60 263.26 -988.92 + -- !viewfs -- 25001 25001 25001 diff --git a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy index 39015b7b76f66a..96626af32b7ead 100644 --- a/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy +++ b/regression-test/suites/external_table_p2/tvf/test_tvf_p2.groovy @@ -60,6 +60,12 @@ suite("test_tvf_p2", "p2,external,tvf,external_remote,external_remote_tvf") { "format" = "parquet"); """ + // test for wrong page header + qt_wrong_page_header """select * from hdfs( + "uri" = "hdfs://${nameNodeHost}:${hdfsPort}/catalog/tvf/parquet/wrong_page_header.parquet", + "format" = "parquet") order by ss_ticket_number,ss_item_sk limit 10; + """ + // viewfs qt_viewfs """select count(id), count(m1), count(m2) from hdfs(