Skip to content

Commit

Permalink
[fix](parquet) the end offset of column chunk may be wrong in parquet…
Browse files Browse the repository at this point in the history
… metadata #28891 (#28893)

backport: #28891
  • Loading branch information
AshinGau authored Dec 23, 2023
1 parent a15b289 commit adb7730
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,12 @@ Status ColumnChunkReader::next_page() {
return next_page();
} else if (_page_reader->get_page_header()->type == tparquet::PageType::DATA_PAGE_V2) {
_remaining_num_values = _page_reader->get_page_header()->data_page_header_v2.num_values;
_chunk_parsed_values += _remaining_num_values;
_state = HEADER_PARSED;
return Status::OK();
} else {
_remaining_num_values = _page_reader->get_page_header()->data_page_header.num_values;
_chunk_parsed_values += _remaining_num_values;
_state = HEADER_PARSED;
return Status::OK();
}
Expand Down
11 changes: 10 additions & 1 deletion be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,17 @@ class ColumnChunkReader {
Status init();

// Whether the chunk reader has a more page to read.
bool has_next_page() { return _page_reader->has_next_page(); }
bool has_next_page() { return _chunk_parsed_values < _metadata.num_values; }

// Deprecated
// Seek to the specific page, page_header_offset must be the start offset of the page header.
// _end_offset may exceed the actual data area, so we can only use the number of parsed values
// to determine whether there are remaining pages to read. That's to say we can't use the
// PageLocation in parquet metadata to seek to the specified page. We should call next_page()
// and skip_page() to skip pages one by one.
// todo: change this interface to seek_to_page(int64_t page_header_offset, size_t num_parsed_values)
// and set _chunk_parsed_values = num_parsed_values
// [[deprecated]]
void seek_to_page(int64_t page_header_offset) {
_remaining_num_values = 0;
_page_reader->seek_to_page(page_header_offset);
Expand Down Expand Up @@ -201,6 +209,7 @@ class ColumnChunkReader {

LevelDecoder _rep_level_decoder;
LevelDecoder _def_level_decoder;
size_t _chunk_parsed_values = 0;
uint32_t _remaining_num_values = 0;
Slice _page_data;
std::unique_ptr<uint8_t[]> _decompress_buf;
Expand Down
5 changes: 5 additions & 0 deletions be/src/vec/exec/format/parquet/vparquet_page_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ class PageReader {
uint64_t length);
~PageReader() = default;

// Deprecated
// Parquet file may not be standardized,
// _end_offset may exceed the actual data area.
// ColumnChunkReader::has_next_page() use the number of parsed values for judgment
// [[deprecated]]
bool has_next_page() const { return _offset < _end_offset; }

Status next_page_header();
Expand Down
12 changes: 12 additions & 0 deletions regression-test/data/external_table_p2/tvf/test_tvf_p2.out
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@
32.024 64.0000 128.901468 32.024 64.0000 128.901468 2023-07-07 2023-07-07 2021-07-07T19:15:31.123456 2023-07-07 2023-07-07 2021-07-07T19:15:31.123456
32.689 64.2580 128.745382 32.689 64.2580 128.745382 2023-11-11 2023-11-11 2022-11-11T16:35:37.123456 2023-11-11 2023-11-11 2022-11-11T16:35:37.123456

-- !wrong_page_header --
53587 38687 2689589 99480 2971 1999262 218 386 5265 86 33.14 56.33 40.55 0.00 3487.30 2850.04 4844.38 69.74 0.00 3487.30 3557.04 637.26
53587 47417 2689589 99480 2971 1999262 218 1216 5265 75 36.98 54.36 46.74 736.15 3505.50 2773.50 4077.00 138.46 736.15 2769.35 2907.81 -4.15
53587 72713 2689589 99480 2971 1999262 218 1250 5265 15 54.97 85.75 75.46 0.00 1131.90 824.55 1286.25 56.59 0.00 1131.90 1188.49 307.35
\N 124637 2689589 99480 \N \N 218 196 5265 \N 17.56 33.01 \N \N \N \N 3069.93 17.18 \N 245.52 \N \N
\N 132020 2689589 \N 2971 \N 218 \N 5265 \N \N 132.06 \N \N \N 8965.20 12281.58 255.41 \N \N 3448.10 \N
53587 132623 2689589 99480 2971 1999262 218 31 5265 74 62.58 90.74 77.12 0.00 5706.88 4630.92 6714.76 57.06 0.00 5706.88 5763.94 1075.96
53587 298010 2689589 99480 2971 1999262 218 802 5265 59 83.41 159.31 70.09 0.00 4135.31 4921.19 9399.29 289.47 0.00 4135.31 4424.78 -785.88
47821 4657 8720303 1194037 2971 4208705 574 656 5960 55 68.94 91.69 82.52 0.00 4538.60 3791.70 5042.95 408.47 0.00 4538.60 4947.07 746.90
47821 50869 8720303 1194037 2971 4208705 574 1284 5960 77 2.87 2.92 0.37 0.00 28.49 220.99 224.84 0.00 0.00 28.49 28.49 -192.50
47821 51494 8720303 1194037 2971 4208705 574 145 5960 18 69.14 94.72 14.20 0.00 255.60 1244.52 1704.96 7.66 0.00 255.60 263.26 -988.92

-- !viewfs --
25001 25001 25001

Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ suite("test_tvf_p2", "p2") {
"format" = "parquet");
"""

// test for wrong page header
qt_wrong_page_header """select * from hdfs(
"uri" = "hdfs://${nameNodeHost}:${hdfsPort}/catalog/tvf/parquet/wrong_page_header.parquet",
"format" = "parquet") order by ss_ticket_number,ss_item_sk limit 10;
"""

// viewfs
qt_viewfs """select count(id), count(m1), count(m2)
from hdfs(
Expand Down

0 comments on commit adb7730

Please sign in to comment.