Skip to content

Commit

Permalink
[fix](parquet) the end offset of column chunk may be wrong in parquet…
Browse files Browse the repository at this point in the history
… metadata
  • Loading branch information
AshinGau committed Dec 22, 2023
1 parent fa0ad56 commit d0be8c4
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,12 @@ Status ColumnChunkReader::next_page() {
return next_page();
} else if (_page_reader->get_page_header()->type == tparquet::PageType::DATA_PAGE_V2) {
_remaining_num_values = _page_reader->get_page_header()->data_page_header_v2.num_values;
_chunk_parsed_values += _remaining_num_values;
_state = HEADER_PARSED;
return Status::OK();
} else {
_remaining_num_values = _page_reader->get_page_header()->data_page_header.num_values;
_chunk_parsed_values += _remaining_num_values;
_state = HEADER_PARSED;
return Status::OK();
}
Expand Down
11 changes: 10 additions & 1 deletion be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,17 @@ class ColumnChunkReader {
Status init();

// Whether the chunk reader has more pages to read.
bool has_next_page() { return _page_reader->has_next_page(); }
// Decided by value count: true while _chunk_parsed_values is still below _metadata.num_values.
bool has_next_page() { return _chunk_parsed_values < _metadata.num_values; }

// Deprecated
// Seek to the specific page; page_header_offset must be the start offset of the page header.
// _end_offset may exceed the actual data area, so we can only use the number of parsed values
// to determine whether there are remaining pages to read. That is, we can't use the
// PageLocation in parquet metadata to seek to the specified page. We should call next_page()
// and skip_page() to skip pages one by one.
// TODO: change this interface to seek_to_page(int64_t page_header_offset, size_t num_parsed_values)
// and set _chunk_parsed_values = num_parsed_values
// [[deprecated]]
void seek_to_page(int64_t page_header_offset) {
_remaining_num_values = 0;
_page_reader->seek_to_page(page_header_offset);
Expand Down Expand Up @@ -201,6 +209,7 @@ class ColumnChunkReader {

LevelDecoder _rep_level_decoder;
LevelDecoder _def_level_decoder;
size_t _chunk_parsed_values = 0;
uint32_t _remaining_num_values = 0;
Slice _page_data;
std::unique_ptr<uint8_t[]> _decompress_buf;
Expand Down
5 changes: 5 additions & 0 deletions be/src/vec/exec/format/parquet/vparquet_page_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ class PageReader {
uint64_t length);
~PageReader() = default;

// Deprecated
// Returns true while _offset is still below _end_offset.
// Parquet files may not be standardized, so _end_offset may exceed the
// actual data area. ColumnChunkReader::has_next_page() uses the number of
// parsed values for this judgment.
// [[deprecated]]
bool has_next_page() const { return _offset < _end_offset; }

Status next_page_header();
Expand Down

0 comments on commit d0be8c4

Please sign in to comment.