Skip to content

Commit

Permalink
[improvement](multi-catalog) push down all predicates into rowgroup/p…
Browse files Browse the repository at this point in the history
…age filtering for ParquetReader (#16388)

Two improvements:
1. Refactor row-group and page filtering in `ParquetReader`, and use the overloaded comparison operators of Doris's native C++ types to perform the comparisons.
2. Support decimal/decimal v3/date/datev2/datetime/datetimev2
  • Loading branch information
AshinGau authored Feb 7, 2023
1 parent 0b8c631 commit 27216dc
Show file tree
Hide file tree
Showing 6 changed files with 336 additions and 515 deletions.
4 changes: 0 additions & 4 deletions be/src/vec/exec/format/parquet/parquet_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,6 @@ const uint32_t ParquetInt96::JULIAN_EPOCH_OFFSET_DAYS = 2440588;
const uint64_t ParquetInt96::MICROS_IN_DAY = 86400000000;
const uint64_t ParquetInt96::NANOS_PER_MICROSECOND = 1000;

// Converts this INT96 timestamp to microseconds relative to the Unix epoch.
//
// `hi` holds the day count from the Julian epoch and `lo` the nanoseconds
// elapsed within that day; sub-microsecond precision is truncated.
//
// The day difference is computed in 64-bit signed arithmetic. Subtracting the
// two uint32_t values directly would wrap around to ~2^32 for any day before
// the Unix epoch (hi < JULIAN_EPOCH_OFFSET_DAYS) *before* being widened,
// producing a garbage timestamp. With the signed subtraction the returned
// uint64_t is the two's-complement representation of the (possibly negative)
// microsecond count, so casting it to int64_t yields the correct value.
// Results for hi >= JULIAN_EPOCH_OFFSET_DAYS are unchanged.
inline uint64_t ParquetInt96::to_timestamp_micros() const {
    const int64_t days_since_unix_epoch =
            static_cast<int64_t>(hi) - static_cast<int64_t>(JULIAN_EPOCH_OFFSET_DAYS);
    return static_cast<uint64_t>(days_since_unix_epoch) * MICROS_IN_DAY +
           lo / NANOS_PER_MICROSECOND;
}

#define FOR_LOGICAL_NUMERIC_TYPES(M) \
M(TypeIndex::Int8, Int8) \
M(TypeIndex::UInt8, UInt8) \
Expand Down
8 changes: 5 additions & 3 deletions be/src/vec/exec/format/parquet/parquet_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ struct ParquetInt96 {
uint64_t lo; // time of nanoseconds in a day
uint32_t hi; // days from julian epoch

inline uint64_t to_timestamp_micros() const;
// Converts this INT96 timestamp to microseconds relative to the Unix epoch.
// Nanoseconds below one microsecond are truncated.
//
// The day difference is computed in int64_t rather than uint32_t: `hi` and
// JULIAN_EPOCH_OFFSET_DAYS are both uint32_t, so a direct subtraction wraps
// around for days before the Unix epoch before it is widened to 64 bits.
// With the signed subtraction the returned value is the two's-complement
// form of the (possibly negative) microsecond count; results for
// hi >= JULIAN_EPOCH_OFFSET_DAYS are unchanged.
inline uint64_t to_timestamp_micros() const {
    const int64_t days_since_unix_epoch =
            static_cast<int64_t>(hi) - static_cast<int64_t>(JULIAN_EPOCH_OFFSET_DAYS);
    return static_cast<uint64_t>(days_since_unix_epoch) * MICROS_IN_DAY +
           lo / NANOS_PER_MICROSECOND;
}

static const uint32_t JULIAN_EPOCH_OFFSET_DAYS;
static const uint64_t MICROS_IN_DAY;
Expand Down Expand Up @@ -361,7 +363,6 @@ Status FixLengthDecoder::_decode_datetime64(MutableColumnPtr& doris_column,
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
size_t dict_index = 0;
int64_t scale_to_micro = _decode_params->scale_to_nano_factor / 1000;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run(&read_type)) {
switch (read_type) {
Expand All @@ -373,7 +374,8 @@ Status FixLengthDecoder::_decode_datetime64(MutableColumnPtr& doris_column,
v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz);
if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
// nanoseconds will be ignored.
v.set_microsecond((date_value % _decode_params->second_mask) * scale_to_micro);
v.set_microsecond((date_value % _decode_params->second_mask) *
_decode_params->scale_to_nano_factor / 1000);
// TODO: the precision of datetime v1
}
_FIXED_SHIFT_DATA_OFFSET();
Expand Down
Loading

0 comments on commit 27216dc

Please sign in to comment.