
Commit

Merge remote-tracking branch 'apache/master' into specialize_filter_struct_sparse_union
alamb committed Aug 31, 2024
2 parents f8c5b1a + 69e5e5f commit d643a43
Showing 3 changed files with 44 additions and 6 deletions.
11 changes: 9 additions & 2 deletions arrow-select/src/filter.rs
@@ -153,7 +153,10 @@ pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray {
BooleanArray::new(mask, None)
}

/// Filters an [Array], returning elements matching the filter (i.e. where the values are true).
/// Returns a filtered `values` [Array] where the corresponding elements of
/// `predicate` are `true`.
///
/// See also [`FilterBuilder`] for more control over the filtering process.
///
/// # Example
/// ```rust
@@ -189,14 +192,18 @@ fn multiple_arrays(data_type: &DataType) -> bool {
}
}

/// Returns a new [RecordBatch] with arrays containing only values matching the filter.
/// Returns a filtered [RecordBatch] where the corresponding elements of
/// `predicate` are true.
///
/// This is the equivalent of calling [filter] on each column of the [RecordBatch].
pub fn filter_record_batch(
record_batch: &RecordBatch,
predicate: &BooleanArray,
) -> Result<RecordBatch, ArrowError> {
let mut filter_builder = FilterBuilder::new(predicate);
if record_batch.num_columns() > 1 {
// Only optimize if filtering more than one column
// Otherwise, the overhead of optimization can be more than the benefit
filter_builder = filter_builder.optimize();
}
let filter = filter_builder.build();
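To illustrate the reworded `filter` and `filter_record_batch` docs above, here is a minimal usage sketch. It assumes the top-level `arrow` crate re-exports (`arrow::compute`, `arrow::array`, `arrow::record_batch`) rather than `arrow-select` directly, and the column name `"a"` is arbitrary.

```rust
use arrow::array::{ArrayRef, BooleanArray, Int32Array};
use arrow::compute::{filter, filter_record_batch};
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;
use std::sync::Arc;

fn main() -> Result<(), ArrowError> {
    let values = Int32Array::from(vec![1, 2, 3, 4]);
    let predicate = BooleanArray::from(vec![true, false, true, false]);

    // `filter` keeps the elements of `values` where `predicate` is true
    let filtered = filter(&values, &predicate)?;
    assert_eq!(filtered.len(), 2);

    // `filter_record_batch` applies the same predicate to every column; per
    // the change above, it only optimizes the filter when the batch has more
    // than one column
    let batch = RecordBatch::try_from_iter([("a", Arc::new(values) as ArrayRef)])?;
    let filtered_batch = filter_record_batch(&batch, &predicate)?;
    assert_eq!(filtered_batch.num_rows(), 2);
    Ok(())
}
```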
9 changes: 7 additions & 2 deletions object_store/src/client/backoff.rs
@@ -18,7 +18,12 @@
use rand::prelude::*;
use std::time::Duration;

/// Exponential backoff with jitter
/// Exponential backoff with a decorrelated jitter algorithm
///
/// The first backoff will always be `init_backoff`.
///
/// Subsequent backoffs will pick a random value between `init_backoff` and
/// `base * previous` where `previous` is the duration of the previous backoff
///
/// See <https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/>
#[allow(missing_copy_implementations)]
@@ -28,7 +33,7 @@ pub struct BackoffConfig {
pub init_backoff: Duration,
/// The maximum backoff duration
pub max_backoff: Duration,
/// The base of the exponential to use
/// The multiplier to use for the next backoff duration
pub base: f64,
}

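The updated doc comment above describes the decorrelated-jitter rule: the first backoff is `init_backoff`, and each subsequent backoff is drawn between `init_backoff` and `base * previous`. A minimal sketch of that rule follows; the names are illustrative and this is not the object_store implementation itself.

```rust
use rand::Rng;
use std::time::Duration;

/// Illustrative decorrelated-jitter step, assuming the rule described in the
/// `BackoffConfig` docs; not the object_store internals.
fn next_backoff(
    rng: &mut impl Rng,
    init_backoff: Duration,
    max_backoff: Duration,
    base: f64,
    previous: Duration,
) -> Duration {
    let low = init_backoff.as_secs_f64();
    // Upper bound is `base * previous`, never below the lower bound
    let high = (previous.as_secs_f64() * base).max(low);
    // Pick a random value between `init_backoff` and `base * previous` ...
    let next = rng.gen_range(low..=high);
    // ... and clamp it to the configured maximum backoff
    Duration::from_secs_f64(next.min(max_backoff.as_secs_f64()))
}
```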
30 changes: 28 additions & 2 deletions parquet/src/column/writer/mod.rs
@@ -756,8 +756,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
if null_page && self.column_index_builder.valid() {
self.column_index_builder.append(
null_page,
vec![0; 1],
vec![0; 1],
vec![],
vec![],
self.page_metrics.num_page_nulls as i64,
);
} else if self.column_index_builder.valid() {
@@ -2668,6 +2668,32 @@ mod tests {
),);
}

#[test]
fn test_column_index_with_null_pages() {
// write a single page of all nulls
let page_writer = get_test_page_writer();
let props = Default::default();
let mut writer = get_test_column_writer::<Int32Type>(page_writer, 1, 0, props);
writer.write_batch(&[], Some(&[0, 0, 0, 0]), None).unwrap();

let r = writer.close().unwrap();
assert!(r.column_index.is_some());
let col_idx = r.column_index.unwrap();
// null_pages should be true for page 0
assert!(col_idx.null_pages[0]);
// min and max should be empty byte arrays
assert_eq!(col_idx.min_values[0].len(), 0);
assert_eq!(col_idx.max_values[0].len(), 0);
// null_counts should be defined and be 4 for page 0
assert!(col_idx.null_counts.is_some());
assert_eq!(col_idx.null_counts.as_ref().unwrap()[0], 4);
// there is no repetition so rep histogram should be absent
assert!(col_idx.repetition_level_histograms.is_none());
// definition_level_histogram should be present and should be 0:4, 1:0
assert!(col_idx.definition_level_histograms.is_some());
assert_eq!(col_idx.definition_level_histograms.unwrap(), &[4, 0]);
}

#[test]
fn test_column_offset_index_metadata() {
// write data
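The change above records an all-null page in the column index with empty min/max byte arrays rather than a single zero byte, which is what the new `test_column_index_with_null_pages` verifies. As a hedged illustration of how a reader can use `null_pages`, a pruning helper over the thrift-generated `ColumnIndex` might look like the sketch below; the function and its pruning policy are illustrative, not part of the parquet crate.

```rust
use parquet::format::ColumnIndex;

/// Illustrative only: indexes of pages worth scanning for a predicate that
/// can never match a null value (e.g. `col > 5`). All-null pages carry no
/// usable min/max statistics, so they can be skipped outright.
fn pages_to_scan(index: &ColumnIndex) -> Vec<usize> {
    let mut pages = Vec::new();
    for (i, &all_null) in index.null_pages.iter().enumerate() {
        if !all_null {
            pages.push(i);
        }
    }
    pages
}
```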
