Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
gatesn committed Feb 27, 2025
2 parents 12fc930 + 5a23ffe commit 30b0216
Show file tree
Hide file tree
Showing 22 changed files with 125 additions and 431 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

78 changes: 45 additions & 33 deletions encodings/fastlanes/src/bitpacking/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ use std::mem::MaybeUninit;

use arrow_buffer::ArrowNativeType;
use fastlanes::BitPacking;
use num_traits::AsPrimitive;
use itertools::Itertools;
use num_traits::{AsPrimitive, PrimInt};
use vortex_array::arrays::PrimitiveArray;
use vortex_array::builders::{ArrayBuilder as _, PrimitiveBuilder, UninitRange};
use vortex_array::patches::Patches;
Expand All @@ -14,7 +15,8 @@ use vortex_dtype::{
NativePType, PType, match_each_integer_ptype, match_each_integer_ptype_with_unsigned_type,
match_each_unsigned_integer_ptype,
};
use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
use vortex_error::{VortexExpect, VortexResult, vortex_bail};
use vortex_mask::AllOr;
use vortex_scalar::Scalar;

use crate::BitPackedArray;
Expand All @@ -25,10 +27,7 @@ pub fn bitpack_to_best_bit_width(array: &PrimitiveArray) -> VortexResult<BitPack
}

pub fn bitpack_encode(array: &PrimitiveArray, bit_width: u8) -> VortexResult<BitPackedArray> {
let bit_width_freq = array
.statistics()
.compute_bit_width_freq()
.ok_or_else(|| vortex_err!(ComputeError: "missing bit width frequency"))?;
let bit_width_freq = bit_width_histogram(array)?;

// Check array contains no negative values.
if array.ptype().is_signed_int() {
Expand Down Expand Up @@ -454,33 +453,8 @@ pub unsafe fn unpack_single_primitive<T: NativePType + BitPacking>(
unsafe { BitPacking::unchecked_unpack_single(bit_width, packed_chunk, index_in_chunk) }
}

/// Returns the smallest bit width at which every value of `array` can be packed without
/// needing any patches.
///
/// The bit-width frequency table is requested from the array's statistics and handed to
/// [`min_patchless_bit_width`].
///
/// # Errors
///
/// Returns a `ComputeError` when the statistics cannot provide the bit-width frequencies, and
/// forwards any error produced by [`min_patchless_bit_width`].
pub fn find_min_patchless_bit_width(array: &PrimitiveArray) -> VortexResult<u8> {
    array
        .statistics()
        .compute_bit_width_freq()
        .ok_or_else(|| vortex_err!(ComputeError: "Failed to compute bit width frequency"))
        .and_then(|freq| min_patchless_bit_width(&freq))
}

/// Finds the widest bit width that actually occurs in a bit-width frequency table.
///
/// `bit_width_freq[i]` is the number of values that need `i` bits. The result is the largest
/// index with a non-zero count, or `0` when every count is zero.
///
/// # Errors
///
/// Fails when `bit_width_freq` is empty.
fn min_patchless_bit_width(bit_width_freq: &[usize]) -> VortexResult<u8> {
    if bit_width_freq.is_empty() {
        vortex_bail!("Empty bit width frequency!");
    }

    let mut widest = 0u8;
    for (width, &occurrences) in bit_width_freq.iter().enumerate() {
        if occurrences > 0 {
            // The index is narrowed to `u8` before comparing, so the maximum is taken over
            // the narrowed values.
            widest = widest.max(width as u8);
        }
    }
    Ok(widest)
}

pub fn find_best_bit_width(array: &PrimitiveArray) -> VortexResult<u8> {
let bit_width_freq = array
.statistics()
.compute_bit_width_freq()
.ok_or_else(|| vortex_err!(ComputeError: "Failed to compute bit width frequency"))?;

let bit_width_freq = bit_width_histogram(array)?;
best_bit_width(&bit_width_freq, bytes_per_exception(array.ptype()))
}

Expand Down Expand Up @@ -523,6 +497,45 @@ pub fn count_exceptions(bit_width: u8, bit_width_freq: &[usize]) -> usize {
bit_width_freq[bit_width as usize + 1..].iter().sum()
}

/// Builds the bit-width histogram of `array` by dispatching on its integer ptype to
/// [`bit_width_histogram_typed`].
///
/// # Errors
///
/// Forwards any error returned by [`bit_width_histogram_typed`].
fn bit_width_histogram(array: &PrimitiveArray) -> VortexResult<Vec<usize>> {
    let ptype = array.ptype();
    match_each_integer_ptype!(ptype, |$P| { bit_width_histogram_typed::<$P>(array) })
}

/// Counts, for every possible bit width of `T`, how many elements of `array` need exactly that
/// many bits.
///
/// The returned vector has `bits_of::<T>() + 1` entries. Entry `i` is the number of elements
/// whose width (total bits minus leading zeros) is `i`. A value with no leading zeros, such as
/// any negative signed integer, lands in the last entry. Null elements are counted in entry `0`.
///
/// # Errors
///
/// Fails if the validity mask of `array` cannot be computed.
///
/// # Panics
///
/// Panics (via `zip_eq`) if the validity buffer and the value slice differ in length.
fn bit_width_histogram_typed<T: NativePType + PrimInt>(
    array: &PrimitiveArray,
) -> VortexResult<Vec<usize>> {
    let total_bits = 8 * size_of::<T>();
    let width_of = |value: T| total_bits - (PrimInt::leading_zeros(value) as usize);

    let mut histogram = vec![0usize; total_bits + 1];
    match array.validity_mask()?.boolean_buffer() {
        // Every element is valid: bucket each value by its width.
        AllOr::All => {
            for &value in array.as_slice::<T>() {
                histogram[width_of(value)] += 1;
            }
        }
        // Every element is null: they all count as zero-width.
        AllOr::None => {
            histogram[0] = array.len();
        }
        // Mixed validity: nulls go to bucket 0, valid values to their width.
        AllOr::Some(validity) => {
            for (is_valid, &value) in validity.iter().zip_eq(array.as_slice::<T>()) {
                let bucket = if is_valid { width_of(value) } else { 0 };
                histogram[bucket] += 1;
            }
        }
    }

    Ok(histogram)
}

#[cfg(feature = "test-harness")]
pub mod test_harness {
use rand::Rng as _;
Expand Down Expand Up @@ -691,7 +704,6 @@ mod test {
best_bit_width(&freq, bytes_per_exception(PType::U8)).unwrap(),
3
);
assert_eq!(min_patchless_bit_width(&freq).unwrap(), 4)
}

#[test]
Expand Down
13 changes: 10 additions & 3 deletions encodings/runend/benches/run_end_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@

use divan::Bencher;
use itertools::repeat_n;
use num_traits::PrimInt;
use vortex_array::Array;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_dtype::NativePType;
use vortex_runend::RunEndArray;
use vortex_runend::compress::runend_encode;

Expand Down Expand Up @@ -38,13 +40,18 @@ fn compress(bencher: Bencher, (length, run_step): (usize, usize)) {
.bench_refs(|values| runend_encode(values).unwrap());
}

#[divan::bench(args = BENCH_ARGS)]
fn decompress(bencher: Bencher, (length, run_step): (usize, usize)) {
#[divan::bench(types = [u8, u16, u32, u64], args = BENCH_ARGS)]
fn decompress<T: NativePType + PrimInt>(bencher: Bencher, (length, run_step): (usize, usize)) {
let values = PrimitiveArray::new(
(0..=length)
.step_by(run_step)
.enumerate()
.flat_map(|(idx, x)| repeat_n(idx as u64, x))
.flat_map(|(idx, x)| {
repeat_n(
T::from(idx % T::max_value().to_usize().unwrap()).unwrap(),
x,
)
})
.collect::<Buffer<_>>(),
Validity::NonNullable,
);
Expand Down
2 changes: 1 addition & 1 deletion encodings/sparse/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ impl StatisticsVTable<&SparseArray> for SparseEncoding {

let fill_len = array.len() - values.len();
let fill_stats = if array.fill_scalar().is_null() {
StatsSet::nulls(fill_len, array.dtype())
StatsSet::nulls(fill_len)
} else {
StatsSet::constant(array.fill_scalar().clone(), fill_len)
};
Expand Down
8 changes: 2 additions & 6 deletions vortex-array/src/arrays/bool/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ use std::ops::BitAnd;

use arrow_buffer::BooleanBuffer;
use itertools::Itertools;
use vortex_dtype::{DType, Nullability};
use vortex_error::VortexResult;
use vortex_mask::Mask;

Expand All @@ -27,7 +26,7 @@ impl StatisticsVTable<&BoolArray> for BoolEncoding {

match array.validity_mask()? {
Mask::AllTrue(_) => self.compute_statistics(array.boolean_buffer(), stat),
Mask::AllFalse(v) => Ok(StatsSet::nulls(v, array.dtype())),
Mask::AllFalse(v) => Ok(StatsSet::nulls(v)),
Mask::Values(values) => self.compute_statistics(
&NullableBools(array.boolean_buffer(), values.boolean_buffer()),
stat,
Expand Down Expand Up @@ -72,10 +71,7 @@ impl StatisticsVTable<&NullableBools<'_>> for BoolEncoding {
.for_each(|next| acc.nullable_next(next));
Ok(acc.finish())
} else {
Ok(StatsSet::nulls(
array.0.len(),
&DType::Bool(Nullability::Nullable),
))
Ok(StatsSet::nulls(array.0.len()))
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/arrays/null/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ impl StatisticsVTable<&NullArray> for NullEncoding {
return Ok(StatsSet::of(stat, Precision::exact(array.nbytes())));
}

Ok(StatsSet::nulls(array.len(), &DType::Null))
Ok(StatsSet::nulls(array.len()))
}
}

Expand Down
Loading

0 comments on commit 30b0216

Please sign in to comment.