-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Mask sets entries of an array to null. I like the analogy to light: the array is a sequence of lights (each value might be a different wavelength). Null is represented by the absence of light. Placing a mask (i.e. a piece of plastic with slits) over the array causes those values where the mask is present (i.e. "on", "true") to be dark. An example in pseudo-code: ```rust a = [1, 2, 3, 4, 5] a_mask = [t, f, f, t, f] mask(a, a_mask) == [null, 2, 3, null, 5] ``` Specializations --------------- I only fall back to Arrow for two of the core arrays: - Sparse. I was skeptical that I could do better than decompressing and applying it. - Constant. If the mask is sparse, SparseArray might be a good choice. I didn't investigate. For the non-core arrays, I'm missing the following. I'm not clear that I can beat decompression for run end. The others are easy enough but require some amount of typing and testing. - fastlanes - fsst - roaring - runend - runend-bool - zigzag Naming ------ Pandas also calls this operation [`mask`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mask.html) but accepts an optional second argument which is an array of values to use instead of null (which makes Pandas' mask more like an `if_else`). Arrow-rs calls this [`nullif`](https://arrow.apache.org/rust/arrow/compute/fn.nullif.html). Arrow-cpp has [`if_else(condition, consequent, alternate)`](https://arrow.apache.org/docs/cpp/compute.html#cpp-compute-scalar-selections) and [`replace_with_mask(array, mask, replacements)`](https://arrow.apache.org/docs/cpp/compute.html#replace-functions) both of which can implement our `mask` by passing a `NullArray` as the third argument.
- Loading branch information
Showing
32 changed files
with
1,220 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
use vortex_array::compute::{mask, MaskFn}; | ||
use vortex_array::{Array, IntoArray}; | ||
use vortex_error::VortexResult; | ||
use vortex_mask::Mask; | ||
|
||
use crate::{ALPRDArray, ALPRDEncoding}; | ||
|
||
impl MaskFn<ALPRDArray> for ALPRDEncoding { | ||
fn mask(&self, array: &ALPRDArray, filter_mask: Mask) -> VortexResult<Array> { | ||
Ok(ALPRDArray::try_new( | ||
array.dtype().as_nullable(), | ||
mask(&array.left_parts(), filter_mask)?, | ||
array.left_parts_dict(), | ||
array.right_parts(), | ||
array.right_bit_width(), | ||
array.left_parts_patches(), | ||
)? | ||
.into_array()) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use rstest::rstest;
    use vortex_array::array::PrimitiveArray;
    use vortex_array::compute::test_harness::test_mask;
    use vortex_array::IntoArray as _;

    use crate::{ALPRDFloat, RDEncoder};

    /// Runs the shared mask test harness on an ALP-RD array with no nulls,
    /// built from two sample values plus a much larger "outlier" value.
    #[rstest]
    #[case(0.1f32, 0.2f32, 3e25f32)]
    #[case(0.1f64, 0.2f64, 3e100f64)]
    fn test_mask_simple<T: ALPRDFloat>(#[case] a: T, #[case] b: T, #[case] outlier: T) {
        // The encoder is constructed from `a` and `b` only; `outlier` is
        // absent from that sample.
        let encoder = RDEncoder::new(&[a, b]);
        let values = PrimitiveArray::from_iter([a, b, outlier, b, outlier]);
        let encoded = encoder.encode(&values);

        test_mask(encoded.into_array());
    }

    /// Runs the shared mask test harness on an ALP-RD array whose input
    /// already contains nulls.
    #[rstest]
    #[case(0.1f32, 3e25f32)]
    #[case(0.5f64, 1e100f64)]
    fn test_mask_with_nulls<T: ALPRDFloat>(#[case] a: T, #[case] outlier: T) {
        let encoder = RDEncoder::new(&[a]);
        let values =
            PrimitiveArray::from_option_iter([Some(a), None, Some(outlier), Some(a), None]);
        let encoded = encoder.encode(&values);

        test_mask(encoded.into_array());
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#![allow(clippy::unwrap_used)] | ||
|
||
use divan::Bencher; | ||
use rand::rngs::StdRng; | ||
use rand::{Rng, SeedableRng as _}; | ||
use vortex_array::array::PrimitiveArray; | ||
use vortex_array::compute::mask; | ||
use vortex_array::IntoArray as _; | ||
use vortex_dict::DictArray; | ||
use vortex_mask::Mask; | ||
|
||
// Benchmark binary entry point: hands control to divan's runner.
fn main() { | ||
divan::main(); | ||
} | ||
|
||
fn filter_mask(len: usize, fraction_masked: f64, rng: &mut StdRng) -> Mask { | ||
let indices = (0..len) | ||
.filter(|_| rng.gen_bool(fraction_masked)) | ||
.collect::<Vec<usize>>(); | ||
Mask::from_indices(len, indices) | ||
} | ||
|
||
#[divan::bench(args = [ | ||
(0.9, 0.9), | ||
(0.9, 0.5), | ||
(0.9, 0.1), | ||
(0.9, 0.01), | ||
(0.5, 0.9), | ||
(0.5, 0.5), | ||
(0.5, 0.1), | ||
(0.5, 0.01), | ||
(0.1, 0.9), | ||
(0.1, 0.5), | ||
(0.1, 0.1), | ||
(0.1, 0.01), | ||
(0.01, 0.9), | ||
(0.01, 0.5), | ||
(0.01, 0.1), | ||
(0.01, 0.01), | ||
])] | ||
fn bench_dict_mask(bencher: Bencher, (fraction_valid, fraction_masked): (f64, f64)) { | ||
let mut rng = StdRng::seed_from_u64(0); | ||
|
||
let len = 65_535; | ||
let codes = PrimitiveArray::from_iter((0..len).map(|_| { | ||
if rng.gen_bool(fraction_valid) { | ||
1u64 | ||
} else { | ||
0u64 | ||
} | ||
})) | ||
.into_array(); | ||
let values = PrimitiveArray::from_option_iter([None, Some(42i32)]).into_array(); | ||
let array = DictArray::try_new(codes, values).unwrap().into_array(); | ||
let filter_mask = filter_mask(len, fraction_masked, &mut rng); | ||
bencher | ||
.with_inputs(|| (&array, filter_mask.clone())) | ||
.bench_values(|(array, filter_mask)| mask(array, filter_mask).unwrap()); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
use vortex_error::VortexResult; | ||
use vortex_mask::Mask; | ||
|
||
use crate::array::{BoolArray, BoolEncoding}; | ||
use crate::compute::MaskFn; | ||
use crate::{Array, IntoArray}; | ||
|
||
impl MaskFn<BoolArray> for BoolEncoding { | ||
fn mask(&self, array: &BoolArray, mask: Mask) -> VortexResult<Array> { | ||
BoolArray::try_new(array.boolean_buffer(), array.validity().mask(&mask)?) | ||
.map(IntoArray::into_array) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.