Skip to content

Commit d34fa95

Browse files
authored
chore: add binary array generator that generates different sized binary items (#3390)
The current generator for binary data always produces elements of exactly the same length. This PR adds a variant that produces binary elements of differing lengths (each length is sampled uniformly from a given inclusive range). The existing generator is renamed to `rand_fixedbin`, and the new one takes over the `rand_varbin` name.
1 parent c58814a commit d34fa95

File tree

2 files changed

+85
-9
lines changed

2 files changed

+85
-9
lines changed

rust/lance-datagen/benches/array_gen.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ fn bench_rand_gen(c: &mut Criterion) {
119119
lance_datagen::array::rand::<Int64Type>()
120120
});
121121
bench_gen(&mut group, "rand_varbin", || {
122-
lance_datagen::array::rand_varbin(ByteCount::from(12), false)
122+
lance_datagen::array::rand_fixedbin(ByteCount::from(12), false)
123123
});
124124
bench_gen(&mut group, "rand_utf8", || {
125125
lance_datagen::array::rand_utf8(ByteCount::from(12), false)

rust/lance-datagen/src/generator.rs

+84-8
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ use arrow::{
1111
use arrow_array::{
1212
make_array,
1313
types::{ArrowDictionaryKeyType, BinaryType, ByteArrayType, Utf8Type},
14-
Array, FixedSizeBinaryArray, FixedSizeListArray, LargeListArray, ListArray, NullArray,
15-
PrimitiveArray, RecordBatch, RecordBatchOptions, RecordBatchReader, StringArray, StructArray,
14+
Array, BinaryArray, FixedSizeBinaryArray, FixedSizeListArray, LargeListArray, ListArray,
15+
NullArray, PrimitiveArray, RecordBatch, RecordBatchOptions, RecordBatchReader, StringArray,
16+
StructArray,
1617
};
1718
use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef};
1819
use futures::{stream::BoxStream, StreamExt};
@@ -775,6 +776,51 @@ impl ArrayGenerator for RandomBinaryGenerator {
775776
}
776777
}
777778

779+
pub struct VariableRandomBinaryGenerator {
780+
lengths_gen: Box<dyn ArrayGenerator>,
781+
data_type: DataType,
782+
}
783+
784+
impl VariableRandomBinaryGenerator {
785+
pub fn new(min_bytes_per_element: ByteCount, max_bytes_per_element: ByteCount) -> Self {
786+
let lengths_dist = Uniform::new_inclusive(
787+
min_bytes_per_element.0 as i32,
788+
max_bytes_per_element.0 as i32,
789+
);
790+
let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist);
791+
792+
Self {
793+
lengths_gen,
794+
data_type: DataType::Binary,
795+
}
796+
}
797+
}
798+
799+
impl ArrayGenerator for VariableRandomBinaryGenerator {
800+
fn generate(
801+
&mut self,
802+
length: RowCount,
803+
rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
804+
) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
805+
let lengths = self.lengths_gen.generate(length, rng)?;
806+
let lengths = lengths.as_primitive::<Int32Type>();
807+
let total_length = lengths.values().iter().map(|i| *i as usize).sum::<usize>();
808+
let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
809+
let mut bytes = vec![0; total_length];
810+
rng.fill_bytes(&mut bytes);
811+
let bytes = Buffer::from(bytes);
812+
Ok(Arc::new(BinaryArray::try_new(offsets, bytes, None)?))
813+
}
814+
815+
fn data_type(&self) -> &DataType {
816+
&self.data_type
817+
}
818+
819+
fn element_size_bytes(&self) -> Option<ByteCount> {
820+
None
821+
}
822+
}
823+
778824
pub struct CycleBinaryGenerator<T: ByteArrayType> {
779825
values: Vec<u8>,
780826
lengths: Vec<usize>,
@@ -1427,7 +1473,7 @@ pub mod array {
14271473
pub fn blob() -> Box<dyn ArrayGenerator> {
14281474
let mut blob_meta = HashMap::new();
14291475
blob_meta.insert("lance-encoding:blob".to_string(), "true".to_string());
1430-
rand_varbin(ByteCount::from(4 * 1024 * 1024), true).with_metadata(blob_meta)
1476+
rand_fixedbin(ByteCount::from(4 * 1024 * 1024), true).with_metadata(blob_meta)
14311477
}
14321478

14331479
/// Create a generator that starts at a given value and increments by a given step for each element
@@ -1769,15 +1815,28 @@ pub mod array {
17691815
))
17701816
}
17711817

1772-
/// Create a generator of random binary values
1773-
pub fn rand_varbin(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
1818+
/// Create a generator of random binary values where each value has a fixed number of bytes
1819+
pub fn rand_fixedbin(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
17741820
Box::new(RandomBinaryGenerator::new(
17751821
bytes_per_element,
17761822
false,
17771823
is_large,
17781824
))
17791825
}
17801826

1827+
/// Create a generator of random binary values where each value has a variable number of bytes
1828+
///
1829+
/// The number of bytes per element will be randomly sampled from the given (inclusive) range
1830+
pub fn rand_varbin(
1831+
min_bytes_per_element: ByteCount,
1832+
max_bytes_per_element: ByteCount,
1833+
) -> Box<dyn ArrayGenerator> {
1834+
Box::new(VariableRandomBinaryGenerator::new(
1835+
min_bytes_per_element,
1836+
max_bytes_per_element,
1837+
))
1838+
}
1839+
17811840
/// Create a generator of random strings
17821841
///
17831842
/// All strings will consist entirely of printable ASCII characters
@@ -1799,6 +1858,13 @@ pub mod array {
17991858
Box::new(RandomListGenerator::new(child_gen, is_large))
18001859
}
18011860

1861+
pub fn rand_list_any(
1862+
item_gen: Box<dyn ArrayGenerator>,
1863+
is_large: bool,
1864+
) -> Box<dyn ArrayGenerator> {
1865+
Box::new(RandomListGenerator::new(item_gen, is_large))
1866+
}
1867+
18021868
pub fn rand_struct(fields: Fields) -> Box<dyn ArrayGenerator> {
18031869
let child_gens = fields
18041870
.iter()
@@ -1830,8 +1896,8 @@ pub mod array {
18301896
DataType::Decimal256(_, _) => rand_primitive::<Decimal256Type>(data_type.clone()),
18311897
DataType::Utf8 => rand_utf8(ByteCount::from(12), false),
18321898
DataType::LargeUtf8 => rand_utf8(ByteCount::from(12), true),
1833-
DataType::Binary => rand_varbin(ByteCount::from(12), false),
1834-
DataType::LargeBinary => rand_varbin(ByteCount::from(12), true),
1899+
DataType::Binary => rand_fixedbin(ByteCount::from(12), false),
1900+
DataType::LargeBinary => rand_fixedbin(ByteCount::from(12), true),
18351901
DataType::Dictionary(key_type, value_type) => {
18361902
dict_type(rand_type(value_type), key_type)
18371903
}
@@ -2015,7 +2081,7 @@ mod tests {
20152081
Int32Array::from_iter([-797553329, 1369325940, -69174021])
20162082
);
20172083

2018-
let mut gen = array::rand_varbin(ByteCount::from(3), false);
2084+
let mut gen = array::rand_fixedbin(ByteCount::from(3), false);
20192085
assert_eq!(
20202086
*gen.generate(RowCount::from(3), &mut rng).unwrap(),
20212087
arrow_array::BinaryArray::from_iter_values([
@@ -2046,6 +2112,16 @@ mod tests {
20462112
// Sanity check to ensure we're getting at least some rng
20472113
assert!(bools.false_count() > 100);
20482114
assert!(bools.true_count() > 100);
2115+
2116+
let mut gen = array::rand_varbin(ByteCount::from(2), ByteCount::from(4));
2117+
assert_eq!(
2118+
*gen.generate(RowCount::from(3), &mut rng).unwrap(),
2119+
arrow_array::BinaryArray::from_iter_values([
2120+
vec![56, 122, 157, 34],
2121+
vec![58, 51],
2122+
vec![41, 184, 125]
2123+
])
2124+
);
20492125
}
20502126

20512127
#[test]

0 commit comments

Comments
 (0)