@@ -11,8 +11,9 @@ use arrow::{
11
11
use arrow_array:: {
12
12
make_array,
13
13
types:: { ArrowDictionaryKeyType , BinaryType , ByteArrayType , Utf8Type } ,
14
- Array , FixedSizeBinaryArray , FixedSizeListArray , LargeListArray , ListArray , NullArray ,
15
- PrimitiveArray , RecordBatch , RecordBatchOptions , RecordBatchReader , StringArray , StructArray ,
14
+ Array , BinaryArray , FixedSizeBinaryArray , FixedSizeListArray , LargeListArray , ListArray ,
15
+ NullArray , PrimitiveArray , RecordBatch , RecordBatchOptions , RecordBatchReader , StringArray ,
16
+ StructArray ,
16
17
} ;
17
18
use arrow_schema:: { ArrowError , DataType , Field , Fields , IntervalUnit , Schema , SchemaRef } ;
18
19
use futures:: { stream:: BoxStream , StreamExt } ;
@@ -775,6 +776,51 @@ impl ArrayGenerator for RandomBinaryGenerator {
775
776
}
776
777
}
777
778
779
+ pub struct VariableRandomBinaryGenerator {
780
+ lengths_gen : Box < dyn ArrayGenerator > ,
781
+ data_type : DataType ,
782
+ }
783
+
784
+ impl VariableRandomBinaryGenerator {
785
+ pub fn new ( min_bytes_per_element : ByteCount , max_bytes_per_element : ByteCount ) -> Self {
786
+ let lengths_dist = Uniform :: new_inclusive (
787
+ min_bytes_per_element. 0 as i32 ,
788
+ max_bytes_per_element. 0 as i32 ,
789
+ ) ;
790
+ let lengths_gen = rand_with_distribution :: < Int32Type , Uniform < i32 > > ( lengths_dist) ;
791
+
792
+ Self {
793
+ lengths_gen,
794
+ data_type : DataType :: Binary ,
795
+ }
796
+ }
797
+ }
798
+
799
+ impl ArrayGenerator for VariableRandomBinaryGenerator {
800
+ fn generate (
801
+ & mut self ,
802
+ length : RowCount ,
803
+ rng : & mut rand_xoshiro:: Xoshiro256PlusPlus ,
804
+ ) -> Result < Arc < dyn arrow_array:: Array > , ArrowError > {
805
+ let lengths = self . lengths_gen . generate ( length, rng) ?;
806
+ let lengths = lengths. as_primitive :: < Int32Type > ( ) ;
807
+ let total_length = lengths. values ( ) . iter ( ) . map ( |i| * i as usize ) . sum :: < usize > ( ) ;
808
+ let offsets = OffsetBuffer :: from_lengths ( lengths. values ( ) . iter ( ) . map ( |v| * v as usize ) ) ;
809
+ let mut bytes = vec ! [ 0 ; total_length] ;
810
+ rng. fill_bytes ( & mut bytes) ;
811
+ let bytes = Buffer :: from ( bytes) ;
812
+ Ok ( Arc :: new ( BinaryArray :: try_new ( offsets, bytes, None ) ?) )
813
+ }
814
+
815
+ fn data_type ( & self ) -> & DataType {
816
+ & self . data_type
817
+ }
818
+
819
+ fn element_size_bytes ( & self ) -> Option < ByteCount > {
820
+ None
821
+ }
822
+ }
823
+
778
824
pub struct CycleBinaryGenerator < T : ByteArrayType > {
779
825
values : Vec < u8 > ,
780
826
lengths : Vec < usize > ,
@@ -1427,7 +1473,7 @@ pub mod array {
1427
1473
pub fn blob ( ) -> Box < dyn ArrayGenerator > {
1428
1474
let mut blob_meta = HashMap :: new ( ) ;
1429
1475
blob_meta. insert ( "lance-encoding:blob" . to_string ( ) , "true" . to_string ( ) ) ;
1430
- rand_varbin ( ByteCount :: from ( 4 * 1024 * 1024 ) , true ) . with_metadata ( blob_meta)
1476
+ rand_fixedbin ( ByteCount :: from ( 4 * 1024 * 1024 ) , true ) . with_metadata ( blob_meta)
1431
1477
}
1432
1478
1433
1479
/// Create a generator that starts at a given value and increments by a given step for each element
@@ -1769,15 +1815,28 @@ pub mod array {
1769
1815
) )
1770
1816
}
1771
1817
1772
- /// Create a generator of random binary values
1773
- pub fn rand_varbin ( bytes_per_element : ByteCount , is_large : bool ) -> Box < dyn ArrayGenerator > {
1818
+ /// Create a generator of random binary values where each value has a fixed number of bytes
1819
+ pub fn rand_fixedbin ( bytes_per_element : ByteCount , is_large : bool ) -> Box < dyn ArrayGenerator > {
1774
1820
Box :: new ( RandomBinaryGenerator :: new (
1775
1821
bytes_per_element,
1776
1822
false ,
1777
1823
is_large,
1778
1824
) )
1779
1825
}
1780
1826
1827
+ /// Create a generator of random binary values where each value has a variable number of bytes
1828
+ ///
1829
+ /// The number of bytes per element will be randomly sampled from the given (inclusive) range
1830
+ pub fn rand_varbin (
1831
+ min_bytes_per_element : ByteCount ,
1832
+ max_bytes_per_element : ByteCount ,
1833
+ ) -> Box < dyn ArrayGenerator > {
1834
+ Box :: new ( VariableRandomBinaryGenerator :: new (
1835
+ min_bytes_per_element,
1836
+ max_bytes_per_element,
1837
+ ) )
1838
+ }
1839
+
1781
1840
/// Create a generator of random strings
1782
1841
///
1783
1842
/// All strings will consist entirely of printable ASCII characters
@@ -1799,6 +1858,13 @@ pub mod array {
1799
1858
Box :: new ( RandomListGenerator :: new ( child_gen, is_large) )
1800
1859
}
1801
1860
1861
+ pub fn rand_list_any (
1862
+ item_gen : Box < dyn ArrayGenerator > ,
1863
+ is_large : bool ,
1864
+ ) -> Box < dyn ArrayGenerator > {
1865
+ Box :: new ( RandomListGenerator :: new ( item_gen, is_large) )
1866
+ }
1867
+
1802
1868
pub fn rand_struct ( fields : Fields ) -> Box < dyn ArrayGenerator > {
1803
1869
let child_gens = fields
1804
1870
. iter ( )
@@ -1830,8 +1896,8 @@ pub mod array {
1830
1896
DataType :: Decimal256 ( _, _) => rand_primitive :: < Decimal256Type > ( data_type. clone ( ) ) ,
1831
1897
DataType :: Utf8 => rand_utf8 ( ByteCount :: from ( 12 ) , false ) ,
1832
1898
DataType :: LargeUtf8 => rand_utf8 ( ByteCount :: from ( 12 ) , true ) ,
1833
- DataType :: Binary => rand_varbin ( ByteCount :: from ( 12 ) , false ) ,
1834
- DataType :: LargeBinary => rand_varbin ( ByteCount :: from ( 12 ) , true ) ,
1899
+ DataType :: Binary => rand_fixedbin ( ByteCount :: from ( 12 ) , false ) ,
1900
+ DataType :: LargeBinary => rand_fixedbin ( ByteCount :: from ( 12 ) , true ) ,
1835
1901
DataType :: Dictionary ( key_type, value_type) => {
1836
1902
dict_type ( rand_type ( value_type) , key_type)
1837
1903
}
@@ -2015,7 +2081,7 @@ mod tests {
2015
2081
Int32Array :: from_iter( [ -797553329 , 1369325940 , -69174021 ] )
2016
2082
) ;
2017
2083
2018
- let mut gen = array:: rand_varbin ( ByteCount :: from ( 3 ) , false ) ;
2084
+ let mut gen = array:: rand_fixedbin ( ByteCount :: from ( 3 ) , false ) ;
2019
2085
assert_eq ! (
2020
2086
* gen . generate( RowCount :: from( 3 ) , & mut rng) . unwrap( ) ,
2021
2087
arrow_array:: BinaryArray :: from_iter_values( [
@@ -2046,6 +2112,16 @@ mod tests {
2046
2112
// Sanity check to ensure we're getting at least some rng
2047
2113
assert ! ( bools. false_count( ) > 100 ) ;
2048
2114
assert ! ( bools. true_count( ) > 100 ) ;
2115
+
2116
+ let mut gen = array:: rand_varbin ( ByteCount :: from ( 2 ) , ByteCount :: from ( 4 ) ) ;
2117
+ assert_eq ! (
2118
+ * gen . generate( RowCount :: from( 3 ) , & mut rng) . unwrap( ) ,
2119
+ arrow_array:: BinaryArray :: from_iter_values( [
2120
+ vec![ 56 , 122 , 157 , 34 ] ,
2121
+ vec![ 58 , 51 ] ,
2122
+ vec![ 41 , 184 , 125 ]
2123
+ ] )
2124
+ ) ;
2049
2125
}
2050
2126
2051
2127
#[ test]
0 commit comments