Skip to content

Commit 0859c9a

Browse files
committed Jul 19, 2024
add tests for hash util
1 parent 2fa2831 commit 0859c9a

File tree

1 file changed

+105
-12
lines changed

1 file changed

+105
-12
lines changed
 

‎datafusion/common/src/hash_utils.rs

+105-12
Original file line number | Diff line number | Diff line change
@@ -490,22 +490,57 @@ mod tests {
490490
Ok(())
491491
}
492492

493-
#[test]
494-
fn create_hashes_binary() -> Result<()> {
495-
let byte_array = Arc::new(BinaryArray::from_vec(vec![
496-
&[4, 3, 2],
497-
&[4, 3, 2],
498-
&[1, 2, 3],
499-
]));
493+
macro_rules! create_hash_binary {
494+
($NAME:ident, $ARRAY:ty) => {
495+
#[cfg(not(feature = "force_hash_collisions"))]
496+
#[test]
497+
fn $NAME() {
498+
let binary = [
499+
Some(b"short".to_byte_slice()),
500+
None,
501+
Some(b"long but different 12 bytes string"),
502+
Some(b"short2"),
503+
Some(b"Longer than 12 bytes string"),
504+
Some(b"short"),
505+
Some(b"Longer than 12 bytes string"),
506+
];
507+
508+
let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
509+
let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());
510+
511+
let random_state = RandomState::with_seeds(0, 0, 0, 0);
512+
513+
let mut binary_hashes = vec![0; binary.len()];
514+
create_hashes(&[binary_array], &random_state, &mut binary_hashes)
515+
.unwrap();
516+
517+
let mut ref_hashes = vec![0; binary.len()];
518+
create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();
519+
520+
// Null values result in a zero hash, non-null values in a non-zero hash
521+
for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
522+
match val {
523+
Some(_) => assert_ne!(*hash, 0),
524+
None => assert_eq!(*hash, 0),
525+
}
526+
}
500527

501-
let random_state = RandomState::with_seeds(0, 0, 0, 0);
502-
let hashes_buff = &mut vec![0; byte_array.len()];
503-
let hashes = create_hashes(&[byte_array], &random_state, hashes_buff)?;
504-
assert_eq!(hashes.len(), 3,);
528+
// same logical values should hash to the same hash value
529+
assert_eq!(binary_hashes, ref_hashes);
505530

506-
Ok(())
531+
// Same values should map to same hash values
532+
assert_eq!(binary_hashes[0], binary_hashes[5]);
533+
assert_eq!(binary_hashes[4], binary_hashes[6]);
534+
535+
// different binary should map to different hash values
536+
assert_ne!(binary_hashes[0], binary_hashes[2]);
537+
}
538+
};
507539
}
508540

541+
create_hash_binary!(binary_array, BinaryArray);
542+
create_hash_binary!(binary_view_array, BinaryViewArray);
543+
509544
#[test]
510545
fn create_hashes_fixed_size_binary() -> Result<()> {
511546
let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
@@ -521,6 +556,64 @@ mod tests {
521556
Ok(())
522557
}
523558

559+
macro_rules! create_hash_string {
560+
($NAME:ident, $ARRAY:ty) => {
561+
#[cfg(not(feature = "force_hash_collisions"))]
562+
#[test]
563+
fn $NAME() {
564+
let strings = [
565+
Some("short"),
566+
None,
567+
Some("long but different 12 bytes string"),
568+
Some("short2"),
569+
Some("Longer than 12 bytes string"),
570+
Some("short"),
571+
Some("Longer than 12 bytes string"),
572+
];
573+
574+
let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
575+
let dict_array = Arc::new(
576+
strings
577+
.iter()
578+
.cloned()
579+
.collect::<DictionaryArray<Int8Type>>(),
580+
);
581+
582+
let random_state = RandomState::with_seeds(0, 0, 0, 0);
583+
584+
let mut string_hashes = vec![0; strings.len()];
585+
create_hashes(&[string_array], &random_state, &mut string_hashes)
586+
.unwrap();
587+
588+
let mut dict_hashes = vec![0; strings.len()];
589+
create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
590+
591+
// Null values result in a zero hash, non-null values in a non-zero hash
592+
for (val, hash) in strings.iter().zip(string_hashes.iter()) {
593+
match val {
594+
Some(_) => assert_ne!(*hash, 0),
595+
None => assert_eq!(*hash, 0),
596+
}
597+
}
598+
599+
// same logical values should hash to the same hash value
600+
assert_eq!(string_hashes, dict_hashes);
601+
602+
// Same values should map to same hash values
603+
assert_eq!(string_hashes[0], string_hashes[5]);
604+
assert_eq!(string_hashes[4], string_hashes[6]);
605+
606+
// different strings should map to different hash values
607+
assert_ne!(string_hashes[0], string_hashes[2]);
608+
}
609+
};
610+
}
611+
612+
create_hash_string!(string_array, StringArray);
613+
create_hash_string!(large_string_array, LargeStringArray);
614+
create_hash_string!(string_view_array, StringViewArray);
615+
create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
616+
524617
#[test]
525618
// Tests actual values of hashes, which are different if forcing collisions
526619
#[cfg(not(feature = "force_hash_collisions"))]

0 commit comments

Comments
 (0)