diff --git a/CHANGELOG.md b/CHANGELOG.md index d017a90a15b..d2b8d81e01d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,6 +60,7 @@ - `zerotrie` - Add `as_borrowed_slice` and `AsRef` impl (https://github.com/unicode-org/icu4x/pull/4381) - Add `ZeroTrieSimpleAsciiCursor` for manual iteration (https://github.com/unicode-org/icu4x/pull/4383) + - Increase bound of `p` to solve more perfect hash functions; _might_ break serialized ZeroTriePerfectHash from previous versions (https://github.com/unicode-org/icu4x/pull/4888) - `zerovec` - Change `ZeroHashMap` to use `twox-hash` (https://github.com/unicode-org/icu4x/pull/4592) - `writeable` diff --git a/utils/zerotrie/src/byte_phf/builder.rs b/utils/zerotrie/src/byte_phf/builder.rs index 421bd7230bd..20a8eecba0e 100644 --- a/utils/zerotrie/src/byte_phf/builder.rs +++ b/utils/zerotrie/src/byte_phf/builder.rs @@ -26,7 +26,7 @@ pub fn find(bytes: &[u8]) -> Result<(u8, Vec), Error> { let mut bqs = vec![0u8; N]; let mut seen = vec![false; N]; - let mut max_allowable_p = P_FAST_MAX; + let max_allowable_p = P_FAST_MAX; let mut max_allowable_q = Q_FAST_MAX; 'p_loop: loop { @@ -67,13 +67,15 @@ pub fn find(bytes: &[u8]) -> Result<(u8, Vec), Error> { num_max_q += 1; bqs[i] = 0; if i == 0 || num_max_q > MAX_L2_SEARCH_MISSES { - if p == max_allowable_p && max_allowable_p != P_REAL_MAX { + if p == max_allowable_p && max_allowable_q != Q_REAL_MAX { // println!("Could not solve fast function: trying again: {bytes:?}"); - max_allowable_p = P_REAL_MAX; max_allowable_q = Q_REAL_MAX; p = 0; continue 'p_loop; - } else if p == P_REAL_MAX { + } else if p == max_allowable_p { + // If a fallback algorithm for `p` is added, relax this assertion + // and re-run the loop with a higher `max_allowable_p`. + debug_assert_eq!(max_allowable_p, P_REAL_MAX); // println!("Could not solve PHF function"); return Err(Error::CouldNotSolvePerfectHash); } else { diff --git a/utils/zerotrie/src/byte_phf/mod.rs b/utils/zerotrie/src/byte_phf/mod.rs index bcf77607968..b0cb6d09a0d 100644 --- a/utils/zerotrie/src/byte_phf/mod.rs +++ b/utils/zerotrie/src/byte_phf/mod.rs @@ -59,14 +59,16 @@ mod cached_owned; pub use cached_owned::PerfectByteHashMapCacheOwned; /// The cutoff for the fast version of [`f1`]. -const P_FAST_MAX: u8 = 11; +#[cfg(feature = "alloc")] // used in the builder code +const P_FAST_MAX: u8 = 95; /// The cutoff for the fast version of [`f2`]. const Q_FAST_MAX: u8 = 95; /// The maximum allowable value of `p`. This could be raised if found to be necessary. +/// Values exceeding P_FAST_MAX could use a different `p` algorithm by modifying [`f1`]. #[cfg(feature = "alloc")] // used in the builder code -const P_REAL_MAX: u8 = 15; +const P_REAL_MAX: u8 = P_FAST_MAX; /// The maximum allowable value of `q`. This could be raised if found to be necessary. #[cfg(feature = "alloc")] // used in the builder code @@ -118,13 +120,10 @@ pub fn f1(byte: u8, p: u8, n: usize) -> usize { if p == 0 { byte as usize % n } else { - let mut result = byte ^ p ^ byte.wrapping_shr(p as u32); - // In almost all cases, the PHF works with the above constant-time operation. - // However, to crack a few difficult cases, we fall back to the linear-time - // operation shown below. - for _ in P_FAST_MAX..p { - result = result ^ (result << 1) ^ (result >> 1); - } + // `p` always uses the below constant-time operation. If needed, we + // could add some other operation here with `p > P_FAST_MAX` to solve + // difficult cases if the need arises. + let result = byte ^ p ^ byte.wrapping_shr(p as u32); result as usize % n } } @@ -359,6 +358,24 @@ mod tests { assert!(count_fastq >= count_slowq * 100); } + #[test] + fn test_hard_cases() { + let keys = [ + 0u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, + 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 126, 195, 196, + ]; + + let computed = PerfectByteHashMap::try_new(&keys).unwrap(); + let (p, qmax) = computed.p_qmax().unwrap(); + assert_eq!(p, 69); + assert_eq!(qmax, 67); + } + #[test] fn test_build_read_small() { #[derive(Debug)]