diff --git a/Cargo.toml b/Cargo.toml
index dee034bf..f36ffe73 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -33,7 +33,7 @@ opt-level = 1 # required for the tail calls in inflate to optimize

 [workspace.dependencies]
 libloading = "0.8.1"
-libz-sys = { version = "1.1.19", default-features = false, features = ["zlib-ng"] } # use libz-ng in libz compat mode
+libz-sys = { version = "1.1.21", default-features = false, features = ["zlib-ng"] } # use libz-ng in libz compat mode
 arbitrary = { version = "1.0" }
 quickcheck = { version = "1.0.3", default-features = false, features = [] }
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 22f71d12..58c74990 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -23,7 +23,7 @@ features = ["arbitrary-derive"]

 [dependencies]
 libc = "0.2.151"
-libz-ng-sys = "1.1.12"
+libz-ng-sys = "1.1.21"
 libloading = "0.8.1"
 crc32fast = "1.3.2"
 rstest = "0.23.0"
diff --git a/test-libz-rs-sys/src/deflate.rs b/test-libz-rs-sys/src/deflate.rs
index ae516fe3..feac3302 100644
--- a/test-libz-rs-sys/src/deflate.rs
+++ b/test-libz-rs-sys/src/deflate.rs
@@ -1815,14 +1815,9 @@ mod fuzz_based_tests {
         // depending on the algorithm that can give different results. Note that in this case, the
         // output of aarch64 and x86_64 do line up with the current aarch64 hashing approach.

-        let output_other = &[
-            24, 149, 99, 96, 102, 24, 104, 160, 7, 38, 57, 96, 92, 117, 6, 14, 6, 38, 60, 202, 65,
-            14, 86, 99, 208, 3, 3, 6, 67, 6, 38, 22, 122, 184, 17, 2, 40, 119, 41, 84, 175, 26, 0,
-            172, 56, 3, 232,
-        ];
-
-        // longest_match does not perform unaligned 32-bit reads/writes on s390x
-        let output_s390x = &[
+        // historical note: previously we used the crc32 hardware instructions, but later versions
+        // of zlib-ng stopped doing that (the dispatch cost was too high), and we followed suit.
+        let output = &[
             24, 149, 99, 96, 102, 24, 104, 160, 7, 38, 57, 96, 92, 117, 6, 14, 6, 38, 60, 202, 65,
             14, 86, 99, 208, 3, 3, 6, 67, 6, 38, 22, 58, 56, 17, 10, 40, 119, 41, 84, 175, 26, 0,
             172, 56, 3, 232,
@@ -1852,22 +1847,13 @@ mod fuzz_based_tests {
                 mem_level: 2,
                 strategy: Strategy::Default,
             },
-            if cfg!(any(miri, target_arch = "s390x", target_family = "wasm")) {
-                output_s390x
-            } else {
-                output_other
-            },
+            output,
         )
     }

     #[test]
-    #[cfg_attr(
-        target_family = "wasm",
-        ignore = "zlib-ng compresses differently on wasm"
-    )]
     fn longest_match_difference() {
-        // the output on aarch64 and x86_64: fully featured modern targets
-        let output_other = &[
+        let output = &[
             24, 87, 83, 81, 97, 100, 96, 96, 228, 98, 0, 3, 123, 6, 6, 77, 21, 16, 67, 5, 36, 10,
             102, 73, 139, 67, 164, 24, 194, 64, 32, 144, 75, 55, 16, 5, 248, 65, 65, 52, 152, 116,
             99, 100, 96, 96, 16, 98, 96, 151, 241, 243, 243, 11, 12, 52, 128, 41, 2, 153, 206, 6,
@@ -1877,31 +1863,6 @@ mod fuzz_based_tests {
             82,
         ];

-        // longest_match does not perform unaligned 64-bit reads/writes on i686
-        let output_i686 = &[
-            24, 87, 83, 81, 97, 100, 96, 96, 228, 98, 0, 3, 123, 6, 6, 77, 21, 16, 67, 5, 36, 10,
-            102, 73, 139, 67, 164, 24, 194, 64, 32, 144, 75, 55, 16, 5, 248, 65, 65, 52, 152, 116,
-            99, 100, 96, 96, 16, 98, 96, 151, 241, 243, 243, 11, 12, 52, 128, 41, 2, 153, 206, 6,
-            50, 21, 106, 20, 20, 56, 66, 40, 5, 48, 169, 98, 13, 166, 4, 24, 98, 25, 20, 192, 138,
-            173, 37, 24, 184, 32, 64, 65, 26, 68, 50, 112, 128, 57, 26, 32, 83, 224, 134, 73, 162,
-            154, 8, 7, 14, 40, 60, 78, 20, 30, 8, 48, 97, 136, 48, 48, 128, 125, 128, 36, 0, 0,
-            125, 74, 22, 82,
-        ];
-
-        // longest_match does not perform unaligned 32-bit reads/writes on s390x
-        let output_s390x = &[
-            24, 87, 83, 81, 97, 100, 96, 96, 228, 98, 0, 3, 123, 6, 6, 77, 21, 16, 67, 5, 36, 10,
-            102, 73, 139, 67, 164, 24, 194, 64, 32, 144, 75, 55, 16, 5, 248, 65, 65, 52, 152, 116,
-            99, 100, 96, 96, 16, 98, 96, 151, 241, 243, 243, 11, 12, 52, 128, 41, 2, 153, 206, 6,
-            50, 21, 106, 20, 20, 56, 66, 40, 5, 48, 169, 98, 13, 166, 4, 24, 98, 25, 20, 192, 138,
-            173, 37, 24, 184, 32, 64, 65, 26, 68, 50, 112, 128, 57, 26, 32, 83, 224, 134, 73, 66,
-            140, 192, 0, 14, 40, 60, 78, 20, 30, 8, 48, 97, 136, 48, 48, 128, 125, 128, 36, 0, 0,
-            125, 74, 22, 82,
-        ];
-
-        assert_ne!(output_i686.as_slice(), output_other.as_slice());
-        assert_ne!(output_i686.as_slice(), output_s390x.as_slice());
-
         fuzz_based_test(
             &[
                 36, 36, 1, 0, 0, 1, 10, 0, 0, 0, 0, 0, 0, 63, 0, 0, 41, 36, 0, 0, 0, 0, 36, 36, 1,
@@ -1924,13 +1885,27 @@ mod fuzz_based_tests {
                 mem_level: 6,
                 strategy: Strategy::Default,
             },
-            if cfg!(target_arch = "x86") {
-                output_i686
-            } else if cfg!(target_arch = "s390x") {
-                output_s390x
-            } else {
-                output_other
-            },
+            output,
+        )
+    }
+
+    // a change introduced in [commit].
+    // make sure we match the new behavior, and fail if the test suite runs against older
+    // versions of zlib-ng
+    //
+    // [commit]: https://github.com/zlib-ng/zlib-ng/commit/322753f36e833343ae030e499564691da15eef32
+    #[test]
+    fn deflate_medium_bypass() {
+        let mut input = [0u8; 268];
+        input[0] = 0x16;
+        input[263] = 0x5;
+
+        fuzz_based_test(
+            &input,
+            DeflateConfig::default(),
+            &[
+                120, 156, 19, 99, 24, 5, 12, 12, 12, 172, 32, 18, 0, 24, 45, 0, 28,
+            ],
         )
     }
 }
diff --git a/zlib-rs/src/deflate.rs b/zlib-rs/src/deflate.rs
index f796b916..0c875c1d 100644
--- a/zlib-rs/src/deflate.rs
+++ b/zlib-rs/src/deflate.rs
@@ -14,7 +14,7 @@ use crate::{
 use self::{
     algorithm::CONFIGURATION_TABLE,
-    hash_calc::{Crc32HashCalc, HashCalcVariant, RollHashCalc, StandardHashCalc},
+    hash_calc::{HashCalcVariant, RollHashCalc, StandardHashCalc},
     pending::Pending,
     trees_tbl::STATIC_LTREE,
     window::Window,
@@ -1388,9 +1388,6 @@ impl<'a> State<'a> {
     pub(crate) fn update_hash(&self, h: u32, val: u32) -> u32 {
         match self.hash_calc_variant {
             HashCalcVariant::Standard => StandardHashCalc::update_hash(h, val),
-            // SAFETY: self.hash_calc_variant is set by HashCalcVariant::for_max_chain_length,
-            // which avoids choosing Crc32 if the system doesn't have support.
-            HashCalcVariant::Crc32 => unsafe { Crc32HashCalc::update_hash(h, val) },
             HashCalcVariant::Roll => RollHashCalc::update_hash(h, val),
         }
     }
@@ -1399,9 +1396,6 @@ impl<'a> State<'a> {
     pub(crate) fn quick_insert_string(&mut self, string: usize) -> u16 {
         match self.hash_calc_variant {
             HashCalcVariant::Standard => StandardHashCalc::quick_insert_string(self, string),
-            // SAFETY: self.hash_calc_variant is set by HashCalcVariant::for_max_chain_length,
-            // which avoids choosing Crc32 if the system doesn't have support.
-            HashCalcVariant::Crc32 => unsafe { Crc32HashCalc::quick_insert_string(self, string) },
             HashCalcVariant::Roll => RollHashCalc::quick_insert_string(self, string),
         }
     }
@@ -1410,9 +1404,6 @@ impl<'a> State<'a> {
     pub(crate) fn insert_string(&mut self, string: usize, count: usize) {
         match self.hash_calc_variant {
             HashCalcVariant::Standard => StandardHashCalc::insert_string(self, string, count),
-            // SAFETY: self.hash_calc_variant is set by HashCalcVariant::for_max_chain_length,
-            // which avoids choosing Crc32 if the system doesn't have support.
-            HashCalcVariant::Crc32 => unsafe { Crc32HashCalc::insert_string(self, string, count) },
             HashCalcVariant::Roll => RollHashCalc::insert_string(self, string, count),
         }
     }
@@ -4196,35 +4187,14 @@ mod test {
             strategy: Strategy::Default,
         };

-        let crc32 = [
-            24, 149, 99, 96, 96, 96, 96, 208, 6, 17, 112, 138, 129, 193, 128, 1, 29, 24, 50, 208,
-            1, 200, 146, 169, 79, 24, 74, 59, 96, 147, 52, 71, 22, 70, 246, 88, 26, 94, 80, 128,
-            83, 6, 162, 219, 144, 76, 183, 210, 5, 8, 67, 105, 7, 108, 146, 230, 216, 133, 145,
-            129, 22, 3, 3, 131, 17, 3, 0, 3, 228, 25, 128,
-        ];
-
-        let other = [
+        let expected = [
             24, 149, 99, 96, 96, 96, 96, 208, 6, 17, 112, 138, 129, 193, 128, 1, 29, 24, 50, 208,
             1, 200, 146, 169, 79, 24, 74, 59, 96, 147, 52, 71, 22, 70, 246, 88, 26, 94, 80, 128,
             83, 6, 162, 219, 144, 76, 183, 210, 5, 8, 67, 105, 36, 159, 35, 128, 57, 118, 97, 100,
             160, 197, 192, 192, 96, 196, 0, 0, 3, 228, 25, 128,
         ];

-        // the output is slightly different based on what hashing algorithm is used
-        match HashCalcVariant::for_compression_level(config.level as usize) {
-            HashCalcVariant::Crc32 => {
-                // the aarch64 hashing algorithm is different from the standard algorithm, but in
-                // this case they turn out to give the same output. Beware!
-                if cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64") {
-                    fuzz_based_test(&input, config, &crc32);
-                } else {
-                    fuzz_based_test(&input, config, &other);
-                }
-            }
-            HashCalcVariant::Standard | HashCalcVariant::Roll => {
-                fuzz_based_test(&input, config, &other);
-            }
-        }
+        fuzz_based_test(&input, config, &expected);
     }

     #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
@@ -4243,5 +4213,4 @@
         // TODO: determine whether changing the aligment of this field will improve performance.
         const _: () = assert!(offset_of!(State, strstart) == 2824);
     }
-
 }
diff --git a/zlib-rs/src/deflate/algorithm/medium.rs b/zlib-rs/src/deflate/algorithm/medium.rs
index 435c2e5a..c4a413a2 100644
--- a/zlib-rs/src/deflate/algorithm/medium.rs
+++ b/zlib-rs/src/deflate/algorithm/medium.rs
@@ -229,12 +229,8 @@ fn insert_match(state: &mut State, mut m: Match) {
         return;
     }

-    /* Insert new strings in the hash table only if the match length
-     * is not too large. This saves time but degrades compression.
-     */
-    if (m.match_length as usize) <= 16 * state.max_insert_length()
-        && state.lookahead >= WANT_MIN_MATCH
-    {
+    // Insert new strings in the hash table
+    if state.lookahead >= WANT_MIN_MATCH {
         m.match_length -= 1; /* string at strstart already in table */
         m.strstart += 1;
diff --git a/zlib-rs/src/deflate/hash_calc.rs b/zlib-rs/src/deflate/hash_calc.rs
index a46f533e..880dd0cd 100644
--- a/zlib-rs/src/deflate/hash_calc.rs
+++ b/zlib-rs/src/deflate/hash_calc.rs
@@ -3,28 +3,15 @@ use crate::deflate::{State, HASH_SIZE, STD_MIN_MATCH};
 #[derive(Debug, Clone, Copy)]
 pub enum HashCalcVariant {
     Standard,
-    /// # Safety
-    ///
-    /// This variant should only be used on supported systems, checked at runtime. See
-    /// [`Crc32HashCalc`].
-    Crc32,
     Roll,
 }

 impl HashCalcVariant {
-    #[cfg(test)]
-    pub fn for_compression_level(level: usize) -> Self {
-        let max_chain_length = crate::deflate::algorithm::CONFIGURATION_TABLE[level].max_chain;
-        Self::for_max_chain_length(max_chain_length as usize)
-    }
-
     /// Use rolling hash for deflate_slow algorithm with level 9. It allows us to
     /// properly lookup different hash chains to speed up longest_match search.
     pub fn for_max_chain_length(max_chain_length: usize) -> Self {
         if max_chain_length > 1024 {
             HashCalcVariant::Roll
-        } else if Crc32HashCalc::is_supported() {
-            HashCalcVariant::Crc32
         } else {
             HashCalcVariant::Standard
         }
@@ -137,158 +124,10 @@ impl RollHashCalc {
     }
 }

-/// # Safety
-///
-/// The methods of this struct can only be executed if the system has platform support, otherwise
-/// the result is UB. Use [`Self::is_supported()`] to check at runtime whether the system has
-/// support before executing any methods.
-///
-/// # Compatibility
-///
-/// The methods of this struct implement different CRC variants on different CPU types,
-/// for consistency with zlib-ng. For example, the x86 implementation computes CRC-32C,
-/// whereas the AArch64 implementation computes IEEE CRC-32. Therefore, this struct is
-/// suitable for internal hash table use, but not for generating portable checksums.
-pub struct Crc32HashCalc;
-
-impl Crc32HashCalc {
-    fn is_supported() -> bool {
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-        return crate::cpu_features::is_enabled_sse42();
-
-        // NOTE: more recent versions of zlib-ng no longer use the crc instructions on aarch64
-        #[cfg(target_arch = "aarch64")]
-        return crate::cpu_features::is_enabled_crc();
-
-        #[allow(unreachable_code)]
-        false
-    }
-
-    const HASH_CALC_OFFSET: usize = 0;
-
-    const HASH_CALC_MASK: u32 = (HASH_SIZE - 1) as u32;
-
-    #[cfg(target_arch = "x86")]
-    #[target_feature(enable = "sse4.2")]
-    unsafe fn hash_calc(h: u32, val: u32) -> u32 {
-        unsafe { core::arch::x86::_mm_crc32_u32(h, val) }
-    }
-
-    #[cfg(target_arch = "x86_64")]
-    #[target_feature(enable = "sse4.2")]
-    unsafe fn hash_calc(h: u32, val: u32) -> u32 {
-        unsafe { core::arch::x86_64::_mm_crc32_u32(h, val) }
-    }
-
-    #[cfg(target_arch = "aarch64")]
-    #[target_feature(enable = "neon")]
-    unsafe fn hash_calc(h: u32, val: u32) -> u32 {
-        unsafe { crate::crc32::acle::__crc32w(h, val) }
-    }
-
-    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
-    unsafe fn hash_calc(_h: u32, _val: u32) -> u32 {
-        assert!(!Self::is_supported());
-        unimplemented!("there is no hardware support on this platform")
-    }
-
-    #[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon"))]
-    #[cfg_attr(target_arch = "x86", target_feature(enable = "sse4.2"))]
-    #[cfg_attr(target_arch = "x86_64", target_feature(enable = "sse4.2"))]
-    pub unsafe fn update_hash(h: u32, val: u32) -> u32 {
-        (unsafe { Self::hash_calc(h, val) }) & Self::HASH_CALC_MASK
-    }
-
-    #[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon"))]
-    #[cfg_attr(target_arch = "x86", target_feature(enable = "sse4.2"))]
-    #[cfg_attr(target_arch = "x86_64", target_feature(enable = "sse4.2"))]
-    pub unsafe fn quick_insert_string(state: &mut State, string: usize) -> u16 {
-        let slice = &state.window.filled()[string + Self::HASH_CALC_OFFSET..];
-        let val = u32::from_le_bytes(slice[..4].try_into().unwrap());
-
-        let hm = unsafe { Self::update_hash(0, val) } as usize;
-
-        let head = state.head.as_slice()[hm];
-        if head != string as u16 {
-            state.prev.as_mut_slice()[string & state.w_mask] = head;
-            state.head.as_mut_slice()[hm] = string as u16;
-        }
-
-        head
-    }
-
-    #[cfg_attr(target_arch = "aarch64", target_feature(enable = "neon"))]
-    #[cfg_attr(target_arch = "x86", target_feature(enable = "sse4.2"))]
-    #[cfg_attr(target_arch = "x86_64", target_feature(enable = "sse4.2"))]
-    pub unsafe fn insert_string(state: &mut State, string: usize, count: usize) {
-        let slice = &state.window.filled()[string + Self::HASH_CALC_OFFSET..];
-
-        // it can happen that insufficient bytes are initialized
-        // .take(count) generates worse assembly
-        let slice = &slice[..Ord::min(slice.len(), count + 3)];
-
-        for (i, w) in slice.windows(4).enumerate() {
-            let idx = string as u16 + i as u16;
-
-            let val = u32::from_le_bytes(w.try_into().unwrap());
-
-            let hm = unsafe { Self::update_hash(0, val) } as usize;
-
-            let head = state.head.as_slice()[hm];
-            if head != idx {
-                state.prev.as_mut_slice()[idx as usize & state.w_mask] = head;
-                state.head.as_mut_slice()[hm] = idx;
-            }
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;

-    #[test]
-    #[cfg_attr(
-        not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")),
-        ignore = "no crc32 hardware support on this platform"
-    )]
-    fn crc32_hash_calc() {
-        if !Crc32HashCalc::is_supported() {
-            return;
-        }
-
-        unsafe {
-            if cfg!(target_arch = "x86") || cfg!(target_arch = "x86_64") {
-                assert_eq!(Crc32HashCalc::hash_calc(0, 807411760), 2423125009);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 540024864), 1452438466);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 538980384), 435552201);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 807411760), 2423125009);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 540024864), 1452438466);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 538980384), 435552201);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 807411760), 2423125009);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 540024864), 1452438466);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 538980384), 435552201);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 807411760), 2423125009);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 540024864), 1452438466);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 538980384), 435552201);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 807411760), 2423125009);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 170926112), 500028708);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 537538592), 3694129053);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 538970672), 373925026);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 538976266), 4149335727);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 538976288), 1767342659);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 941629472), 4090502627);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 775430176), 1744703325);
-            } else {
-                assert_eq!(Crc32HashCalc::hash_calc(0, 807411760), 2067507791);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 540024864), 2086141925);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 538980384), 716394180);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 775430176), 1396070634);
-                assert_eq!(Crc32HashCalc::hash_calc(0, 941629472), 637105634);
-            }
-        }
-    }
-
     #[test]
     fn roll_hash_calc() {
         assert_eq!(RollHashCalc::hash_calc(2565, 93), 82173);
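Editor's note (not part of the patch): with the Crc32 variant gone, hash selection in `hash_calc.rs` reduces to a single threshold on the configured `max_chain` value, which the (removed) `for_compression_level` helper looked up in the per-level configuration table. The standalone sketch below mirrors that decision; `Variant` and the free function `for_max_chain_length` are illustrative stand-ins, not the crate's actual items.

```rust
// Illustrative mirror of the simplified selection shown in the hunk above.
// Assumes only the `> 1024` threshold visible in the patch; names are invented.
#[derive(Debug, PartialEq)]
enum Variant {
    Standard,
    Roll,
}

fn for_max_chain_length(max_chain_length: usize) -> Variant {
    // the rolling hash only pays off for very long hash chains (deflate_slow
    // at level 9); everything else now uses the portable standard hash, with
    // no runtime CPU-feature dispatch at all
    if max_chain_length > 1024 {
        Variant::Roll
    } else {
        Variant::Standard
    }
}

fn main() {
    assert_eq!(for_max_chain_length(128), Variant::Standard);
    assert_eq!(for_max_chain_length(4096), Variant::Roll);
}
```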
diff --git a/zlib-rs/src/deflate/longest_match.rs b/zlib-rs/src/deflate/longest_match.rs
index 03c3cdc6..1307237f 100644
--- a/zlib-rs/src/deflate/longest_match.rs
+++ b/zlib-rs/src/deflate/longest_match.rs
@@ -2,22 +2,6 @@ use crate::deflate::{Pos, State, MIN_LOOKAHEAD, STD_MAX_MATCH, STD_MIN_MATCH};

 const EARLY_EXIT_TRIGGER_LEVEL: i8 = 5;

-const UNALIGNED_OK: bool = cfg!(any(
-    target_arch = "wasm32",
-    target_arch = "x86",
-    target_arch = "x86_64",
-    target_arch = "arm",
-    target_arch = "aarch64",
-    target_arch = "powerpc64",
-));
-
-const UNALIGNED64_OK: bool = cfg!(any(
-    target_arch = "wasm32",
-    target_arch = "x86_64",
-    target_arch = "aarch64",
-    target_arch = "powerpc64",
-));
-
 /// Find the (length, offset) in the window of the longest match for the string
 /// at offset cur_match
 pub fn longest_match(state: &crate::deflate::State, cur_match: u16) -> (usize, u16) {
@@ -75,9 +59,9 @@ fn longest_match_help(

     // Calculate read offset which should only extend an extra byte to find the next best match length.
     let mut offset = best_len - 1;
-    if best_len >= core::mem::size_of::<u32>() && UNALIGNED_OK {
+    if best_len >= core::mem::size_of::<u32>() {
         offset -= 2;
-        if best_len >= core::mem::size_of::<u64>() && UNALIGNED64_OK {
+        if best_len >= core::mem::size_of::<u64>() {
             offset -= 4;
         }
     }
@@ -203,44 +187,29 @@ fn longest_match_help(
     //
     // scan_end is a bit trickier: it reads at a bounded offset from scan_start:
     //
-    // - UNALIGNED64_OK: scan_end is bounded by `258 - (4 + 2 + 1)`, so an 8-byte read is in-bounds
-    // - UNALIGNED_OK: scan_end is bounded by `258 - (2 + 1)`, so a 4-byte read is in-bounds
-    // - otherwise: scan_end is bounded by `258 - 1`, so a 2-byte read is in-bounds
+    // - >= 8: scan_end is bounded by `258 - (4 + 2 + 1)`, so an 8-byte read is in-bounds
+    // - >= 4: scan_end is bounded by `258 - (2 + 1)`, so a 4-byte read is in-bounds
+    // - >= 2: scan_end is bounded by `258 - 1`, so a 2-byte read is in-bounds
     unsafe {
-        if UNALIGNED_OK {
-            if best_len < core::mem::size_of::<u32>() {
-                loop {
-                    if is_match::<2>(cur_match, mbase_start, mbase_end, scan_start, scan_end) {
-                        break;
-                    }
-
-                    goto_next_in_chain!();
+        if best_len < core::mem::size_of::<u32>() {
+            loop {
+                if is_match::<2>(cur_match, mbase_start, mbase_end, scan_start, scan_end) {
+                    break;
                 }
-            } else if best_len >= core::mem::size_of::<u64>() && UNALIGNED64_OK {
-                loop {
-                    if is_match::<8>(cur_match, mbase_start, mbase_end, scan_start, scan_end) {
-                        break;
-                    }
-                    goto_next_in_chain!();
+                goto_next_in_chain!();
+            }
+        } else if best_len >= core::mem::size_of::<u64>() {
+            loop {
+                if is_match::<8>(cur_match, mbase_start, mbase_end, scan_start, scan_end) {
+                    break;
                 }
-            } else {
-                loop {
-                    if is_match::<4>(cur_match, mbase_start, mbase_end, scan_start, scan_end) {
-                        break;
-                    }
-                    goto_next_in_chain!();
-                }
+                goto_next_in_chain!();
             }
         } else {
             loop {
-                if memcmp_n_ptr::<2>(mbase_end.wrapping_add(cur_match as usize), scan_end)
-                    && memcmp_n_ptr::<2>(
-                        mbase_start.wrapping_add(cur_match as usize),
-                        scan.as_ptr(),
-                    )
-                {
+                if is_match::<4>(cur_match, mbase_start, mbase_end, scan_start, scan_end) {
                     break;
                 }
@@ -278,9 +247,9 @@ fn longest_match_help(
         }

         offset = best_len - 1;
-        if best_len >= core::mem::size_of::<u32>() && UNALIGNED_OK {
+        if best_len >= core::mem::size_of::<u32>() {
             offset -= 2;
-            if best_len >= core::mem::size_of::<u64>() && UNALIGNED64_OK {
+            if best_len >= core::mem::size_of::<u64>() {
                 offset -= 4;
             }
         }
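Editor's note (not part of the patch): with the `UNALIGNED_OK`/`UNALIGNED64_OK` constants removed, the `scan_end` read offset in `longest_match_help` depends only on `best_len`. The sketch below is a self-contained mirror of that arithmetic as it appears in the hunks above; the helper name `scan_end_offset` is invented for the example and does not exist in the crate.

```rust
// Standalone mirror of the `offset` computation in `longest_match_help` after
// this change: the adjustment is driven purely by `best_len`, with no
// per-target unaligned-access gating.
fn scan_end_offset(best_len: usize) -> usize {
    let mut offset = best_len - 1;
    if best_len >= core::mem::size_of::<u32>() {
        offset -= 2;
        if best_len >= core::mem::size_of::<u64>() {
            offset -= 4;
        }
    }
    offset
}

fn main() {
    assert_eq!(scan_end_offset(3), 2); // below 4 bytes: no adjustment
    assert_eq!(scan_end_offset(4), 1); // 4..=7: back off by 2 for the 4-byte read
    assert_eq!(scan_end_offset(8), 1); // 8 and up: back off by 4 more for the 8-byte read
    assert_eq!(scan_end_offset(258), 251); // STD_MAX_MATCH
}
```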