Tweaked how xmul is exposed, added p*::widening_mul
Multiplication, including carry-less multiplication, is inherently a
widening operation: the product of two w-bit operands can take up to 2w
bits. Unfortunately, at the time of writing, Rust's primitive types
don't capture this well, being built around fixed-width wrapping
multiplication.

Rust's stdlib can rely on compiler-level optimizations to clean up the
performance issues caused by unnecessarily-wide multiplications, but
this becomes a problem for our library, especially for the u64 types,
since we rely on intrinsics, which can be hard for compilers to
optimize around.

This commit adds widening_mul, based on a proposal to add widening_mul
to Rust's primitive types:
rust-lang/rust#85532

It also includes several other tweaks to how xmul is provided, moving
more arch-level details into xmul while still limiting when it is
emitted.
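
To make the widening semantics concrete, here is a minimal sketch of a
widening carry-less multiply, returning (low, high) halves in the shape
of the proposed widening_mul API. The naive shift-and-xor loop is
illustrative only, not the crate's actual implementation:

/// Naive widening carry-less multiply: 32 x 32 -> (low 32, high 32)
fn naive_widening_xmul(a: u32, b: u32) -> (u32, u32) {
    let mut x = 0u64;
    for i in 0..32 {
        // xor in a shifted copy of a for each set bit of b
        if (b >> i) & 1 != 0 {
            x ^= (a as u64) << i;
        }
    }
    // split the 64-bit product into (low, high) halves
    (x as u32, (x >> 32) as u32)
}

fn main() {
    let (lo, hi) = naive_widening_xmul(0xffffffff, 0xffffffff);
    // the high half is nonzero: the product genuinely needs 2w bits
    assert_ne!(hi, 0);
    println!("lo=0x{:08x} hi=0x{:08x}", lo, hi);
}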
geky committed Oct 31, 2021
1 parent 344170f commit f61ae49
Showing 12 changed files with 692 additions and 279 deletions.
43 changes: 6 additions & 37 deletions Cargo.toml
@@ -42,46 +42,15 @@ harness = false
# tracking issue:
# https://github.com/rust-lang/rust/issues/48556
#
use-nightly-features = []
nightly = []

# Makes p* types use a naive carry-less multiplication implementation
# using shifts and xors. Mainly useful for testing/benchmarking.
# Disable carry-less multiplication instructions, forcing the use
# of naive bitwise implementations
#
# By default hardware xmul is used if available, falling back to a naive
# implementation.
# This is mostly available for testing, and in the case that hardware
# xmul is bugged (or more likely this crate is bugged).
#
use-naive-xmul = ["gf256-macros/use-naive-xmul"]

# Makes p* types require hardware-accelerated carry-less multiplication,
# causing a compile error if carry-less multiplication instructions aren't
# available in the current architecture.
#
# By default hardware xmul is used if available, falling back to a naive
# implementation.
#
use-hardware-xmul = ["gf256-macros/use-hardware-xmul"]

# Make gf* types use a naive multiplication implementation using shifts
# and xors. Mainly useful for testing/benchmarking.
#
# By default log/antilog tables are used.
#
use-naive-gfmul = ["gf256-macros/use-naive-gfmul"]

# Make gf* types use precompiled log/antilog tables.
#
# By default log/antilog tables are used.
#
use-table-gfmul = ["gf256-macros/use-table-gfmul"]

# Makes gf* types use (potentially hardware accelerated) polynomial
# multiplication with Barret reduction. This is generally slower than using
# log/antilog tables, but may be useful if constant-time operations are
# required.
#
# By default log/antilog tables are used.
#
use-barret-gfmul = ["gf256-macros/use-barret-gfmul"]
no-xmul = ["gf256-macros/no-xmul"]

[dev-dependencies]
criterion = {version="0.3", features=["html_reports"]}
17 changes: 9 additions & 8 deletions Makefile
@@ -3,20 +3,21 @@ override ENV += RUSTFLAGS="-Ctarget-cpu=native"

.PHONY: all build
all build:
$(ENV) cargo build
$(ENV) cargo +nightly build --features nightly

.PHONY: test
test:
$(ENV) cargo test --lib
$(ENV) cargo test --example find-p
$(ENV) cargo run --example crc
$(ENV) cargo run --example shamir
$(ENV) cargo run --example raid
$(ENV) cargo run --example rs
$(ENV) cargo +nightly test --features nightly --lib
$(ENV) cargo +nightly test --features nightly --example find-p
$(ENV) cargo +nightly run --features nightly --example find-p -- -w9 -n4 -m1
$(ENV) cargo +nightly run --features nightly --example crc
$(ENV) cargo +nightly run --features nightly --example shamir
$(ENV) cargo +nightly run --features nightly --example raid
$(ENV) cargo +nightly run --features nightly --example rs

.PHONY: bench
bench:
$(ENV) cargo +nightly bench --features use-nightly-features
$(ENV) cargo +nightly bench --features nightly

.PHONY: clean
clean:
14 changes: 14 additions & 0 deletions benches/crc.rs
@@ -78,6 +78,20 @@ fn bench_crc(c: &mut Criterion) {
|data| crc::word_barret_crc(data),
BatchSize::SmallInput
));

let mut xs = xorshift64(42).map(|x| x as u8);
group.bench_function("reversed_barret_crc", |b| b.iter_batched_ref(
|| (&mut xs).take(SIZE).collect::<Vec<u8>>(),
|data| crc::reversed_barret_crc(data),
BatchSize::SmallInput
));

let mut xs = xorshift64(42).map(|x| x as u8);
group.bench_function("word_reversed_barret_crc", |b| b.iter_batched_ref(
|| (&mut xs).take(SIZE).collect::<Vec<u8>>(),
|data| crc::word_reversed_barret_crc(data),
BatchSize::SmallInput
));
}

criterion_group!(benches, bench_crc);
186 changes: 161 additions & 25 deletions examples/crc.rs
@@ -164,17 +164,32 @@ pub fn small_table_crc(data: &[u8]) -> u32 {
/// compile-time.
///
pub fn barret_crc(data: &[u8]) -> u32 {
const BARRET_CONSTANT: p64 = {
p64(p128(0x10000000000000000)
.naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64)
// Normally this would be 0x10000000000000000 / POLYNOMIAL, but
// we eagerly do one step of division so we avoid needing a 4x-wide
// type. We can also drop the highest bit if we add the high bits
// back in manually when we use this constant:
//
// = x % p
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
//                                      \-----+-----/
//                                            '-- Barret constant
//
// Note that the shifts and masks can go away if we operate on u32s,
// leaving 2 xmuls and 2 xors.
//
const BARRET_CONSTANT: p32 = {
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
};

let mut crc = p32(0xffffffff);

for b in data {
crc = crc ^ (p32::from(b.reverse_bits()) << 24);
let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32;
crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8);
crc = (crc << 8)
+ ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32))
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
}

u32::from(crc).reverse_bits() ^ 0xffffffff
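
To sanity-check the eager-division trick described above, here is a
standalone sketch using naive shift-and-xor helpers over plain
integers; xmul and xdivmod are illustrative stand-ins, and the standard
CRC32 polynomial 0x104c11db7 is assumed:

// naive carry-less multiply (polynomial multiply over GF(2))
fn xmul(a: u128, b: u128) -> u128 {
    let mut x = 0;
    for i in 0..64 {
        if (b >> i) & 1 != 0 {
            x ^= a << i;
        }
    }
    x
}

// naive polynomial quotient and remainder
fn xdivmod(mut a: u128, b: u128) -> (u128, u128) {
    let mut q = 0;
    while a != 0 && a.leading_zeros() <= b.leading_zeros() {
        let shift = b.leading_zeros() - a.leading_zeros();
        q ^= 1 << shift;
        a ^= b << shift;
    }
    (q, a)
}

fn main() {
    const P: u128 = 0x104c11db7;

    // the full constant 2^64/p equals the eagerly-divided constant
    // (p << 32)/p, computed with p's top bit dropped, plus 2^32
    let (full, _) = xdivmod(1 << 64, P);
    let (eager, _) = xdivmod((P & 0xffffffff) << 32, P);
    assert_eq!(full, eager ^ (1 << 32));

    // Barret reduction with the eager constant, adding the high bits
    // back in manually, matches a direct polynomial remainder
    let x: u128 = 0x0123456789abcdef;
    let q = (xmul(x >> 32, eager) >> 32) ^ (x >> 32);
    let r = 0xffffffff & (x ^ xmul(P, q));
    assert_eq!(r, xdivmod(x, P).1);
}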
@@ -184,9 +199,23 @@ pub fn barret_crc(data: &[u8]) -> u32 {
/// barret_crc, but operating on a 32-bit word at a time
///
pub fn word_barret_crc(data: &[u8]) -> u32 {
const BARRET_CONSTANT: p64 = {
p64(p128(0x10000000000000000)
.naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64)
// Normally this would be 0x10000000000000000 / POLYNOMIAL, but
// we eagerly do one step of division so we avoid needing a 4x-wide
// type. We can also drop the highest bit if we add the high bits
// back in manually when we use this constant:
//
// = x % p
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
//                                      \-----+-----/
//                                            '-- Barret constant
//
// Note that the shifts and masks can go away if we operate on u32s,
// leaving 2 xmuls and 2 xors.
//
const BARRET_CONSTANT: p32 = {
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
};

let mut crc = p32(0xffffffff);
Expand All @@ -196,19 +225,118 @@ pub fn word_barret_crc(data: &[u8]) -> u32 {
for word in &mut words {
let word = <[u8; 4]>::try_from(word).unwrap();
crc = crc ^ p32::from_le_bytes(word).reverse_bits();
let q = (p64::from(crc)*BARRET_CONSTANT) >> 32;
crc = p32::from_lossy(q*POLYNOMIAL);
crc = (crc.widening_mul(BARRET_CONSTANT).1 + crc)
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
}

for b in words.remainder() {
crc = crc ^ (p32::from(b.reverse_bits()) << 24);
let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32;
crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8);
crc = (crc << 8)
+ ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32))
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
}

u32::from(crc).reverse_bits() ^ 0xffffffff
}

/// A hardware-accelerated CRC implementation using Barret reduction without
/// needing to bit-reverse the internal representation
///
/// CRC32 and polynomial multiplication instructions unfortunately are defined
/// with different bit-endianness. This would normally mean we need to
/// bit-reverse the incoming data before we can use polynomial multiplication.
///
/// However, polynomial multiplication has the odd property of being
/// preserved under bit-reversal, up to a 1-bit shift:
/// brev(a) * brev(b) = brev((a * b) << 1)
///
/// This means we can rewrite our Barret reduction CRC to operate entirely
/// on a bit-reversed representation, shaving off several instructions.
///
/// In theory this should be faster, but measurements show it actually
/// being slightly slower; perhaps the extra 1-bit shift costs more on
/// machines with bit-reverse instructions?
///
pub fn reversed_barret_crc(data: &[u8]) -> u32 {
// Normally this would be 0x10000000000000000 / POLYNOMIAL, but
// we eagerly do one step of division so we avoid needing a 4x-wide
// type. We can also drop the highest bit if we add the high bits
// back in manually when we use this constant:
//
// = x % p
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
//                                      \-----+-----/
//                                            '-- Barret constant
//
// Note that the shifts and masks can go away if we operate on u32s,
// leaving 2 xmuls and 2 xors.
//
const BARRET_CONSTANT: p32 = {
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
};
const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits();
const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits();

let mut crc = p32(0xffffffff);

for b in data {
crc = crc ^ p32::from(*b);
let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV);
let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV);
crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32));
}

u32::from(crc) ^ 0xffffffff
}
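
The bit-reversal identity in the doc comment above is easy to check
directly; this standalone sketch verifies
brev(a) * brev(b) = brev((a * b) << 1) for 32-bit operands using a
naive carry-less multiply (helper name illustrative):

// naive 32 x 32 -> 64 carry-less multiply
fn xmul32(a: u32, b: u32) -> u64 {
    let mut x = 0u64;
    for i in 0..32 {
        if (b >> i) & 1 != 0 {
            x ^= (a as u64) << i;
        }
    }
    x
}

fn main() {
    let (a, b) = (0x04c11db7u32, 0xdeadbeefu32);
    // multiply the bit-reversed operands...
    let lhs = xmul32(a.reverse_bits(), b.reverse_bits());
    // ...which equals the plain product, shifted left once and
    // bit-reversed as a 64-bit value
    let rhs = (xmul32(a, b) << 1).reverse_bits();
    assert_eq!(lhs, rhs);
}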

/// A hardware-accelerated CRC implementation using the same technique as
/// reversed_barret_crc, but operating on a 32-bit word at a time
///
pub fn word_reversed_barret_crc(data: &[u8]) -> u32 {
// Normally this would be 0x10000000000000000 / POLYNOMIAL, but
// we eagerly do one step of division so we avoid needing a 4x-wide
// type. We can also drop the highest bit if we add the high bits
// back in manually when we use this constant:
//
// = x % p
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
//                                      \-----+-----/
//                                            '-- Barret constant
//
// Note that the shifts and masks can go away if we operate on u32s,
// leaving 2 xmuls and 2 xors.
//
const BARRET_CONSTANT: p32 = {
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
};
const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits();
const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits();

let mut crc = p32(0xffffffff);

// iterate over 4-byte words
let mut words = data.chunks_exact(4);
for word in &mut words {
let word = <[u8; 4]>::try_from(word).unwrap();
crc = crc ^ p32::from_le_bytes(word);
let (lo, _) = crc.widening_mul(BARRET_CONSTANT_REV);
let (lo, hi) = ((lo << 1u32) + crc).widening_mul(POLYNOMIAL_REV);
crc = (hi << 1u32) | (lo >> 31u32);
}

for b in words.remainder() {
crc = crc ^ p32::from(*b);
let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV);
let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV);
crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32));
}

u32::from(crc) ^ 0xffffffff
}
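
As a quick sanity check (a hypothetical test, not part of this commit),
all of the Barret variants should agree on the standard CRC32 check
value for "123456789", assuming POLYNOMIAL is the usual 0x104c11db7:

#[test]
fn crc_check_value() {
    // 0xcbf43926 is the standard CRC32 check value for "123456789"
    let input = b"123456789";
    assert_eq!(barret_crc(input), 0xcbf43926);
    assert_eq!(reversed_barret_crc(input), 0xcbf43926);
    assert_eq!(word_reversed_barret_crc(input), 0xcbf43926);
}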


fn main() {
let input = b"Hello World!";
@@ -217,31 +345,39 @@ fn main() {
println!("testing crc({:?})", String::from_utf8_lossy(input));

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "naive_crc", output);
println!("{:<24} => 0x{:08x}", "naive_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "less_naive_crc", output);
let output = less_naive_crc(input);
println!("{:<24} => 0x{:08x}", "less_naive_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "word_less_naive_crc", output);
let output = word_less_naive_crc(input);
println!("{:<24} => 0x{:08x}", "word_less_naive_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "table_crc", output);
let output = table_crc(input);
println!("{:<24} => 0x{:08x}", "table_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "small_table_crc", output);
let output = small_table_crc(input);
println!("{:<24} => 0x{:08x}", "small_table_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "barret_crc", output);
let output = barret_crc(input);
println!("{:<24} => 0x{:08x}", "barret_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "word_barret_crc", output);
let output = word_barret_crc(input);
println!("{:<24} => 0x{:08x}", "word_barret_crc", output);
assert_eq!(output, expected);

let output = reversed_barret_crc(input);
println!("{:<24} => 0x{:08x}", "reversed_barret_crc", output);
assert_eq!(output, expected);

let output = word_reversed_barret_crc(input);
println!("{:<24} => 0x{:08x}", "word_reversed_barret_crc", output);
assert_eq!(output, expected);

println!();
16 changes: 3 additions & 13 deletions examples/find-p.rs
@@ -86,20 +86,10 @@ pub fn is_generator(g: p128, p: p128) -> bool {
//
let width = (128-p.leading_zeros()) - 1;

// We're going to do a lot of multiplications, so it helps to precalculate
// Barret's constant for Barret reduction. This trades a modulus operation
// for 2 multiplications, but means we can leverage carry-less multiplication
// hardware instructions.
//
// normally this is just (1 << (2*width)) / p, but we can precompute
// one step of division to avoid needing a 4x wide type
//
let mask = (1u128 << width) - 1;
let barret_constant = (((mask & p) << width) / p) + (p128(1) << width);
// Multiplication uses carry-less multiplication modulo our irreducible
// polynomial
let gfmul = |a: p128, b: p128| -> p128 {
let x = a * b;
let q = ((x >> width) * barret_constant) >> width;
mask & ((q * p) + x)
(a * b) % p
};

// Exponentiation via squaring
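
The collapsed hunk continues with exponentiation via squaring; the
following standalone sketch shows that pattern against a gfmul of the
same shape as the closure above (gfpow and the toy GF(2^8) field are
illustrative, not the example's actual code):

// Exponentiation via squaring: computes g^e in O(log e) multiplications
fn gfpow(mut g: u128, mut e: u128, gfmul: impl Fn(u128, u128) -> u128) -> u128 {
    let mut x = 1; // multiplicative identity
    while e > 0 {
        if e & 1 != 0 {
            x = gfmul(x, g); // fold in squares on odd bits of e
        }
        g = gfmul(g, g); // repeated squaring
        e >>= 1;
    }
    x
}

fn main() {
    // toy field: GF(2^8) modulo the polynomial 0x11d
    let p: u128 = 0x11d;
    let gfmul = |a: u128, b: u128| -> u128 {
        // naive carry-less multiply...
        let mut x = 0;
        for i in 0..8 {
            if (b >> i) & 1 != 0 {
                x ^= a << i;
            }
        }
        // ...followed by naive reduction modulo p
        for i in (8..16).rev() {
            if (x >> i) & 1 != 0 {
                x ^= p << (i - 8);
            }
        }
        x
    };
    // every nonzero element of GF(2^8) satisfies g^255 == 1
    assert_eq!(gfpow(2, 255, gfmul), 1);
}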
6 changes: 1 addition & 5 deletions gf256-macros/Cargo.toml
@@ -10,11 +10,7 @@ proc-macro = true

[features]
# See gf256/Cargo.toml for documentation over these features
use-naive-xmul = []
use-hardware-xmul = []
use-naive-gfmul = []
use-table-gfmul = []
use-barret-gfmul = []
no-xmul = []

[dependencies]
syn = {version="1.0.73", features=["full"]}