Tweaked how xmul is exposed, added p*::widening_mul
Multiplication, including carry-less multiplication, is inherently a
widening operation: the product of two w-bit operands can take up to 2w
bits. Unfortunately, at the time of writing, Rust's primitive types
don't capture this well, being built around fixed-width wrapping
multiplication.

Rust's stdlib can rely on compiler-level optimizations to clean up the
performance issues caused by unnecessarily-wide multiplications, but
this becomes a problem for our library, especially for the u64 types,
since we rely on intrinsics, which can be hard for compilers to
optimize around.

This commit adds widening_mul, based on a proposal to add widening_mul
to Rust's primitive types:
rust-lang/rust#85532

It also includes several other tweaks to how xmul is provided, moving
more arch-level details into xmul while still limiting when it is
emitted.
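
To make the widening semantics concrete, here is a minimal sketch of a
widening carry-less multiply, returning (low, high) halves in the shape
of the proposed widening_mul API. The naive shift-and-xor loop is
illustrative only, not the crate's actual implementation:

/// Naive widening carry-less multiply: 32 x 32 -> (low 32, high 32)
fn naive_widening_xmul(a: u32, b: u32) -> (u32, u32) {
    let mut x = 0u64;
    for i in 0..32 {
        // xor in a shifted copy of a for each set bit of b
        if (b >> i) & 1 != 0 {
            x ^= (a as u64) << i;
        }
    }
    // split the 64-bit product into (low, high) halves
    (x as u32, (x >> 32) as u32)
}

fn main() {
    let (lo, hi) = naive_widening_xmul(0xffffffff, 0xffffffff);
    // the high half is nonzero: the product genuinely needs 2w bits
    assert_ne!(hi, 0);
    println!("lo=0x{:08x} hi=0x{:08x}", lo, hi);
}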
geky committed Oct 31, 2021
1 parent 344170f commit f61ae49
Showing 12 changed files with 692 additions and 279 deletions.
43 changes: 6 additions & 37 deletions Cargo.toml
@@ -42,46 +42,15 @@ harness = false
# tracking issue:
# https://github.com/rust-lang/rust/issues/48556
#
use-nightly-features = []
nightly = []

# Makes p* types use a naive carry-less multiplication implementation
# using shifts and xors. Mainly useful for testing/benchmarking.
# Disable carry-less multiplication instructions, forcing the use
# of naive bitwise implementations
#
# By default hardware xmul is used if available, falling back to a naive
# implementation.
# This is mostly available for testing, and in the case that hardware
# xmul is bugged (or more likely this crate is bugged).
#
use-naive-xmul = ["gf256-macros/use-naive-xmul"]

# Makes p* types require hardware-accelerated carry-less multiplication,
# causing a compile error if carry-less multiplication instructions aren't
# available in the current architecture.
#
# By default hardware xmul is used if available, falling back to a naive
# implementation.
#
use-hardware-xmul = ["gf256-macros/use-hardware-xmul"]

# Make gf* types use a naive multiplication implementation using shifts
# and xors. Mainly useful for testing/benchmarking.
#
# By default log/antilog tables are used.
#
use-naive-gfmul = ["gf256-macros/use-naive-gfmul"]

# Make gf* types use precompiled log/antilog tables.
#
# By default log/antilog tables are used.
#
use-table-gfmul = ["gf256-macros/use-table-gfmul"]

# Makes gf* types use (potentially hardware accelerated) polynomial
# multiplication with Barret reduction. This is generally slower than using
# log/antilog tables, but may be useful if constant-time operations are
# required.
#
# By default log/antilog tables are used.
#
use-barret-gfmul = ["gf256-macros/use-barret-gfmul"]
no-xmul = ["gf256-macros/no-xmul"]

[dev-dependencies]
criterion = {version="0.3", features=["html_reports"]}
17 changes: 9 additions & 8 deletions Makefile
@@ -3,20 +3,21 @@ override ENV += RUSTFLAGS="-Ctarget-cpu=native"

.PHONY: all build
all build:
$(ENV) cargo build
$(ENV) cargo +nightly build --features nightly

.PHONY: test
test:
$(ENV) cargo test --lib
$(ENV) cargo test --example find-p
$(ENV) cargo run --example crc
$(ENV) cargo run --example shamir
$(ENV) cargo run --example raid
$(ENV) cargo run --example rs
$(ENV) cargo +nightly test --features nightly --lib
$(ENV) cargo +nightly test --features nightly --example find-p
$(ENV) cargo +nightly run --features nightly --example find-p -- -w9 -n4 -m1
$(ENV) cargo +nightly run --features nightly --example crc
$(ENV) cargo +nightly run --features nightly --example shamir
$(ENV) cargo +nightly run --features nightly --example raid
$(ENV) cargo +nightly run --features nightly --example rs

.PHONY: bench
bench:
$(ENV) cargo +nightly bench --features use-nightly-features
$(ENV) cargo +nightly bench --features nightly

.PHONY: clean
clean:
14 changes: 14 additions & 0 deletions benches/crc.rs
@@ -78,6 +78,20 @@ fn bench_crc(c: &mut Criterion) {
|data| crc::word_barret_crc(data),
BatchSize::SmallInput
));

let mut xs = xorshift64(42).map(|x| x as u8);
group.bench_function("reversed_barret_crc", |b| b.iter_batched_ref(
|| (&mut xs).take(SIZE).collect::<Vec<u8>>(),
|data| crc::reversed_barret_crc(data),
BatchSize::SmallInput
));

let mut xs = xorshift64(42).map(|x| x as u8);
group.bench_function("word_reversed_barret_crc", |b| b.iter_batched_ref(
|| (&mut xs).take(SIZE).collect::<Vec<u8>>(),
|data| crc::word_reversed_barret_crc(data),
BatchSize::SmallInput
));
}

criterion_group!(benches, bench_crc);
186 changes: 161 additions & 25 deletions examples/crc.rs
@@ -164,17 +164,32 @@ pub fn small_table_crc(data: &[u8]) -> u32 {
/// compile-time.
///
pub fn barret_crc(data: &[u8]) -> u32 {
const BARRET_CONSTANT: p64 = {
p64(p128(0x10000000000000000)
.naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64)
// Normally this would be 0x10000000000000000 / POLYNOMIAL, but
// we eagerly do one step of division so we avoid needing a 4x-wide
// type. We can also drop the highest bit if we add the high bits
// back in manually when we use this constant:
//
// = x % p
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
//                                      \-----+-----/
//                                            '-- Barret constant
//
// Note that the shifts and masks can go away if we operate on u32s,
// leaving 2 xmuls and 2 xors.
//
const BARRET_CONSTANT: p32 = {
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
};

let mut crc = p32(0xffffffff);

for b in data {
crc = crc ^ (p32::from(b.reverse_bits()) << 24);
let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32;
crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8);
crc = (crc << 8)
+ ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32))
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
}

u32::from(crc).reverse_bits() ^ 0xffffffff
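
To sanity-check the eager-division trick described above, here is a
standalone sketch using naive shift-and-xor helpers over plain
integers; xmul and xdivmod are illustrative stand-ins, and the standard
CRC32 polynomial 0x104c11db7 is assumed:

// naive carry-less multiply (polynomial multiply over GF(2))
fn xmul(a: u128, b: u128) -> u128 {
    let mut x = 0;
    for i in 0..64 {
        if (b >> i) & 1 != 0 {
            x ^= a << i;
        }
    }
    x
}

// naive polynomial quotient and remainder
fn xdivmod(mut a: u128, b: u128) -> (u128, u128) {
    let mut q = 0;
    while a != 0 && a.leading_zeros() <= b.leading_zeros() {
        let shift = b.leading_zeros() - a.leading_zeros();
        q ^= 1 << shift;
        a ^= b << shift;
    }
    (q, a)
}

fn main() {
    const P: u128 = 0x104c11db7;

    // the full constant 2^64/p equals the eagerly-divided constant
    // (p << 32)/p, computed with p's top bit dropped, plus 2^32
    let (full, _) = xdivmod(1 << 64, P);
    let (eager, _) = xdivmod((P & 0xffffffff) << 32, P);
    assert_eq!(full, eager ^ (1 << 32));

    // Barret reduction with the eager constant, adding the high bits
    // back in manually, matches a direct polynomial remainder
    let x: u128 = 0x0123456789abcdef;
    let q = (xmul(x >> 32, eager) >> 32) ^ (x >> 32);
    let r = 0xffffffff & (x ^ xmul(P, q));
    assert_eq!(r, xdivmod(x, P).1);
}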
@@ -184,9 +199,23 @@ pub fn barret_crc(data: &[u8]) -> u32 {
/// barret_crc, but operating on a 32-bit word at a time
///
pub fn word_barret_crc(data: &[u8]) -> u32 {
const BARRET_CONSTANT: p64 = {
p64(p128(0x10000000000000000)
.naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64)
// Normally this would be 0x10000000000000000 / POLYNOMIAL, but
// we eagerly do one step of division so we avoid needing a 4x-wide
// type. We can also drop the highest bit if we add the high bits
// back in manually when we use this constant:
//
// = x % p
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
//                                      \-----+-----/
//                                            '-- Barret constant
//
// Note that the shifts and masks can go away if we operate on u32s,
// leaving 2 xmuls and 2 xors.
//
const BARRET_CONSTANT: p32 = {
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
};

let mut crc = p32(0xffffffff);
Expand All @@ -196,19 +225,118 @@ pub fn word_barret_crc(data: &[u8]) -> u32 {
for word in &mut words {
let word = <[u8; 4]>::try_from(word).unwrap();
crc = crc ^ p32::from_le_bytes(word).reverse_bits();
let q = (p64::from(crc)*BARRET_CONSTANT) >> 32;
crc = p32::from_lossy(q*POLYNOMIAL);
crc = (crc.widening_mul(BARRET_CONSTANT).1 + crc)
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
}

for b in words.remainder() {
crc = crc ^ (p32::from(b.reverse_bits()) << 24);
let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32;
crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8);
crc = (crc << 8)
+ ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32))
.wrapping_mul(p32::from_lossy(POLYNOMIAL));
}

u32::from(crc).reverse_bits() ^ 0xffffffff
}

/// A hardware-accelerated CRC implementation using Barret reduction without
/// needing to bit-reverse the internal representation
///
/// CRC32 and polynomial multiplication instructions unfortunately are defined
/// with different bit-endianness. This would normally mean we need to
/// bit-reverse the incoming data before we can use polynomial multiplication.
///
/// However, polynomial multiplication has the odd property of being
/// preserved under bit-reversal, up to a 1-bit shift:
/// brev(a) * brev(b) = brev((a * b) << 1)
///
/// This means we can rewrite our Barret reduction CRC to operate entirely
/// on a bit-reversed representation, shaving off several instructions.
///
/// In theory this should be faster, but measurements show it actually
/// being slightly slower; perhaps the extra 1-bit shift costs more on
/// machines with bit-reverse instructions?
///
pub fn reversed_barret_crc(data: &[u8]) -> u32 {
// Normally this would be 0x10000000000000000 / POLYNOMIAL, but
// we eagerly do one step of division so we avoid needing a 4x-wide
// type. We can also drop the highest bit if we add the high bits
// back in manually when we use this constant:
//
// = x % p
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
//                                      \-----+-----/
//                                            '-- Barret constant
//
// Note that the shifts and masks can go away if we operate on u32s,
// leaving 2 xmuls and 2 xors.
//
const BARRET_CONSTANT: p32 = {
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
};
const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits();
const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits();

let mut crc = p32(0xffffffff);

for b in data {
crc = crc ^ p32::from(*b);
let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV);
let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV);
crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32));
}

u32::from(crc) ^ 0xffffffff
}
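
The bit-reversal identity in the doc comment above is easy to check
directly; this standalone sketch verifies
brev(a) * brev(b) = brev((a * b) << 1) for 32-bit operands using a
naive carry-less multiply (helper name illustrative):

// naive 32 x 32 -> 64 carry-less multiply
fn xmul32(a: u32, b: u32) -> u64 {
    let mut x = 0u64;
    for i in 0..32 {
        if (b >> i) & 1 != 0 {
            x ^= (a as u64) << i;
        }
    }
    x
}

fn main() {
    let (a, b) = (0x04c11db7u32, 0xdeadbeefu32);
    // multiply the bit-reversed operands...
    let lhs = xmul32(a.reverse_bits(), b.reverse_bits());
    // ...which equals the plain product, shifted left once and
    // bit-reversed as a 64-bit value
    let rhs = (xmul32(a, b) << 1).reverse_bits();
    assert_eq!(lhs, rhs);
}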

/// A hardware-accelerated CRC implementation using the same technique as
/// reversed_barret_crc, but operating on a 32-bit word at a time
///
pub fn word_reversed_barret_crc(data: &[u8]) -> u32 {
// Normally this would be 0x10000000000000000 / POLYNOMIAL, but
// we eagerly do one step of division so we avoid needing a 4x-wide
// type. We can also drop the highest bit if we add the high bits
// back in manually when we use this constant:
//
// = x % p
// = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32))
// = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32))
// = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32)))
//                                      \-----+-----/
//                                            '-- Barret constant
//
// Note that the shifts and masks can go away if we operate on u32s,
// leaving 2 xmuls and 2 xors.
//
const BARRET_CONSTANT: p32 = {
p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32)
};
const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits();
const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits();

let mut crc = p32(0xffffffff);

// iterate over 4-byte words
let mut words = data.chunks_exact(4);
for word in &mut words {
let word = <[u8; 4]>::try_from(word).unwrap();
crc = crc ^ p32::from_le_bytes(word);
let (lo, _) = crc.widening_mul(BARRET_CONSTANT_REV);
let (lo, hi) = ((lo << 1u32) + crc).widening_mul(POLYNOMIAL_REV);
crc = (hi << 1u32) | (lo >> 31u32);
}

for b in words.remainder() {
crc = crc ^ p32::from(*b);
let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV);
let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV);
crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32));
}

u32::from(crc) ^ 0xffffffff
}
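
As a quick sanity check (a hypothetical test, not part of this commit),
all of the Barret variants should agree on the standard CRC32 check
value for "123456789", assuming POLYNOMIAL is the usual 0x104c11db7:

#[test]
fn crc_check_value() {
    // 0xcbf43926 is the standard CRC32 check value for "123456789"
    let input = b"123456789";
    assert_eq!(barret_crc(input), 0xcbf43926);
    assert_eq!(reversed_barret_crc(input), 0xcbf43926);
    assert_eq!(word_reversed_barret_crc(input), 0xcbf43926);
}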


fn main() {
let input = b"Hello World!";
@@ -217,31 +345,39 @@ fn main() {
println!("testing crc({:?})", String::from_utf8_lossy(input));

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "naive_crc", output);
println!("{:<24} => 0x{:08x}", "naive_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "less_naive_crc", output);
let output = less_naive_crc(input);
println!("{:<24} => 0x{:08x}", "less_naive_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "word_less_naive_crc", output);
let output = word_less_naive_crc(input);
println!("{:<24} => 0x{:08x}", "word_less_naive_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "table_crc", output);
let output = table_crc(input);
println!("{:<24} => 0x{:08x}", "table_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "small_table_crc", output);
let output = small_table_crc(input);
println!("{:<24} => 0x{:08x}", "small_table_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "barret_crc", output);
let output = barret_crc(input);
println!("{:<24} => 0x{:08x}", "barret_crc", output);
assert_eq!(output, expected);

let output = naive_crc(input);
println!("{:<19} => 0x{:08x}", "word_barret_crc", output);
let output = word_barret_crc(input);
println!("{:<24} => 0x{:08x}", "word_barret_crc", output);
assert_eq!(output, expected);

let output = reversed_barret_crc(input);
println!("{:<24} => 0x{:08x}", "reversed_barret_crc", output);
assert_eq!(output, expected);

let output = word_reversed_barret_crc(input);
println!("{:<24} => 0x{:08x}", "word_reversed_barret_crc", output);
assert_eq!(output, expected);

println!();
16 changes: 3 additions & 13 deletions examples/find-p.rs
@@ -86,20 +86,10 @@ pub fn is_generator(g: p128, p: p128) -> bool {
//
let width = (128-p.leading_zeros()) - 1;

// We're going to do a lot of multiplications, so it helps to precalculate
// Barret's constant for Barret reduction. This trades a modulus operation
// for 2 multiplications, but means we can leverage carry-less multiplication
// hardware instructions.
//
// normally this is just (1 << (2*width)) / p, but we can precompute
// one step of division to avoid needing a 4x wide type
//
let mask = (1u128 << width) - 1;
let barret_constant = (((mask & p) << width) / p) + (p128(1) << width);
// Multiplication uses carry-less multiplication modulo our irreducible
// polynomial
let gfmul = |a: p128, b: p128| -> p128 {
let x = a * b;
let q = ((x >> width) * barret_constant) >> width;
mask & ((q * p) + x)
(a * b) % p
};

// Exponentiation via squaring
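
The collapsed hunk continues with exponentiation via squaring; the
following standalone sketch shows that pattern against a gfmul of the
same shape as the closure above (gfpow and the toy GF(2^8) field are
illustrative, not the example's actual code):

// Exponentiation via squaring: computes g^e in O(log e) multiplications
fn gfpow(mut g: u128, mut e: u128, gfmul: impl Fn(u128, u128) -> u128) -> u128 {
    let mut x = 1; // multiplicative identity
    while e > 0 {
        if e & 1 != 0 {
            x = gfmul(x, g); // fold in squares on odd bits of e
        }
        g = gfmul(g, g); // repeated squaring
        e >>= 1;
    }
    x
}

fn main() {
    // toy field: GF(2^8) modulo the polynomial 0x11d
    let p: u128 = 0x11d;
    let gfmul = |a: u128, b: u128| -> u128 {
        // naive carry-less multiply...
        let mut x = 0;
        for i in 0..8 {
            if (b >> i) & 1 != 0 {
                x ^= a << i;
            }
        }
        // ...followed by naive reduction modulo p
        for i in (8..16).rev() {
            if (x >> i) & 1 != 0 {
                x ^= p << (i - 8);
            }
        }
        x
    };
    // every nonzero element of GF(2^8) satisfies g^255 == 1
    assert_eq!(gfpow(2, 255, gfmul), 1);
}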
6 changes: 1 addition & 5 deletions gf256-macros/Cargo.toml
@@ -10,11 +10,7 @@ proc-macro = true

[features]
# See gf256/Cargo.toml for documentation over these features
use-naive-xmul = []
use-hardware-xmul = []
use-naive-gfmul = []
use-table-gfmul = []
use-barret-gfmul = []
no-xmul = []

[dependencies]
syn = {version="1.0.73", features=["full"]}