From f61ae49e9241289097ba829271be7750b5429895 Mon Sep 17 00:00:00 2001 From: Christopher Haster Date: Sat, 30 Oct 2021 06:57:39 -0500 Subject: [PATCH] Tweaked how xmul is exposed, added p*::widening_mul Multiplication, and carry-less multiplication, are inherently a widening operation. Unfortunately, at the time of writing, the types in Rust don't capture this well, being built around fixed-width wrapping multiplication. Rust's stdlib can rely on compiler-level optimizations to clean up performance issues from unnecessarily-wide multiplications, but this becomes a bit of an issue for our library, especially for u64 types, since we rely on intrinsics, which may be hard for compilers to optimize around. This commit adds widening_mul, based on a proposal to add widening_mul to Rust's primitive types: https://github.com/rust-lang/rust/issues/85532 As well as several other tweaks to how xmul is provided, moving more arch-level details into xmul, but still limiting when it is emitted. --- Cargo.toml | 43 +---- Makefile | 17 +- benches/crc.rs | 14 ++ examples/crc.rs | 186 +++++++++++++++++--- examples/find-p.rs | 16 +- gf256-macros/Cargo.toml | 6 +- gf256-macros/src/gf.rs | 39 +++-- gf256-macros/src/lib.rs | 68 ++++--- gf256-macros/src/p.rs | 131 ++++++++------ src/lib.rs | 5 +- src/p.rs | 67 ++++--- src/xmul.rs | 379 ++++++++++++++++++++++++++++++++-------- 12 files changed, 692 insertions(+), 279 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6ad650e..c2c19dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,46 +42,15 @@ harness = false # tracking issue: # https://github.com/rust-lang/rust/issues/48556 # -use-nightly-features = [] +nightly = [] -# Makes p* types use a naive carry-less multiplication implementation -# using shifts and xors. Mainly useful for testing/benchmarking. +# Disable carry-less multiplication instructions, forcing the use +# of naive bitwise implementations # -# By default hardware xmul is used if available, falling back to a naive -# implementation. +# This is mostly available for testing, and in the case that hardware +# xmul is bugged (or more likely this crate is bugged). # -use-naive-xmul = ["gf256-macros/use-naive-xmul"] - -# Makes p* types require hardware-accelerated carry-less multiplication, -# causing a compile error if carry-less multiplication instructions aren't -# available in the current architecture. -# -# By default hardware xmul is used if available, falling back to a naive -# implementation. -# -use-hardware-xmul = ["gf256-macros/use-hardware-xmul"] - -# Make gf* types use a naive multiplication implementation using shifts -# and xors. Mainly useful for testing/benchmarking. -# -# By default log/antilog tables are used. -# -use-naive-gfmul = ["gf256-macros/use-naive-gfmul"] - -# Make gf* types use precompiled log/antilog tables. -# -# By default log/antilog tables are used. -# -use-table-gfmul = ["gf256-macros/use-table-gfmul"] - -# Makes gf* types use (potentially hardware accelerated) polynomial -# multiplication with Barret reduction. This is generally slower than using -# log/antilog tables, but may be useful if constant-time operations are -# required. -# -# By default log/antilog tables are used. 
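For reference, a minimal sketch of how the new widening_mul is meant to be used, together with the wrapping_mul/overflowing_mul relations it is defined by later in this patch. The gf256::p::p64 import path is an assumption; the (lo, hi) tuple order matches the implementation added below.

    use gf256::p::p64;

    fn widening_mul_sketch() {
        let a = p64(0x123456789abcdef1);
        let b = p64(0xfedcba9876543210);
        // full carry-less product, split into (low, high) 64-bit halves
        let (lo, hi) = a.widening_mul(b);
        // wrapping_mul keeps only the low half...
        assert_eq!(a.wrapping_mul(b), lo);
        // ...and overflow just means some bits landed in the high half
        assert_eq!(a.overflowing_mul(b), (lo, hi.0 != 0));
    }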
-# -use-barret-gfmul = ["gf256-macros/use-barret-gfmul"] +no-xmul = ["gf256-macros/no-xmul"] [dev-dependencies] criterion = {version="0.3", features=["html_reports"]} diff --git a/Makefile b/Makefile index d2d5a02..c7f2859 100644 --- a/Makefile +++ b/Makefile @@ -3,20 +3,21 @@ override ENV += RUSTFLAGS="-Ctarget-cpu=native" .PHONY: all build all build: - $(ENV) cargo build + $(ENV) cargo +nightly build --features nightly .PHONY: test test: - $(ENV) cargo test --lib - $(ENV) cargo test --example find-p - $(ENV) cargo run --example crc - $(ENV) cargo run --example shamir - $(ENV) cargo run --example raid - $(ENV) cargo run --example rs + $(ENV) cargo +nightly test --features nightly --lib + $(ENV) cargo +nightly test --features nightly --example find-p + $(ENV) cargo +nightly run --features nightly --example find-p -- -w9 -n4 -m1 + $(ENV) cargo +nightly run --features nightly --example crc + $(ENV) cargo +nightly run --features nightly --example shamir + $(ENV) cargo +nightly run --features nightly --example raid + $(ENV) cargo +nightly run --features nightly --example rs .PHONY: bench bench: - $(ENV) cargo +nightly bench --features use-nightly-features + $(ENV) cargo +nightly bench --features nightly .PHONY: clean clean: diff --git a/benches/crc.rs b/benches/crc.rs index 86f1f6a..0ba9317 100644 --- a/benches/crc.rs +++ b/benches/crc.rs @@ -78,6 +78,20 @@ fn bench_crc(c: &mut Criterion) { |data| crc::word_barret_crc(data), BatchSize::SmallInput )); + + let mut xs = xorshift64(42).map(|x| x as u8); + group.bench_function("reversed_barret_crc", |b| b.iter_batched_ref( + || (&mut xs).take(SIZE).collect::>(), + |data| crc::reversed_barret_crc(data), + BatchSize::SmallInput + )); + + let mut xs = xorshift64(42).map(|x| x as u8); + group.bench_function("word_reversed_barret_crc", |b| b.iter_batched_ref( + || (&mut xs).take(SIZE).collect::>(), + |data| crc::word_reversed_barret_crc(data), + BatchSize::SmallInput + )); } criterion_group!(benches, bench_crc); diff --git a/examples/crc.rs b/examples/crc.rs index f5e3180..a5d3080 100644 --- a/examples/crc.rs +++ b/examples/crc.rs @@ -164,17 +164,32 @@ pub fn small_table_crc(data: &[u8]) -> u32 { /// compile-time. /// pub fn barret_crc(data: &[u8]) -> u32 { - const BARRET_CONSTANT: p64 = { - p64(p128(0x10000000000000000) - .naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64) + // Normally this would be 0x10000000000000000 / __polynomial, but + // we eagerly do one step of division so we avoid needing a 4x wide + // type. We can also drop the highest bit if we add the high bits + // manually we use use this constant. + // + // = x % p + // = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32)) + // = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32)) + // = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32))) + // \-----+-----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u32s, + // leaving 2 xmuls and 2 xors. 
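The reduction derived above can also be written out as a standalone helper, which makes the two xmuls easier to see. A minimal sketch, assuming the input is a 64-bit polynomial split into 32-bit halves, with poly = p32::from_lossy(POLYNOMIAL) and the BARRET_CONSTANT defined just below; barret_reduce is a hypothetical name, not part of this patch:

    // reduce x = (hi << 32) | lo modulo the 33-bit polynomial p
    fn barret_reduce(hi: p32, lo: p32, poly: p32, barret_constant: p32) -> p32 {
        // q = ((hi * C) >> 32) + hi, where widening_mul(..).1 is exactly the ">> 32"
        let q = hi.widening_mul(barret_constant).1 + hi;
        // x % p = lo + low32(q * p); polynomial addition is xor
        lo + q.wrapping_mul(poly)
    }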
+ // + const BARRET_CONSTANT: p32 = { + p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32) }; let mut crc = p32(0xffffffff); for b in data { crc = crc ^ (p32::from(b.reverse_bits()) << 24); - let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32; - crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8); + crc = (crc << 8) + + ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32)) + .wrapping_mul(p32::from_lossy(POLYNOMIAL)); } u32::from(crc).reverse_bits() ^ 0xffffffff @@ -184,9 +199,23 @@ pub fn barret_crc(data: &[u8]) -> u32 { /// barret_crc, but operating on a 32-bit word at a time /// pub fn word_barret_crc(data: &[u8]) -> u32 { - const BARRET_CONSTANT: p64 = { - p64(p128(0x10000000000000000) - .naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64) + // Normally this would be 0x10000000000000000 / __polynomial, but + // we eagerly do one step of division so we avoid needing a 4x wide + // type. We can also drop the highest bit if we add the high bits + // manually we use use this constant. + // + // = x % p + // = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32)) + // = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32)) + // = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32))) + // \-----+-----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u32s, + // leaving 2 xmuls and 2 xors. + // + const BARRET_CONSTANT: p32 = { + p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32) }; let mut crc = p32(0xffffffff); @@ -196,19 +225,118 @@ pub fn word_barret_crc(data: &[u8]) -> u32 { for word in &mut words { let word = <[u8; 4]>::try_from(word).unwrap(); crc = crc ^ p32::from_le_bytes(word).reverse_bits(); - let q = (p64::from(crc)*BARRET_CONSTANT) >> 32; - crc = p32::from_lossy(q*POLYNOMIAL); + crc = (crc.widening_mul(BARRET_CONSTANT).1 + crc) + .wrapping_mul(p32::from_lossy(POLYNOMIAL)); } for b in words.remainder() { crc = crc ^ (p32::from(b.reverse_bits()) << 24); - let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32; - crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8); + crc = (crc << 8) + + ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32)) + .wrapping_mul(p32::from_lossy(POLYNOMIAL)); } u32::from(crc).reverse_bits() ^ 0xffffffff } +/// A hardware-accelerated CRC implementation using Barret reduction without +/// needing to bit-reverse the internal representation +/// +/// CRC32 and polynomial multiplication instructions unfortunately are defined +/// with different bit-endianness. This would normally mean we need to +/// bit-reverse the incoming data before we can use polynomial multiplication. +/// +/// However, polynomial multiplication has the odd property that it is +/// symmetric, brev(a) * brev(b) = brev((a * b) << 1) +/// +/// This means we can rewrite our Barret reduction CRC to operate entirely +/// on a bit-reversed representation, shaving off several instructions. +/// +/// In theory this should be faster, but measurements show this as actually +/// being slightly slower, perhaps the extra 1-bit shift costs more on +/// machines with bit-reverse instructions? +/// +pub fn reversed_barret_crc(data: &[u8]) -> u32 { + // Normally this would be 0x10000000000000000 / __polynomial, but + // we eagerly do one step of division so we avoid needing a 4x wide + // type. We can also drop the highest bit if we add the high bits + // manually we use use this constant. 
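The bit-reversal identity quoted in the doc comment above is easy to sanity-check on the narrower polynomial types. A quick sketch over 8-bit inputs, assuming p8/p16 expose reverse_bits and u32 shift amounts the same way p32 does:

    // brev(a) * brev(b) == brev((a * b) << 1), with a 16-bit product; the shift
    // cannot overflow because the product of two 8-bit polynomials has <= 15 bits
    fn check_brev_identity(a: p8, b: p8) {
        let prod = p16::from(a) * p16::from(b);
        let rev_prod = p16::from(a.reverse_bits()) * p16::from(b.reverse_bits());
        assert_eq!(rev_prod, (prod << 1u32).reverse_bits());
    }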
+ // + // = x % p + // = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32)) + // = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32)) + // = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32))) + // \-----+-----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u32s, + // leaving 2 xmuls and 2 xors. + // + const BARRET_CONSTANT: p32 = { + p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32) + }; + const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits(); + const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits(); + + let mut crc = p32(0xffffffff); + + for b in data { + crc = crc ^ p32::from(*b); + let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV); + let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV); + crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32)); + } + + u32::from(crc) ^ 0xffffffff +} + +/// A hardware-accelerated CRC implementation using the same technique as +/// reversed_barret_crc, but operating on a 32-bit word at a time +/// +pub fn word_reversed_barret_crc(data: &[u8]) -> u32 { + // Normally this would be 0x10000000000000000 / __polynomial, but + // we eagerly do one step of division so we avoid needing a 4x wide + // type. We can also drop the highest bit if we add the high bits + // manually we use use this constant. + // + // = x % p + // = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32)) + // = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32)) + // = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32))) + // \-----+-----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u32s, + // leaving 2 xmuls and 2 xors. 
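Since the reversed variants only re-encode the same reduction, they have to agree with the non-reversed functions above; main() below asserts this against a fixed expected value, but the property can also be stated directly (check_reversed_variants is an illustrative name):

    fn check_reversed_variants(data: &[u8]) {
        assert_eq!(reversed_barret_crc(data), barret_crc(data));
        assert_eq!(word_reversed_barret_crc(data), word_barret_crc(data));
    }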
+ // + const BARRET_CONSTANT: p32 = { + p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32) + }; + const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits(); + const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits(); + + let mut crc = p32(0xffffffff); + + // iterate over 4-byte words + let mut words = data.chunks_exact(4); + for word in &mut words { + let word = <[u8; 4]>::try_from(word).unwrap(); + crc = crc ^ p32::from_le_bytes(word); + let (lo, _) = crc.widening_mul(BARRET_CONSTANT_REV); + let (lo, hi) = ((lo << 1u32) + crc).widening_mul(POLYNOMIAL_REV); + crc = (hi << 1u32) | (lo >> 31u32); + } + + for b in words.remainder() { + crc = crc ^ p32::from(*b); + let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV); + let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV); + crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32)); + } + + u32::from(crc) ^ 0xffffffff +} + fn main() { let input = b"Hello World!"; @@ -217,31 +345,39 @@ fn main() { println!("testing crc({:?})", String::from_utf8_lossy(input)); let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "naive_crc", output); + println!("{:<24} => 0x{:08x}", "naive_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "less_naive_crc", output); + let output = less_naive_crc(input); + println!("{:<24} => 0x{:08x}", "less_naive_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "word_less_naive_crc", output); + let output = word_less_naive_crc(input); + println!("{:<24} => 0x{:08x}", "word_less_naive_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "table_crc", output); + let output = table_crc(input); + println!("{:<24} => 0x{:08x}", "table_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "small_table_crc", output); + let output = small_table_crc(input); + println!("{:<24} => 0x{:08x}", "small_table_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "barret_crc", output); + let output = barret_crc(input); + println!("{:<24} => 0x{:08x}", "barret_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "word_barret_crc", output); + let output = word_barret_crc(input); + println!("{:<24} => 0x{:08x}", "word_barret_crc", output); + assert_eq!(output, expected); + + let output = reversed_barret_crc(input); + println!("{:<24} => 0x{:08x}", "reversed_barret_crc", output); + assert_eq!(output, expected); + + let output = word_reversed_barret_crc(input); + println!("{:<24} => 0x{:08x}", "word_reversed_barret_crc", output); assert_eq!(output, expected); println!(); diff --git a/examples/find-p.rs b/examples/find-p.rs index 7aa71f5..f68d1cc 100644 --- a/examples/find-p.rs +++ b/examples/find-p.rs @@ -86,20 +86,10 @@ pub fn is_generator(g: p128, p: p128) -> bool { // let width = (128-p.leading_zeros()) - 1; - // We're going to do a lot of multiplications, so it helps to precalculate - // Barret's constant for Barret reduction. This trades a modulus operation - // for 2 multiplication, but means we can leverage carry-less multiplication - // hardware instructions. 
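The simplified gfmul that replaces this Barret setup (see the hunk just below) is used by is_generator inside an exponentiation-by-squaring loop. A minimal sketch of that loop, with gfpow as a hypothetical helper name; it assumes width <= 64 so the full carry-less product still fits in a p128:

    // exponentiation via squaring over GF(2^width), using plain (a * b) % p
    fn gfpow(mut g: p128, mut exp: u128, p: p128) -> p128 {
        let gfmul = |a: p128, b: p128| -> p128 { (a * b) % p };
        let mut x = p128(1);
        while exp > 0 {
            if exp & 1 != 0 {
                x = gfmul(x, g);
            }
            g = gfmul(g, g);
            exp >>= 1;
        }
        x
    }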
- // - // normally this is just (1 << (2*width)) / p, but we can precompute - // one step of division to avoid needing a 4x wide type - // - let mask = (1u128 << width) - 1; - let barret_constant = (((mask & p) << width) / p) + (p128(1) << width); + // Multiplication uses carry-less multiplicatio modulo our irreducible + // polynomial let gfmul = |a: p128, b: p128| -> p128 { - let x = a * b; - let q = ((x >> width) * barret_constant) >> width; - mask & ((q * p) + x) + (a * b) % p }; // Exponentiation via squaring diff --git a/gf256-macros/Cargo.toml b/gf256-macros/Cargo.toml index 339a114..463dd70 100644 --- a/gf256-macros/Cargo.toml +++ b/gf256-macros/Cargo.toml @@ -10,11 +10,7 @@ proc-macro = true [features] # See gf256/Cargo.toml for documentation over these features -use-naive-xmul = [] -use-hardware-xmul = [] -use-naive-gfmul = [] -use-table-gfmul = [] -use-barret-gfmul = [] +no-xmul = [] [dependencies] syn = {version="1.0.73", features=["full"]} diff --git a/gf256-macros/src/gf.rs b/gf256-macros/src/gf.rs index cc38e2c..d97cb6a 100644 --- a/gf256-macros/src/gf.rs +++ b/gf256-macros/src/gf.rs @@ -60,12 +60,23 @@ impl __gf { // Generate constant for Barret's reduction if we're // in Barret mode #[cfg(__if(__barret))] - const BARRET_CONSTANT: p16 = { - // normally this would be 0x10000 / __polynomial, but we eagerly - // do one step of division so we avoid needing a 4x wide type + const BARRET_CONSTANT: p8 = { + // Normally this would be 0x10000 / __polynomial, but we eagerly + // do one step of division so we avoid needing a 4x wide type. We + // can also drop the highest bit if we add the high bits manually + // we use use this constant. // - //p16(p32(0x10000).naive_div(p32(__polynomial)).0 as u16) - p16(__polynomial << 8).naive_div(p16(__polynomial)).naive_add(p16(0x100)) + // = x % p + // = 0xff & (x + p*(((x >> 8) * [0x10000/p]) >> 8)) + // = 0xff & (x + p*(((x >> 8) * [(p << 8)/p + 0x100]) >> 8)) + // = 0xff & (x + p*((((x >> 8) * [(p << 8)/p]) >> 8) + (x >> 8))) + // \-----+----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u8s, + // leaving 2 xmuls and 2 xors. + // + p8(p16(__polynomial << 8).naive_div(p16(__polynomial)).0 as u8) }; /// Addition over gf(256), aka xor @@ -136,7 +147,7 @@ impl __gf { /// these are more expensive, but also allowed in const contexts /// #[inline] - pub const fn checked_naive_recip(self) -> Option<__gf> { + pub const fn naive_checked_recip(self) -> Option<__gf> { if self.0 == 0 { return None; } @@ -154,7 +165,7 @@ impl __gf { /// #[inline] pub const fn naive_recip(self) -> __gf { - match self.checked_naive_recip() { + match self.naive_checked_recip() { Some(x) => x, None => __gf(1 / 0), } @@ -163,8 +174,8 @@ impl __gf { /// Naive division over gf(256) /// #[inline] - pub const fn checked_naive_div(self, other: __gf) -> Option<__gf> { - match other.checked_naive_recip() { + pub const fn naive_checked_div(self, other: __gf) -> Option<__gf> { + match other.naive_checked_recip() { Some(other_recip) => Some(self.naive_mul(other_recip)), None => None, } @@ -176,7 +187,7 @@ impl __gf { /// #[inline] pub const fn naive_div(self, other: __gf) -> __gf { - match self.checked_naive_div(other) { + match self.naive_checked_div(other) { Some(x) => x, None => __gf(self.0 / 0), } @@ -218,9 +229,11 @@ impl __gf { // useful here if we have hardware xmul instructions, though // it may be more expensive if xmul is naive. 
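For the renamed helpers above and the Barret-reduced multiply that follows, the observable behaviour has to stay the same regardless of which backend gets selected. A property sketch using the crate's gf256 instantiation of this template (the concrete type name is an assumption, it is not spelled out in this file):

    fn check_gf_mul(a: gf256, b: gf256) {
        // tables, Barret reduction, and the naive path must all agree
        assert_eq!(a * b, a.naive_mul(b));
        // and the renamed const helpers stay consistent with multiplication
        if let Some(r) = b.naive_checked_recip() {
            assert_eq!((a * b).naive_mul(r), a);
        }
    }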
// - let x = p16(self.0 as u16) * p16(other.0 as u16); - let q = (p16::mul(x >> 8, Self::BARRET_CONSTANT) >> 8); - __gf((p16::mul(q, Self::POLYNOMIAL) + x).0 as u8) + let (lo, hi) = p8(self.0).widening_mul(p8(other.0)); + let x = lo + + (hi.widening_mul(Self::BARRET_CONSTANT).1 + hi) + .wrapping_mul(p8(Self::POLYNOMIAL.0 as u8)); + __gf(x.0 as u8) } else { // fallback to naive multiplication over gf(256) self.naive_mul(other) diff --git a/gf256-macros/src/lib.rs b/gf256-macros/src/lib.rs index a2d79ac..1ca44c5 100644 --- a/gf256-macros/src/lib.rs +++ b/gf256-macros/src/lib.rs @@ -27,6 +27,24 @@ fn crate_() -> TokenTree { )) } +fn xmul_query() -> TokenStream { + quote! { + any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) + ) + } +} + fn token_replace( input: TokenStream, replacements: &HashMap @@ -154,7 +172,7 @@ struct PArgs { #[darling(default)] naive: bool, #[darling(default)] - hardware: bool, + xmul: bool, } #[proc_macro_attribute] @@ -165,8 +183,8 @@ pub fn p( let crate_ = crate_(); // parse args - let args = parse_macro_input!(args as syn::AttributeArgs); - let args = match PArgs::from_list(&args) { + let raw_args = parse_macro_input!(args as syn::AttributeArgs); + let args = match PArgs::from_list(&raw_args) { Ok(args) => args, Err(err) => { return err.write_errors().into(); @@ -174,18 +192,20 @@ pub fn p( }; // decide between implementations - let (naive, hardware) = match ( - (args.naive, args.hardware), - (cfg!(feature="use-naive-xmul"), cfg!(feature="use-hardware-xmul")) - ) { - // choose mode if one is explicitly requested - ((true, false), _ ) => (true, false), - ((false, false), (true, false)) => (true, false), - ((false, true,), _ ) => (false, true ), - ((false, false), (false, true )) => (false, true ), - - // default to neither, let the p* implementation make the decision - ((false, false), (false, false)) => (false, false), + let has_xmul = match (args.naive, args.xmul) { + (true, false) => false, + (false, true) => true, + (false, false) => { + // query target configuration and recurse back into our proc_macro + let input = TokenStream::from(input); + let xmul_query = xmul_query(); + let output = quote! { + #[cfg_attr(#xmul_query, #crate_::macros::p(xmul, #(#raw_args),*))] + #[cfg_attr(not(#xmul_query), #crate_::macros::p(naive, #(#raw_args),*))] + #input + }; + return output.into(); + }, // multiple modes selected? _ => panic!("invalid configuration of macro p (naive, hardware?)"), @@ -202,11 +222,11 @@ pub fn p( // let input = TokenStream::from(input); let output = quote! 
{ - #[cfg_attr(target_pointer_width="8", #crate_::macros::p(u="usize", width=8, naive=#naive, hardware=#hardware))] - #[cfg_attr(target_pointer_width="16", #crate_::macros::p(u="usize", width=16, naive=#naive, hardware=#hardware))] - #[cfg_attr(target_pointer_width="32", #crate_::macros::p(u="usize", width=32, naive=#naive, hardware=#hardware))] - #[cfg_attr(target_pointer_width="64", #crate_::macros::p(u="usize", width=64, naive=#naive, hardware=#hardware))] - #[cfg_attr(target_pointer_width="128", #crate_::macros::p(u="usize", width=128, naive=#naive, hardware=#hardware))] + #[cfg_attr(target_pointer_width="8", #crate_::macros::p(width=8, #(#raw_args),*))] + #[cfg_attr(target_pointer_width="16", #crate_::macros::p(width=16, #(#raw_args),*))] + #[cfg_attr(target_pointer_width="32", #crate_::macros::p(width=32, #(#raw_args),*))] + #[cfg_attr(target_pointer_width="64", #crate_::macros::p(width=64, #(#raw_args),*))] + #[cfg_attr(target_pointer_width="128", #crate_::macros::p(width=128, #(#raw_args),*))] #input }; return output.into(); @@ -239,11 +259,11 @@ pub fn p( ("__is_usize".to_owned(), TokenTree::Ident( Ident::new(&format!("{}", args.u == "usize"), Span::call_site()) )), - ("__naive".to_owned(), TokenTree::Ident( - Ident::new(&format!("{}", naive), Span::call_site()) + ("__has_xmul".to_owned(), TokenTree::Ident( + Ident::new(&format!("{}", has_xmul), Span::call_site()) )), - ("__hardware".to_owned(), TokenTree::Ident( - Ident::new(&format!("{}", hardware), Span::call_site()) + ("__xmul".to_owned(), TokenTree::Ident( + Ident::new(&format!("xmul{}", width), Span::call_site()) )), ("__crate".to_owned(), crate_), ]); diff --git a/gf256-macros/src/p.rs b/gf256-macros/src/p.rs index 6cc8d38..b2603cf 100644 --- a/gf256-macros/src/p.rs +++ b/gf256-macros/src/p.rs @@ -43,6 +43,31 @@ impl __p { __p(self.0 ^ other.0) } + /// Naive polynomial multiplication + /// + /// Naive versions are built out of simple bitwise operations, + /// these are more expensive, but also allowed in const contexts + /// + /// This return a tuple containing the low and high parts in that order + /// + #[inline] + pub const fn naive_widening_mul(self, other: __p) -> (__p, __p) { + let a = self.0; + let b = other.0; + let mut lo = 0; + let mut hi = 0; + let mut i = 0; + while i < __width { + let mask = (((a as __i) << (__width-1-i)) >> (__width-1)) as __u; + lo ^= mask & (b << i); + hi ^= mask & (b >> (__width-1-i)); + i += 1; + } + // note we adjust hi by one here, otherwise we'd need to handle + // shifting > word size + (__p(lo), __p(hi >> 1)) + } + /// Naive polynomial multiplication /// /// Naive versions are built out of simple bitwise operations, @@ -52,11 +77,9 @@ impl __p { /// a flag indicating of overflow occured /// #[inline] - pub const fn overflowing_naive_mul(self, other: __p) -> (__p, bool) { - // x bits * y bits = x+y-1 bits, if this is more bits than the - // width we will overflow - let o = self.0.leading_zeros() + other.0.leading_zeros() < __width-1; - (self.wrapping_naive_mul(other), o) + pub const fn naive_overflowing_mul(self, other: __p) -> (__p, bool) { + let (lo, hi) = self.naive_widening_mul(other); + (lo, hi.0 != 0) } /// Naive polynomial multiplication @@ -67,8 +90,8 @@ impl __p { /// Note this returns None if an overflow occured /// #[inline] - pub const fn checked_naive_mul(self, other: __p) -> Option<__p> { - match self.overflowing_naive_mul(other) { + pub const fn naive_checked_mul(self, other: __p) -> Option<__p> { + match self.naive_overflowing_mul(other) { (_, true ) => None, (x, false) 
=> Some(x), } @@ -82,7 +105,7 @@ impl __p { /// Note this wraps around the boundary of the type /// #[inline] - pub const fn wrapping_naive_mul(self, other: __p) -> __p { + pub const fn naive_wrapping_mul(self, other: __p) -> __p { let a = self.0; let b = other.0; let mut x = 0; @@ -108,12 +131,34 @@ impl __p { cfg_if! { // TODO feature flag for overflow-checks? if #[cfg(debug_assertions)] { - match self.checked_naive_mul(other) { + match self.naive_checked_mul(other) { Some(x) => x, None => __p(self.0 / 0), } } else { - self.wrapping_naive_mul(other) + self.naive_wrapping_mul(other) + } + } + } + + /// Naive polynomial multiplication + /// + /// This attempts to use carry-less multiplication + /// instructions when available (pclmulqdq on x86_64, + /// pmull on aarch64), otherwise falls back to the expensive + /// naive implementation + /// + /// This return a tuple containing the low and high parts in that order + /// + #[inline] + pub fn widening_mul(self, other: __p) -> (__p, __p) { + cfg_if! { + if #[cfg(__if(__has_xmul))] { + use __crate::internal::xmul::*; + let (lo, hi) = __xmul(self.0 as _, other.0 as _); + (__p(lo as __u), __p(hi as __u)) + } else { + self.naive_widening_mul(other) } } } @@ -130,10 +175,8 @@ impl __p { /// #[inline] pub fn overflowing_mul(self, other: __p) -> (__p, bool) { - // x bits * y bits = x+y-1 bits, if this is more bits than the - // width we will overflow - let o = self.0.leading_zeros() + other.0.leading_zeros() < __width-1; - (self.wrapping_mul(other), o) + let (lo, hi) = self.widening_mul(other); + (lo, hi.0 != 0) } /// Polynomial multiplication @@ -165,37 +208,11 @@ impl __p { #[inline] pub fn wrapping_mul(self, other: __p) -> __p { cfg_if! { - if #[cfg(all( - __if(!__naive), - target_arch="x86_64", - target_feature="pclmulqdq" - ))] { + if #[cfg(__if(__has_xmul))] { use __crate::internal::xmul::*; - - cfg_if! { - if #[cfg(__if(__width <= 64))] { - __p(__pclmulqdq_u64(self.0 as u64, other.0 as u64) as __u) - } else { - __p(__pclmulqdq_u128(self.0, other.0)) - } - } - } else if #[cfg(all( - __if(!__naive), - feature="use-nightly-features", - target_arch="aarch64", - target_feature="neon" - ))] { - use __crate::internal::xmul::*; - - cfg_if! 
{ - if #[cfg(__if(__width <= 64))] { - __p(__pmull_u64(self.0 as u64, other.0 as u64) as __u) - } else { - __p(__pmull_u128(self.0, other.0)) - } - } - } else if #[cfg(__if(!__hardware))] { - self.wrapping_naive_mul(other) + __p(__xmul(self.0 as _, other.0 as _).0 as __u) + } else { + self.naive_wrapping_mul(other) } } } @@ -225,14 +242,14 @@ impl __p { /// Naive polynomial exponentiation #[inline] - pub const fn overflowing_naive_pow(self, exp: u32) -> (__p, bool) { + pub const fn naive_overflowing_pow(self, exp: u32) -> (__p, bool) { let mut a = self; let mut exp = exp; let mut x = __p(1); let mut o = false; loop { if exp & 1 != 0 { - let (x_, o_) = x.overflowing_naive_mul(a); + let (x_, o_) = x.naive_overflowing_mul(a); x = x_; o = o || o_; } @@ -241,7 +258,7 @@ impl __p { if exp == 0 { return (x, o); } - let (a_, o_) = a.overflowing_naive_mul(a); + let (a_, o_) = a.naive_overflowing_mul(a); a = a_; o = o || o_; } @@ -249,13 +266,13 @@ impl __p { /// Naive polynomial exponentiation #[inline] - pub const fn checked_naive_pow(self, exp: u32) -> Option<__p> { + pub const fn naive_checked_pow(self, exp: u32) -> Option<__p> { let mut a = self; let mut exp = exp; let mut x = __p(1); loop { if exp & 1 != 0 { - x = match x.checked_naive_mul(a) { + x = match x.naive_checked_mul(a) { Some(x) => x, None => return None, } @@ -265,7 +282,7 @@ impl __p { if exp == 0 { return Some(x); } - a = match a.checked_naive_mul(a) { + a = match a.naive_checked_mul(a) { Some(a) => a, None => return None, } @@ -274,20 +291,20 @@ impl __p { /// Naive polynomial exponentiation #[inline] - pub const fn wrapping_naive_pow(self, exp: u32) -> __p { + pub const fn naive_wrapping_pow(self, exp: u32) -> __p { let mut a = self; let mut exp = exp; let mut x = __p(1); loop { if exp & 1 != 0 { - x = x.wrapping_naive_mul(a); + x = x.naive_wrapping_mul(a); } exp >>= 1; if exp == 0 { return x; } - a = a.wrapping_naive_mul(a); + a = a.naive_wrapping_mul(a); } } @@ -403,7 +420,7 @@ impl __p { /// these are more expensive, but also allowed in const contexts /// #[inline] - pub const fn checked_naive_div(self, other: __p) -> Option<__p> { + pub const fn naive_checked_div(self, other: __p) -> Option<__p> { if other.0 == 0 { None } else { @@ -428,7 +445,7 @@ impl __p { /// #[inline] pub const fn naive_div(self, other: __p) -> __p { - match self.checked_naive_div(other) { + match self.naive_checked_div(other) { Some(x) => x, None => __p(self.0 / 0), } @@ -440,7 +457,7 @@ impl __p { /// these are more expensive, but also allowed in const contexts /// #[inline] - pub const fn checked_naive_rem(self, other: __p) -> Option<__p> { + pub const fn naive_checked_rem(self, other: __p) -> Option<__p> { if other.0 == 0 { None } else { @@ -464,7 +481,7 @@ impl __p { /// #[inline] pub const fn naive_rem(self, other: __p) -> __p { - match self.checked_naive_rem(other) { + match self.naive_checked_rem(other) { Some(x) => x, None => __p(self.0 / 0), } diff --git a/src/lib.rs b/src/lib.rs index 14aee41..668b398 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ // Enable stdsimd for pmull on aarch64 #![cfg_attr( - all(feature="use-nightly-features", target_arch="aarch64"), + all(not(feature="no-xmul"), feature="nightly", target_arch="aarch64"), feature(stdsimd) )] @@ -26,3 +26,6 @@ pub mod internal { pub use cfg_if; pub mod xmul; } + +pub use internal::xmul::HAS_XMUL; + diff --git a/src/p.rs b/src/p.rs index a0a8c15..903bb7b 100644 --- a/src/p.rs +++ b/src/p.rs @@ -35,11 +35,11 @@ mod test { #[test] fn mul() { - 
assert_eq!(p8(0xfe).wrapping_naive_mul(p8(0x87)), p8(0xfa)); - assert_eq!(p16(0xfedc).wrapping_naive_mul(p16(0x8765)), p16(0x7d2c)); - assert_eq!(p32(0xfedcba98).wrapping_naive_mul(p32(0x87654321)), p32(0x03da4198)); - assert_eq!(p64(0xfedcba9876543210).wrapping_naive_mul(p64(0x8765432100000000)), p64(0x0050401000000000)); - assert_eq!(p128(0xfedcba98765432100000000000000000).wrapping_naive_mul(p128(0x87654321000000000000000000000000)), p128(0x00000000000000000000000000000000)); + assert_eq!(p8(0xfe).naive_wrapping_mul(p8(0x87)), p8(0xfa)); + assert_eq!(p16(0xfedc).naive_wrapping_mul(p16(0x8765)), p16(0x7d2c)); + assert_eq!(p32(0xfedcba98).naive_wrapping_mul(p32(0x87654321)), p32(0x03da4198)); + assert_eq!(p64(0xfedcba9876543210).naive_wrapping_mul(p64(0x8765432100000000)), p64(0x0050401000000000)); + assert_eq!(p128(0xfedcba98765432100000000000000000).naive_wrapping_mul(p128(0x87654321000000000000000000000000)), p128(0x00000000000000000000000000000000)); assert_eq!(p8(0xfe).wrapping_mul(p8(0x87)), p8(0xfa)); assert_eq!(p16(0xfedc).wrapping_mul(p16(0x8765)), p16(0x7d2c)); @@ -79,12 +79,12 @@ mod test { } #[test] - fn hardware_mul() { + fn naive_mul() { for a in (0..=255).map(p8) { for b in (0..=255).map(p8) { - let res_naive = a.wrapping_naive_mul(b); - let res_hardware = a.wrapping_mul(b); - assert_eq!(res_naive, res_hardware); + let naive_res = a.naive_wrapping_mul(b); + let res_xmul = a.wrapping_mul(b); + assert_eq!(naive_res, res_xmul); } } } @@ -93,23 +93,48 @@ mod test { fn overflowing_mul() { for a in (0..=255).map(p8) { for b in (0..=255).map(p8) { - let (wrapped_naive, overflow_naive) = a.overflowing_naive_mul(b); - let (wrapped_hardware, overflow_hardware) = a.overflowing_mul(b); - let res_naive = p16::from(a).naive_mul(p16::from(b)); - let res_hardware = p16::from(a) * p16::from(b); + let (naive_wrapped, naive_overflow) = a.naive_overflowing_mul(b); + let (wrapped_xmul, overflow_xmul) = a.overflowing_mul(b); + let naive_res = p16::from(a).naive_mul(p16::from(b)); + let res_xmul = p16::from(a) * p16::from(b); - // same results naive vs hardware? - assert_eq!(wrapped_naive, wrapped_hardware); - assert_eq!(overflow_naive, overflow_hardware); - assert_eq!(res_naive, res_hardware); + // same results naive vs xmul? + assert_eq!(naive_wrapped, wrapped_xmul); + assert_eq!(naive_overflow, overflow_xmul); + assert_eq!(naive_res, res_xmul); // same wrapped results? - assert_eq!(wrapped_naive, p8::try_from(res_naive & 0xff).unwrap()); - assert_eq!(wrapped_hardware, p8::try_from(res_hardware & 0xff).unwrap()); + assert_eq!(naive_wrapped, p8::try_from(naive_res & 0xff).unwrap()); + assert_eq!(wrapped_xmul, p8::try_from(res_xmul & 0xff).unwrap()); // overflow set if overflow occured? - assert_eq!(overflow_naive, (p16::from(wrapped_naive) != res_naive)); - assert_eq!(overflow_hardware, (p16::from(wrapped_hardware) != res_hardware)); + assert_eq!(naive_overflow, (p16::from(naive_wrapped) != naive_res)); + assert_eq!(overflow_xmul, (p16::from(wrapped_xmul) != res_xmul)); + } + } + } + + #[test] + fn widening_mul() { + for a in (0..=255).map(p8) { + for b in (0..=255).map(p8) { + let (naive_lo, naive_hi) = a.naive_widening_mul(b); + let (lo_xmul, hi_xmul ) = a.widening_mul(b); + let naive_res = p16::from(a).naive_mul(p16::from(b)); + let res_xmul = p16::from(a) * p16::from(b); + + // same results naive vs xmul? + assert_eq!(naive_lo, lo_xmul); + assert_eq!(naive_hi, hi_xmul); + assert_eq!(naive_res, res_xmul); + + // same lo results? 
+ assert_eq!(naive_lo, p8::try_from(naive_res & 0xff).unwrap()); + assert_eq!(lo_xmul, p8::try_from(res_xmul & 0xff).unwrap()); + + // same hi results? + assert_eq!(naive_hi, p8::try_from(naive_res >> 8).unwrap()); + assert_eq!(hi_xmul, p8::try_from(res_xmul >> 8).unwrap()); } } } diff --git a/src/xmul.rs b/src/xmul.rs index 36654d3..072a0dd 100644 --- a/src/xmul.rs +++ b/src/xmul.rs @@ -6,108 +6,337 @@ //! features unless the feature is enabled with #[feature!] at the crate //! level. //! -//! These functions may or may not exist depending on what target_features -//! are available, so they shouldn't be used directly. +//! These functions are intended to only be used by gf256's proc_macros, +//! these funcitons may or may not be available depending on target_features, +//! and may change behavior, so they shouldn't be used directly. //! +use cfg_if::cfg_if; -/// x86_64 provides 64-bit xmul via the pclmulqdq instruction -#[cfg(all( - target_arch="x86_64", - target_feature="pclmulqdq" + +/// True if carry-less multiplication instructions are available +/// +/// If this is false, any carry-less multiplication operations +/// will use a more expensive bitwise implementation. +/// +/// Some algorithms trade expensive division/remainder operations for +/// multiple multiplication operations, but this can backfire if +/// multiplication is also expensive. This flag allows algorithms +/// to choose the best strategy based on what's available. +/// +pub const HAS_XMUL: bool = { + cfg_if! { + if #[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) + ))] { + true + } else { + false + } + } +}; + + +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pclmulqdq(a: u64, b: u64) -> u128 { - use core::arch::x86_64::*; - unsafe { - let a = _mm_set_epi64x(0, a as i64); - let b = _mm_set_epi64x(0, b as i64); - let x = _mm_clmulepi64_si128::<0>(a, b); - let x0 = _mm_extract_epi64::<0>(x) as u64; - let x1 = _mm_extract_epi64::<1>(x) as u64; - ((x1 as u128) << 64) | (x0 as u128) +pub fn xmul8(a: u8, b: u8) -> (u8, u8) { + cfg_if! 
{ + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x(0, a as i64); + let b = _mm_set_epi64x(0, b as i64); + let x = _mm_clmulepi64_si128::<0>(a, b); + let lo = _mm_extract_epi64::<0>(x) as u64; + (lo as u8, (lo >> 8) as u8) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + (x as u8, (x >> 8) as u8) + } + } } } -/// x86_64 provides 64-bit xmul via the pclmulqdq instruction -#[cfg(all( - target_arch="x86_64", - target_feature="pclmulqdq" +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pclmulqdq_u64(a: u64, b: u64) -> u64 { - use core::arch::x86_64::*; - unsafe { - let a = _mm_set_epi64x(0, a as i64); - let b = _mm_set_epi64x(0, b as i64); - let x = _mm_clmulepi64_si128::<0>(a, b); - _mm_extract_epi64::<0>(x) as u64 +pub fn xmul16(a: u16, b: u16) -> (u16, u16) { + cfg_if! { + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x(0, a as i64); + let b = _mm_set_epi64x(0, b as i64); + let x = _mm_clmulepi64_si128::<0>(a, b); + let lo = _mm_extract_epi64::<0>(x) as u64; + (lo as u16, (lo >> 16) as u16) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + (x as u16, (x >> 16) as u16) + } + } } } -/// x86_64 provides 64-bit xmul via the pclmulqdq instruction -#[cfg(all( - target_arch="x86_64", - target_feature="pclmulqdq" +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pclmulqdq_u128(a: u128, b: u128) -> u128 { - use core::arch::x86_64::*; - unsafe { - let a = _mm_set_epi64x((a >> 64) as i64, a as i64); - let b = _mm_set_epi64x((b >> 64) as i64, b as i64); - let x = _mm_clmulepi64_si128::<0x00>(a, b); - let y = _mm_clmulepi64_si128::<0x01>(a, b); - let z = _mm_clmulepi64_si128::<0x10>(a, b); - let x0 = _mm_extract_epi64::<0>(x) as u64; - let x1 = (_mm_extract_epi64::<1>(x) as u64) - ^ (_mm_extract_epi64::<0>(y) as u64) - ^ (_mm_extract_epi64::<0>(z) as u64); - ((x1 as u128) << 64) | (x0 as u128) +pub fn xmul32(a: u32, b: u32) -> (u32, u32) { + cfg_if! 
{ + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x(0, a as i64); + let b = _mm_set_epi64x(0, b as i64); + let x = _mm_clmulepi64_si128::<0>(a, b); + let lo = _mm_extract_epi64::<0>(x) as u64; + (lo as u32, (lo >> 32) as u32) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + (x as u32, (x >> 32) as u32) + } + } } } -/// aarch64 provides 64-bit xmul via the pmull instruction -#[cfg(all( - feature="use-nightly-features", - target_arch="aarch64", - target_feature="neon" +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pmull(a: u64, b: u64) -> u128 { - use core::arch::aarch64::*; - unsafe { - vmull_p64(a, b) +pub fn xmul64(a: u64, b: u64) -> (u64, u64) { + cfg_if! { + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x(0, a as i64); + let b = _mm_set_epi64x(0, b as i64); + let x = _mm_clmulepi64_si128::<0>(a, b); + let lo = _mm_extract_epi64::<0>(x) as u64; + let hi = _mm_extract_epi64::<1>(x) as u64; + (lo, hi) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + (x as u64, (x >> 64) as u64) + } + } } } -/// aarch64 provides 64-bit xmul via the pmull instruction -#[cfg(all( - feature="use-nightly-features", - target_arch="aarch64", - target_feature="neon" +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pmull_u64(a: u64, b: u64) -> u64 { - use core::arch::aarch64::*; - unsafe { - vmull_p64(a, b) as u64 +pub fn xmul128(a: u128, b: u128) -> (u128, u128) { + cfg_if! 
{ + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x((a >> 64) as i64, a as i64); + let b = _mm_set_epi64x((b >> 64) as i64, b as i64); + let x = _mm_clmulepi64_si128::<0x00>(a, b); + let y = _mm_clmulepi64_si128::<0x01>(a, b); + let z = _mm_clmulepi64_si128::<0x10>(a, b); + let w = _mm_clmulepi64_si128::<0x11>(a, b); + let lolo = _mm_extract_epi64::<0>(x) as u64; + let lohi = (_mm_extract_epi64::<1>(x) as u64) + ^ (_mm_extract_epi64::<0>(y) as u64) + ^ (_mm_extract_epi64::<0>(z) as u64); + let hilo = (_mm_extract_epi64::<0>(w) as u64) + ^ (_mm_extract_epi64::<1>(y) as u64) + ^ (_mm_extract_epi64::<1>(z) as u64); + let hihi = _mm_extract_epi64::<1>(w) as u64; + let lo = ((lohi as u128) << 64) | (lolo as u128); + let hi = ((hihi as u128) << 64) | (hilo as u128); + (lo, hi) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + let y = vmull_p64((a >> 64) as u64, (b >> 0) as u64); + let z = vmull_p64((a >> 0) as u64, (b >> 64) as u64); + let w = vmull_p64((a >> 64) as u64, (b >> 64) as u64); + (x ^ (y << 64) ^ (z << 64), w ^ (y >> 64) ^ (z >> 64)) + } + } } } -/// aarch64 provides 64-bit xmul via the pmull instruction -#[cfg(all( - feature="use-nightly-features", - target_arch="aarch64", - target_feature="neon" -))] -#[inline] -pub fn __pmull_u128(a: u128, b: u128) -> u128 { - use core::arch::aarch64::*; - unsafe { - let x = vmull_p64(a as u64, b as u64); - let y = vmull_p64((a >> 64) as u64, (b >> 0) as u64) << 64; - let z = vmull_p64((a >> 0) as u64, (b >> 64) as u64) << 64; - x ^ y ^ z + +#[cfg(test)] +mod test { + use super::*; + + #[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) + ))] + #[test] + fn xmul() { + assert_eq!(xmul8(0x12, 0x12), (0x04, 0x01)); + assert_eq!(xmul16(0x1234, 0x1234), (0x0510, 0x0104)); + assert_eq!(xmul32(0x12345678, 0x12345678), (0x11141540, 0x01040510)); + assert_eq!(xmul64(0x123456789abcdef1, 0x123456789abcdef1), (0x4144455051545501, 0x0104051011141540)); + assert_eq!(xmul128(0x123456789abcdef123456789abcdef12, 0x123456789abcdef123456789abcdef12), (0x04051011141540414445505154550104, 0x01040510111415404144455051545501)); } }
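With xmul exposed this way, the re-exported gf256::HAS_XMUL flag is what lets callers make the strategy choice described in its doc comment. A short sketch combining it with the example CRCs from examples/crc.rs (crc32_auto is an illustrative name, not part of this patch):

    fn crc32_auto(data: &[u8]) -> u32 {
        if gf256::HAS_XMUL {
            // Barret reduction costs 2 xmuls per word but needs no tables,
            // which only pays off when xmul is a real instruction
            word_barret_crc(data)
        } else {
            // otherwise the precomputed-table implementation is the better trade
            table_crc(data)
        }
    }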