From f61ae49e9241289097ba829271be7750b5429895 Mon Sep 17 00:00:00 2001 From: Christopher Haster Date: Sat, 30 Oct 2021 06:57:39 -0500 Subject: [PATCH] Tweaked how xmul is exposed, added p*::widening_mul Multiplication, and carry-less multiplication, are inherently a widening operation. Unfortunately, at the time of writing, the types in Rust don't capture this well, being built around fixed-width wrapping multiplication. Rust's stdlib can rely on compiler-level optimizations to clean up performance issues from unnecessarily-wide multiplications, but this becomes a bit of an issue for our library, especially for u64 types, since we rely on intrinsics, which may be hard for compilers to optimize around. This commit adds widening_mul, based on a proposal to add widening_mul to Rust's primitive types: https://github.com/rust-lang/rust/issues/85532 As well as several other tweaks to how xmul is provided, moving more arch-level details into xmul, but still limiting when it is emitted. --- Cargo.toml | 43 +---- Makefile | 17 +- benches/crc.rs | 14 ++ examples/crc.rs | 186 +++++++++++++++++--- examples/find-p.rs | 16 +- gf256-macros/Cargo.toml | 6 +- gf256-macros/src/gf.rs | 39 +++-- gf256-macros/src/lib.rs | 68 ++++--- gf256-macros/src/p.rs | 131 ++++++++------ src/lib.rs | 5 +- src/p.rs | 67 ++++--- src/xmul.rs | 379 ++++++++++++++++++++++++++++++++-------- 12 files changed, 692 insertions(+), 279 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6ad650e..c2c19dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,46 +42,15 @@ harness = false # tracking issue: # https://github.com/rust-lang/rust/issues/48556 # -use-nightly-features = [] +nightly = [] -# Makes p* types use a naive carry-less multiplication implementation -# using shifts and xors. Mainly useful for testing/benchmarking. +# Disable carry-less multiplication instructions, forcing the use +# of naive bitwise implementations # -# By default hardware xmul is used if available, falling back to a naive -# implementation. +# This is mostly available for testing, and in the case that hardware +# xmul is bugged (or more likely this crate is bugged). # -use-naive-xmul = ["gf256-macros/use-naive-xmul"] - -# Makes p* types require hardware-accelerated carry-less multiplication, -# causing a compile error if carry-less multiplication instructions aren't -# available in the current architecture. -# -# By default hardware xmul is used if available, falling back to a naive -# implementation. -# -use-hardware-xmul = ["gf256-macros/use-hardware-xmul"] - -# Make gf* types use a naive multiplication implementation using shifts -# and xors. Mainly useful for testing/benchmarking. -# -# By default log/antilog tables are used. -# -use-naive-gfmul = ["gf256-macros/use-naive-gfmul"] - -# Make gf* types use precompiled log/antilog tables. -# -# By default log/antilog tables are used. -# -use-table-gfmul = ["gf256-macros/use-table-gfmul"] - -# Makes gf* types use (potentially hardware accelerated) polynomial -# multiplication with Barret reduction. This is generally slower than using -# log/antilog tables, but may be useful if constant-time operations are -# required. -# -# By default log/antilog tables are used. 
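For reference, a minimal sketch of how the new widening_mul is meant to be used, together with the wrapping_mul/overflowing_mul relations it is defined by later in this patch. The gf256::p::p64 import path is an assumption; the (lo, hi) tuple order matches the implementation added below.

    use gf256::p::p64;

    fn widening_mul_sketch() {
        let a = p64(0x123456789abcdef1);
        let b = p64(0xfedcba9876543210);
        // full carry-less product, split into (low, high) 64-bit halves
        let (lo, hi) = a.widening_mul(b);
        // wrapping_mul keeps only the low half...
        assert_eq!(a.wrapping_mul(b), lo);
        // ...and overflow just means some bits landed in the high half
        assert_eq!(a.overflowing_mul(b), (lo, hi.0 != 0));
    }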
-# -use-barret-gfmul = ["gf256-macros/use-barret-gfmul"] +no-xmul = ["gf256-macros/no-xmul"] [dev-dependencies] criterion = {version="0.3", features=["html_reports"]} diff --git a/Makefile b/Makefile index d2d5a02..c7f2859 100644 --- a/Makefile +++ b/Makefile @@ -3,20 +3,21 @@ override ENV += RUSTFLAGS="-Ctarget-cpu=native" .PHONY: all build all build: - $(ENV) cargo build + $(ENV) cargo +nightly build --features nightly .PHONY: test test: - $(ENV) cargo test --lib - $(ENV) cargo test --example find-p - $(ENV) cargo run --example crc - $(ENV) cargo run --example shamir - $(ENV) cargo run --example raid - $(ENV) cargo run --example rs + $(ENV) cargo +nightly test --features nightly --lib + $(ENV) cargo +nightly test --features nightly --example find-p + $(ENV) cargo +nightly run --features nightly --example find-p -- -w9 -n4 -m1 + $(ENV) cargo +nightly run --features nightly --example crc + $(ENV) cargo +nightly run --features nightly --example shamir + $(ENV) cargo +nightly run --features nightly --example raid + $(ENV) cargo +nightly run --features nightly --example rs .PHONY: bench bench: - $(ENV) cargo +nightly bench --features use-nightly-features + $(ENV) cargo +nightly bench --features nightly .PHONY: clean clean: diff --git a/benches/crc.rs b/benches/crc.rs index 86f1f6a..0ba9317 100644 --- a/benches/crc.rs +++ b/benches/crc.rs @@ -78,6 +78,20 @@ fn bench_crc(c: &mut Criterion) { |data| crc::word_barret_crc(data), BatchSize::SmallInput )); + + let mut xs = xorshift64(42).map(|x| x as u8); + group.bench_function("reversed_barret_crc", |b| b.iter_batched_ref( + || (&mut xs).take(SIZE).collect::>(), + |data| crc::reversed_barret_crc(data), + BatchSize::SmallInput + )); + + let mut xs = xorshift64(42).map(|x| x as u8); + group.bench_function("word_reversed_barret_crc", |b| b.iter_batched_ref( + || (&mut xs).take(SIZE).collect::>(), + |data| crc::word_reversed_barret_crc(data), + BatchSize::SmallInput + )); } criterion_group!(benches, bench_crc); diff --git a/examples/crc.rs b/examples/crc.rs index f5e3180..a5d3080 100644 --- a/examples/crc.rs +++ b/examples/crc.rs @@ -164,17 +164,32 @@ pub fn small_table_crc(data: &[u8]) -> u32 { /// compile-time. /// pub fn barret_crc(data: &[u8]) -> u32 { - const BARRET_CONSTANT: p64 = { - p64(p128(0x10000000000000000) - .naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64) + // Normally this would be 0x10000000000000000 / __polynomial, but + // we eagerly do one step of division so we avoid needing a 4x wide + // type. We can also drop the highest bit if we add the high bits + // manually we use use this constant. + // + // = x % p + // = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32)) + // = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32)) + // = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32))) + // \-----+-----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u32s, + // leaving 2 xmuls and 2 xors. 
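The reduction derived above can also be written out as a standalone helper, which makes the two xmuls easier to see. A minimal sketch, assuming the input is a 64-bit polynomial split into 32-bit halves, with poly = p32::from_lossy(POLYNOMIAL) and the BARRET_CONSTANT defined just below; barret_reduce is a hypothetical name, not part of this patch:

    // reduce x = (hi << 32) | lo modulo the 33-bit polynomial p
    fn barret_reduce(hi: p32, lo: p32, poly: p32, barret_constant: p32) -> p32 {
        // q = ((hi * C) >> 32) + hi, where widening_mul(..).1 is exactly the ">> 32"
        let q = hi.widening_mul(barret_constant).1 + hi;
        // x % p = lo + low32(q * p); polynomial addition is xor
        lo + q.wrapping_mul(poly)
    }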
+ // + const BARRET_CONSTANT: p32 = { + p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32) }; let mut crc = p32(0xffffffff); for b in data { crc = crc ^ (p32::from(b.reverse_bits()) << 24); - let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32; - crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8); + crc = (crc << 8) + + ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32)) + .wrapping_mul(p32::from_lossy(POLYNOMIAL)); } u32::from(crc).reverse_bits() ^ 0xffffffff @@ -184,9 +199,23 @@ pub fn barret_crc(data: &[u8]) -> u32 { /// barret_crc, but operating on a 32-bit word at a time /// pub fn word_barret_crc(data: &[u8]) -> u32 { - const BARRET_CONSTANT: p64 = { - p64(p128(0x10000000000000000) - .naive_div(p128(POLYNOMIAL.0 as u128)).0 as u64) + // Normally this would be 0x10000000000000000 / __polynomial, but + // we eagerly do one step of division so we avoid needing a 4x wide + // type. We can also drop the highest bit if we add the high bits + // manually we use use this constant. + // + // = x % p + // = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32)) + // = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32)) + // = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32))) + // \-----+-----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u32s, + // leaving 2 xmuls and 2 xors. + // + const BARRET_CONSTANT: p32 = { + p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32) }; let mut crc = p32(0xffffffff); @@ -196,19 +225,118 @@ pub fn word_barret_crc(data: &[u8]) -> u32 { for word in &mut words { let word = <[u8; 4]>::try_from(word).unwrap(); crc = crc ^ p32::from_le_bytes(word).reverse_bits(); - let q = (p64::from(crc)*BARRET_CONSTANT) >> 32; - crc = p32::from_lossy(q*POLYNOMIAL); + crc = (crc.widening_mul(BARRET_CONSTANT).1 + crc) + .wrapping_mul(p32::from_lossy(POLYNOMIAL)); } for b in words.remainder() { crc = crc ^ (p32::from(b.reverse_bits()) << 24); - let q = (p64::from(crc >> 24)*BARRET_CONSTANT) >> 32; - crc = p32::from_lossy(q*POLYNOMIAL) + (crc << 8); + crc = (crc << 8) + + ((crc >> 24u32).widening_mul(BARRET_CONSTANT).1 + (crc >> 24u32)) + .wrapping_mul(p32::from_lossy(POLYNOMIAL)); } u32::from(crc).reverse_bits() ^ 0xffffffff } +/// A hardware-accelerated CRC implementation using Barret reduction without +/// needing to bit-reverse the internal representation +/// +/// CRC32 and polynomial multiplication instructions unfortunately are defined +/// with different bit-endianness. This would normally mean we need to +/// bit-reverse the incoming data before we can use polynomial multiplication. +/// +/// However, polynomial multiplication has the odd property that it is +/// symmetric, brev(a) * brev(b) = brev((a * b) << 1) +/// +/// This means we can rewrite our Barret reduction CRC to operate entirely +/// on a bit-reversed representation, shaving off several instructions. +/// +/// In theory this should be faster, but measurements show this as actually +/// being slightly slower, perhaps the extra 1-bit shift costs more on +/// machines with bit-reverse instructions? +/// +pub fn reversed_barret_crc(data: &[u8]) -> u32 { + // Normally this would be 0x10000000000000000 / __polynomial, but + // we eagerly do one step of division so we avoid needing a 4x wide + // type. We can also drop the highest bit if we add the high bits + // manually we use use this constant. 
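The bit-reversal identity quoted in the doc comment above is easy to sanity-check on the narrower polynomial types. A quick sketch over 8-bit inputs, assuming p8/p16 expose reverse_bits and u32 shift amounts the same way p32 does:

    // brev(a) * brev(b) == brev((a * b) << 1), with a 16-bit product; the shift
    // cannot overflow because the product of two 8-bit polynomials has <= 15 bits
    fn check_brev_identity(a: p8, b: p8) {
        let prod = p16::from(a) * p16::from(b);
        let rev_prod = p16::from(a.reverse_bits()) * p16::from(b.reverse_bits());
        assert_eq!(rev_prod, (prod << 1u32).reverse_bits());
    }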
+ // + // = x % p + // = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32)) + // = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32)) + // = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32))) + // \-----+-----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u32s, + // leaving 2 xmuls and 2 xors. + // + const BARRET_CONSTANT: p32 = { + p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32) + }; + const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits(); + const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits(); + + let mut crc = p32(0xffffffff); + + for b in data { + crc = crc ^ p32::from(*b); + let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV); + let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV); + crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32)); + } + + u32::from(crc) ^ 0xffffffff +} + +/// A hardware-accelerated CRC implementation using the same technique as +/// reversed_barret_crc, but operating on a 32-bit word at a time +/// +pub fn word_reversed_barret_crc(data: &[u8]) -> u32 { + // Normally this would be 0x10000000000000000 / __polynomial, but + // we eagerly do one step of division so we avoid needing a 4x wide + // type. We can also drop the highest bit if we add the high bits + // manually we use use this constant. + // + // = x % p + // = 0xffffffff & (x + p*(((x >> 32) * [0x10000000000000000/p]) >> 32)) + // = 0xffffffff & (x + p*(((x >> 32) * [(p << 32)/p + 0x100000000]) >> 32)) + // = 0xffffffff & (x + p*((((x >> 32) * [(p << 32)/p]) >> 32) + (x >> 32))) + // \-----+-----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u32s, + // leaving 2 xmuls and 2 xors. 
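Since the reversed variants only re-encode the same reduction, they have to agree with the non-reversed functions above; main() below asserts this against a fixed expected value, but the property can also be stated directly (check_reversed_variants is an illustrative name):

    fn check_reversed_variants(data: &[u8]) {
        assert_eq!(reversed_barret_crc(data), barret_crc(data));
        assert_eq!(word_reversed_barret_crc(data), word_barret_crc(data));
    }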
+ // + const BARRET_CONSTANT: p32 = { + p32(p64(POLYNOMIAL.0 << 32).naive_div(POLYNOMIAL).0 as u32) + }; + const POLYNOMIAL_REV: p32 = p32(POLYNOMIAL.0 as u32).reverse_bits(); + const BARRET_CONSTANT_REV: p32 = BARRET_CONSTANT.reverse_bits(); + + let mut crc = p32(0xffffffff); + + // iterate over 4-byte words + let mut words = data.chunks_exact(4); + for word in &mut words { + let word = <[u8; 4]>::try_from(word).unwrap(); + crc = crc ^ p32::from_le_bytes(word); + let (lo, _) = crc.widening_mul(BARRET_CONSTANT_REV); + let (lo, hi) = ((lo << 1u32) + crc).widening_mul(POLYNOMIAL_REV); + crc = (hi << 1u32) | (lo >> 31u32); + } + + for b in words.remainder() { + crc = crc ^ p32::from(*b); + let (lo, _) = (crc << 24u32).widening_mul(BARRET_CONSTANT_REV); + let (lo, hi) = ((lo << 1u32) + (crc << 24u32)).widening_mul(POLYNOMIAL_REV); + crc = (crc >> 8u32) + ((hi << 1u32) | (lo >> 31u32)); + } + + u32::from(crc) ^ 0xffffffff +} + fn main() { let input = b"Hello World!"; @@ -217,31 +345,39 @@ fn main() { println!("testing crc({:?})", String::from_utf8_lossy(input)); let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "naive_crc", output); + println!("{:<24} => 0x{:08x}", "naive_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "less_naive_crc", output); + let output = less_naive_crc(input); + println!("{:<24} => 0x{:08x}", "less_naive_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "word_less_naive_crc", output); + let output = word_less_naive_crc(input); + println!("{:<24} => 0x{:08x}", "word_less_naive_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "table_crc", output); + let output = table_crc(input); + println!("{:<24} => 0x{:08x}", "table_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "small_table_crc", output); + let output = small_table_crc(input); + println!("{:<24} => 0x{:08x}", "small_table_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "barret_crc", output); + let output = barret_crc(input); + println!("{:<24} => 0x{:08x}", "barret_crc", output); assert_eq!(output, expected); - let output = naive_crc(input); - println!("{:<19} => 0x{:08x}", "word_barret_crc", output); + let output = word_barret_crc(input); + println!("{:<24} => 0x{:08x}", "word_barret_crc", output); + assert_eq!(output, expected); + + let output = reversed_barret_crc(input); + println!("{:<24} => 0x{:08x}", "reversed_barret_crc", output); + assert_eq!(output, expected); + + let output = word_reversed_barret_crc(input); + println!("{:<24} => 0x{:08x}", "word_reversed_barret_crc", output); assert_eq!(output, expected); println!(); diff --git a/examples/find-p.rs b/examples/find-p.rs index 7aa71f5..f68d1cc 100644 --- a/examples/find-p.rs +++ b/examples/find-p.rs @@ -86,20 +86,10 @@ pub fn is_generator(g: p128, p: p128) -> bool { // let width = (128-p.leading_zeros()) - 1; - // We're going to do a lot of multiplications, so it helps to precalculate - // Barret's constant for Barret reduction. This trades a modulus operation - // for 2 multiplication, but means we can leverage carry-less multiplication - // hardware instructions. 
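The simplified gfmul that replaces this Barret setup (see the hunk just below) is used by is_generator inside an exponentiation-by-squaring loop. A minimal sketch of that loop, with gfpow as a hypothetical helper name; it assumes width <= 64 so the full carry-less product still fits in a p128:

    // exponentiation via squaring over GF(2^width), using plain (a * b) % p
    fn gfpow(mut g: p128, mut exp: u128, p: p128) -> p128 {
        let gfmul = |a: p128, b: p128| -> p128 { (a * b) % p };
        let mut x = p128(1);
        while exp > 0 {
            if exp & 1 != 0 {
                x = gfmul(x, g);
            }
            g = gfmul(g, g);
            exp >>= 1;
        }
        x
    }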
- // - // normally this is just (1 << (2*width)) / p, but we can precompute - // one step of division to avoid needing a 4x wide type - // - let mask = (1u128 << width) - 1; - let barret_constant = (((mask & p) << width) / p) + (p128(1) << width); + // Multiplication uses carry-less multiplicatio modulo our irreducible + // polynomial let gfmul = |a: p128, b: p128| -> p128 { - let x = a * b; - let q = ((x >> width) * barret_constant) >> width; - mask & ((q * p) + x) + (a * b) % p }; // Exponentiation via squaring diff --git a/gf256-macros/Cargo.toml b/gf256-macros/Cargo.toml index 339a114..463dd70 100644 --- a/gf256-macros/Cargo.toml +++ b/gf256-macros/Cargo.toml @@ -10,11 +10,7 @@ proc-macro = true [features] # See gf256/Cargo.toml for documentation over these features -use-naive-xmul = [] -use-hardware-xmul = [] -use-naive-gfmul = [] -use-table-gfmul = [] -use-barret-gfmul = [] +no-xmul = [] [dependencies] syn = {version="1.0.73", features=["full"]} diff --git a/gf256-macros/src/gf.rs b/gf256-macros/src/gf.rs index cc38e2c..d97cb6a 100644 --- a/gf256-macros/src/gf.rs +++ b/gf256-macros/src/gf.rs @@ -60,12 +60,23 @@ impl __gf { // Generate constant for Barret's reduction if we're // in Barret mode #[cfg(__if(__barret))] - const BARRET_CONSTANT: p16 = { - // normally this would be 0x10000 / __polynomial, but we eagerly - // do one step of division so we avoid needing a 4x wide type + const BARRET_CONSTANT: p8 = { + // Normally this would be 0x10000 / __polynomial, but we eagerly + // do one step of division so we avoid needing a 4x wide type. We + // can also drop the highest bit if we add the high bits manually + // we use use this constant. // - //p16(p32(0x10000).naive_div(p32(__polynomial)).0 as u16) - p16(__polynomial << 8).naive_div(p16(__polynomial)).naive_add(p16(0x100)) + // = x % p + // = 0xff & (x + p*(((x >> 8) * [0x10000/p]) >> 8)) + // = 0xff & (x + p*(((x >> 8) * [(p << 8)/p + 0x100]) >> 8)) + // = 0xff & (x + p*((((x >> 8) * [(p << 8)/p]) >> 8) + (x >> 8))) + // \-----+----/ + // '-- Barret constant + // + // Note that the shifts and masks can go away if we operate on u8s, + // leaving 2 xmuls and 2 xors. + // + p8(p16(__polynomial << 8).naive_div(p16(__polynomial)).0 as u8) }; /// Addition over gf(256), aka xor @@ -136,7 +147,7 @@ impl __gf { /// these are more expensive, but also allowed in const contexts /// #[inline] - pub const fn checked_naive_recip(self) -> Option<__gf> { + pub const fn naive_checked_recip(self) -> Option<__gf> { if self.0 == 0 { return None; } @@ -154,7 +165,7 @@ impl __gf { /// #[inline] pub const fn naive_recip(self) -> __gf { - match self.checked_naive_recip() { + match self.naive_checked_recip() { Some(x) => x, None => __gf(1 / 0), } @@ -163,8 +174,8 @@ impl __gf { /// Naive division over gf(256) /// #[inline] - pub const fn checked_naive_div(self, other: __gf) -> Option<__gf> { - match other.checked_naive_recip() { + pub const fn naive_checked_div(self, other: __gf) -> Option<__gf> { + match other.naive_checked_recip() { Some(other_recip) => Some(self.naive_mul(other_recip)), None => None, } @@ -176,7 +187,7 @@ impl __gf { /// #[inline] pub const fn naive_div(self, other: __gf) -> __gf { - match self.checked_naive_div(other) { + match self.naive_checked_div(other) { Some(x) => x, None => __gf(self.0 / 0), } @@ -218,9 +229,11 @@ impl __gf { // useful here if we have hardware xmul instructions, though // it may be more expensive if xmul is naive. 
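For the renamed helpers above and the Barret-reduced multiply that follows, the observable behaviour has to stay the same regardless of which backend gets selected. A property sketch using the crate's gf256 instantiation of this template (the concrete type name is an assumption, it is not spelled out in this file):

    fn check_gf_mul(a: gf256, b: gf256) {
        // tables, Barret reduction, and the naive path must all agree
        assert_eq!(a * b, a.naive_mul(b));
        // and the renamed const helpers stay consistent with multiplication
        if let Some(r) = b.naive_checked_recip() {
            assert_eq!((a * b).naive_mul(r), a);
        }
    }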
// - let x = p16(self.0 as u16) * p16(other.0 as u16); - let q = (p16::mul(x >> 8, Self::BARRET_CONSTANT) >> 8); - __gf((p16::mul(q, Self::POLYNOMIAL) + x).0 as u8) + let (lo, hi) = p8(self.0).widening_mul(p8(other.0)); + let x = lo + + (hi.widening_mul(Self::BARRET_CONSTANT).1 + hi) + .wrapping_mul(p8(Self::POLYNOMIAL.0 as u8)); + __gf(x.0 as u8) } else { // fallback to naive multiplication over gf(256) self.naive_mul(other) diff --git a/gf256-macros/src/lib.rs b/gf256-macros/src/lib.rs index a2d79ac..1ca44c5 100644 --- a/gf256-macros/src/lib.rs +++ b/gf256-macros/src/lib.rs @@ -27,6 +27,24 @@ fn crate_() -> TokenTree { )) } +fn xmul_query() -> TokenStream { + quote! { + any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) + ) + } +} + fn token_replace( input: TokenStream, replacements: &HashMap @@ -154,7 +172,7 @@ struct PArgs { #[darling(default)] naive: bool, #[darling(default)] - hardware: bool, + xmul: bool, } #[proc_macro_attribute] @@ -165,8 +183,8 @@ pub fn p( let crate_ = crate_(); // parse args - let args = parse_macro_input!(args as syn::AttributeArgs); - let args = match PArgs::from_list(&args) { + let raw_args = parse_macro_input!(args as syn::AttributeArgs); + let args = match PArgs::from_list(&raw_args) { Ok(args) => args, Err(err) => { return err.write_errors().into(); @@ -174,18 +192,20 @@ pub fn p( }; // decide between implementations - let (naive, hardware) = match ( - (args.naive, args.hardware), - (cfg!(feature="use-naive-xmul"), cfg!(feature="use-hardware-xmul")) - ) { - // choose mode if one is explicitly requested - ((true, false), _ ) => (true, false), - ((false, false), (true, false)) => (true, false), - ((false, true,), _ ) => (false, true ), - ((false, false), (false, true )) => (false, true ), - - // default to neither, let the p* implementation make the decision - ((false, false), (false, false)) => (false, false), + let has_xmul = match (args.naive, args.xmul) { + (true, false) => false, + (false, true) => true, + (false, false) => { + // query target configuration and recurse back into our proc_macro + let input = TokenStream::from(input); + let xmul_query = xmul_query(); + let output = quote! { + #[cfg_attr(#xmul_query, #crate_::macros::p(xmul, #(#raw_args),*))] + #[cfg_attr(not(#xmul_query), #crate_::macros::p(naive, #(#raw_args),*))] + #input + }; + return output.into(); + }, // multiple modes selected? _ => panic!("invalid configuration of macro p (naive, hardware?)"), @@ -202,11 +222,11 @@ pub fn p( // let input = TokenStream::from(input); let output = quote! 
{ - #[cfg_attr(target_pointer_width="8", #crate_::macros::p(u="usize", width=8, naive=#naive, hardware=#hardware))] - #[cfg_attr(target_pointer_width="16", #crate_::macros::p(u="usize", width=16, naive=#naive, hardware=#hardware))] - #[cfg_attr(target_pointer_width="32", #crate_::macros::p(u="usize", width=32, naive=#naive, hardware=#hardware))] - #[cfg_attr(target_pointer_width="64", #crate_::macros::p(u="usize", width=64, naive=#naive, hardware=#hardware))] - #[cfg_attr(target_pointer_width="128", #crate_::macros::p(u="usize", width=128, naive=#naive, hardware=#hardware))] + #[cfg_attr(target_pointer_width="8", #crate_::macros::p(width=8, #(#raw_args),*))] + #[cfg_attr(target_pointer_width="16", #crate_::macros::p(width=16, #(#raw_args),*))] + #[cfg_attr(target_pointer_width="32", #crate_::macros::p(width=32, #(#raw_args),*))] + #[cfg_attr(target_pointer_width="64", #crate_::macros::p(width=64, #(#raw_args),*))] + #[cfg_attr(target_pointer_width="128", #crate_::macros::p(width=128, #(#raw_args),*))] #input }; return output.into(); @@ -239,11 +259,11 @@ pub fn p( ("__is_usize".to_owned(), TokenTree::Ident( Ident::new(&format!("{}", args.u == "usize"), Span::call_site()) )), - ("__naive".to_owned(), TokenTree::Ident( - Ident::new(&format!("{}", naive), Span::call_site()) + ("__has_xmul".to_owned(), TokenTree::Ident( + Ident::new(&format!("{}", has_xmul), Span::call_site()) )), - ("__hardware".to_owned(), TokenTree::Ident( - Ident::new(&format!("{}", hardware), Span::call_site()) + ("__xmul".to_owned(), TokenTree::Ident( + Ident::new(&format!("xmul{}", width), Span::call_site()) )), ("__crate".to_owned(), crate_), ]); diff --git a/gf256-macros/src/p.rs b/gf256-macros/src/p.rs index 6cc8d38..b2603cf 100644 --- a/gf256-macros/src/p.rs +++ b/gf256-macros/src/p.rs @@ -43,6 +43,31 @@ impl __p { __p(self.0 ^ other.0) } + /// Naive polynomial multiplication + /// + /// Naive versions are built out of simple bitwise operations, + /// these are more expensive, but also allowed in const contexts + /// + /// This return a tuple containing the low and high parts in that order + /// + #[inline] + pub const fn naive_widening_mul(self, other: __p) -> (__p, __p) { + let a = self.0; + let b = other.0; + let mut lo = 0; + let mut hi = 0; + let mut i = 0; + while i < __width { + let mask = (((a as __i) << (__width-1-i)) >> (__width-1)) as __u; + lo ^= mask & (b << i); + hi ^= mask & (b >> (__width-1-i)); + i += 1; + } + // note we adjust hi by one here, otherwise we'd need to handle + // shifting > word size + (__p(lo), __p(hi >> 1)) + } + /// Naive polynomial multiplication /// /// Naive versions are built out of simple bitwise operations, @@ -52,11 +77,9 @@ impl __p { /// a flag indicating of overflow occured /// #[inline] - pub const fn overflowing_naive_mul(self, other: __p) -> (__p, bool) { - // x bits * y bits = x+y-1 bits, if this is more bits than the - // width we will overflow - let o = self.0.leading_zeros() + other.0.leading_zeros() < __width-1; - (self.wrapping_naive_mul(other), o) + pub const fn naive_overflowing_mul(self, other: __p) -> (__p, bool) { + let (lo, hi) = self.naive_widening_mul(other); + (lo, hi.0 != 0) } /// Naive polynomial multiplication @@ -67,8 +90,8 @@ impl __p { /// Note this returns None if an overflow occured /// #[inline] - pub const fn checked_naive_mul(self, other: __p) -> Option<__p> { - match self.overflowing_naive_mul(other) { + pub const fn naive_checked_mul(self, other: __p) -> Option<__p> { + match self.naive_overflowing_mul(other) { (_, true ) => None, (x, false) 
=> Some(x), } @@ -82,7 +105,7 @@ impl __p { /// Note this wraps around the boundary of the type /// #[inline] - pub const fn wrapping_naive_mul(self, other: __p) -> __p { + pub const fn naive_wrapping_mul(self, other: __p) -> __p { let a = self.0; let b = other.0; let mut x = 0; @@ -108,12 +131,34 @@ impl __p { cfg_if! { // TODO feature flag for overflow-checks? if #[cfg(debug_assertions)] { - match self.checked_naive_mul(other) { + match self.naive_checked_mul(other) { Some(x) => x, None => __p(self.0 / 0), } } else { - self.wrapping_naive_mul(other) + self.naive_wrapping_mul(other) + } + } + } + + /// Naive polynomial multiplication + /// + /// This attempts to use carry-less multiplication + /// instructions when available (pclmulqdq on x86_64, + /// pmull on aarch64), otherwise falls back to the expensive + /// naive implementation + /// + /// This return a tuple containing the low and high parts in that order + /// + #[inline] + pub fn widening_mul(self, other: __p) -> (__p, __p) { + cfg_if! { + if #[cfg(__if(__has_xmul))] { + use __crate::internal::xmul::*; + let (lo, hi) = __xmul(self.0 as _, other.0 as _); + (__p(lo as __u), __p(hi as __u)) + } else { + self.naive_widening_mul(other) } } } @@ -130,10 +175,8 @@ impl __p { /// #[inline] pub fn overflowing_mul(self, other: __p) -> (__p, bool) { - // x bits * y bits = x+y-1 bits, if this is more bits than the - // width we will overflow - let o = self.0.leading_zeros() + other.0.leading_zeros() < __width-1; - (self.wrapping_mul(other), o) + let (lo, hi) = self.widening_mul(other); + (lo, hi.0 != 0) } /// Polynomial multiplication @@ -165,37 +208,11 @@ impl __p { #[inline] pub fn wrapping_mul(self, other: __p) -> __p { cfg_if! { - if #[cfg(all( - __if(!__naive), - target_arch="x86_64", - target_feature="pclmulqdq" - ))] { + if #[cfg(__if(__has_xmul))] { use __crate::internal::xmul::*; - - cfg_if! { - if #[cfg(__if(__width <= 64))] { - __p(__pclmulqdq_u64(self.0 as u64, other.0 as u64) as __u) - } else { - __p(__pclmulqdq_u128(self.0, other.0)) - } - } - } else if #[cfg(all( - __if(!__naive), - feature="use-nightly-features", - target_arch="aarch64", - target_feature="neon" - ))] { - use __crate::internal::xmul::*; - - cfg_if! 
{ - if #[cfg(__if(__width <= 64))] { - __p(__pmull_u64(self.0 as u64, other.0 as u64) as __u) - } else { - __p(__pmull_u128(self.0, other.0)) - } - } - } else if #[cfg(__if(!__hardware))] { - self.wrapping_naive_mul(other) + __p(__xmul(self.0 as _, other.0 as _).0 as __u) + } else { + self.naive_wrapping_mul(other) } } } @@ -225,14 +242,14 @@ impl __p { /// Naive polynomial exponentiation #[inline] - pub const fn overflowing_naive_pow(self, exp: u32) -> (__p, bool) { + pub const fn naive_overflowing_pow(self, exp: u32) -> (__p, bool) { let mut a = self; let mut exp = exp; let mut x = __p(1); let mut o = false; loop { if exp & 1 != 0 { - let (x_, o_) = x.overflowing_naive_mul(a); + let (x_, o_) = x.naive_overflowing_mul(a); x = x_; o = o || o_; } @@ -241,7 +258,7 @@ impl __p { if exp == 0 { return (x, o); } - let (a_, o_) = a.overflowing_naive_mul(a); + let (a_, o_) = a.naive_overflowing_mul(a); a = a_; o = o || o_; } @@ -249,13 +266,13 @@ impl __p { /// Naive polynomial exponentiation #[inline] - pub const fn checked_naive_pow(self, exp: u32) -> Option<__p> { + pub const fn naive_checked_pow(self, exp: u32) -> Option<__p> { let mut a = self; let mut exp = exp; let mut x = __p(1); loop { if exp & 1 != 0 { - x = match x.checked_naive_mul(a) { + x = match x.naive_checked_mul(a) { Some(x) => x, None => return None, } @@ -265,7 +282,7 @@ impl __p { if exp == 0 { return Some(x); } - a = match a.checked_naive_mul(a) { + a = match a.naive_checked_mul(a) { Some(a) => a, None => return None, } @@ -274,20 +291,20 @@ impl __p { /// Naive polynomial exponentiation #[inline] - pub const fn wrapping_naive_pow(self, exp: u32) -> __p { + pub const fn naive_wrapping_pow(self, exp: u32) -> __p { let mut a = self; let mut exp = exp; let mut x = __p(1); loop { if exp & 1 != 0 { - x = x.wrapping_naive_mul(a); + x = x.naive_wrapping_mul(a); } exp >>= 1; if exp == 0 { return x; } - a = a.wrapping_naive_mul(a); + a = a.naive_wrapping_mul(a); } } @@ -403,7 +420,7 @@ impl __p { /// these are more expensive, but also allowed in const contexts /// #[inline] - pub const fn checked_naive_div(self, other: __p) -> Option<__p> { + pub const fn naive_checked_div(self, other: __p) -> Option<__p> { if other.0 == 0 { None } else { @@ -428,7 +445,7 @@ impl __p { /// #[inline] pub const fn naive_div(self, other: __p) -> __p { - match self.checked_naive_div(other) { + match self.naive_checked_div(other) { Some(x) => x, None => __p(self.0 / 0), } @@ -440,7 +457,7 @@ impl __p { /// these are more expensive, but also allowed in const contexts /// #[inline] - pub const fn checked_naive_rem(self, other: __p) -> Option<__p> { + pub const fn naive_checked_rem(self, other: __p) -> Option<__p> { if other.0 == 0 { None } else { @@ -464,7 +481,7 @@ impl __p { /// #[inline] pub const fn naive_rem(self, other: __p) -> __p { - match self.checked_naive_rem(other) { + match self.naive_checked_rem(other) { Some(x) => x, None => __p(self.0 / 0), } diff --git a/src/lib.rs b/src/lib.rs index 14aee41..668b398 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ // Enable stdsimd for pmull on aarch64 #![cfg_attr( - all(feature="use-nightly-features", target_arch="aarch64"), + all(not(feature="no-xmul"), feature="nightly", target_arch="aarch64"), feature(stdsimd) )] @@ -26,3 +26,6 @@ pub mod internal { pub use cfg_if; pub mod xmul; } + +pub use internal::xmul::HAS_XMUL; + diff --git a/src/p.rs b/src/p.rs index a0a8c15..903bb7b 100644 --- a/src/p.rs +++ b/src/p.rs @@ -35,11 +35,11 @@ mod test { #[test] fn mul() { - 
assert_eq!(p8(0xfe).wrapping_naive_mul(p8(0x87)), p8(0xfa)); - assert_eq!(p16(0xfedc).wrapping_naive_mul(p16(0x8765)), p16(0x7d2c)); - assert_eq!(p32(0xfedcba98).wrapping_naive_mul(p32(0x87654321)), p32(0x03da4198)); - assert_eq!(p64(0xfedcba9876543210).wrapping_naive_mul(p64(0x8765432100000000)), p64(0x0050401000000000)); - assert_eq!(p128(0xfedcba98765432100000000000000000).wrapping_naive_mul(p128(0x87654321000000000000000000000000)), p128(0x00000000000000000000000000000000)); + assert_eq!(p8(0xfe).naive_wrapping_mul(p8(0x87)), p8(0xfa)); + assert_eq!(p16(0xfedc).naive_wrapping_mul(p16(0x8765)), p16(0x7d2c)); + assert_eq!(p32(0xfedcba98).naive_wrapping_mul(p32(0x87654321)), p32(0x03da4198)); + assert_eq!(p64(0xfedcba9876543210).naive_wrapping_mul(p64(0x8765432100000000)), p64(0x0050401000000000)); + assert_eq!(p128(0xfedcba98765432100000000000000000).naive_wrapping_mul(p128(0x87654321000000000000000000000000)), p128(0x00000000000000000000000000000000)); assert_eq!(p8(0xfe).wrapping_mul(p8(0x87)), p8(0xfa)); assert_eq!(p16(0xfedc).wrapping_mul(p16(0x8765)), p16(0x7d2c)); @@ -79,12 +79,12 @@ mod test { } #[test] - fn hardware_mul() { + fn naive_mul() { for a in (0..=255).map(p8) { for b in (0..=255).map(p8) { - let res_naive = a.wrapping_naive_mul(b); - let res_hardware = a.wrapping_mul(b); - assert_eq!(res_naive, res_hardware); + let naive_res = a.naive_wrapping_mul(b); + let res_xmul = a.wrapping_mul(b); + assert_eq!(naive_res, res_xmul); } } } @@ -93,23 +93,48 @@ mod test { fn overflowing_mul() { for a in (0..=255).map(p8) { for b in (0..=255).map(p8) { - let (wrapped_naive, overflow_naive) = a.overflowing_naive_mul(b); - let (wrapped_hardware, overflow_hardware) = a.overflowing_mul(b); - let res_naive = p16::from(a).naive_mul(p16::from(b)); - let res_hardware = p16::from(a) * p16::from(b); + let (naive_wrapped, naive_overflow) = a.naive_overflowing_mul(b); + let (wrapped_xmul, overflow_xmul) = a.overflowing_mul(b); + let naive_res = p16::from(a).naive_mul(p16::from(b)); + let res_xmul = p16::from(a) * p16::from(b); - // same results naive vs hardware? - assert_eq!(wrapped_naive, wrapped_hardware); - assert_eq!(overflow_naive, overflow_hardware); - assert_eq!(res_naive, res_hardware); + // same results naive vs xmul? + assert_eq!(naive_wrapped, wrapped_xmul); + assert_eq!(naive_overflow, overflow_xmul); + assert_eq!(naive_res, res_xmul); // same wrapped results? - assert_eq!(wrapped_naive, p8::try_from(res_naive & 0xff).unwrap()); - assert_eq!(wrapped_hardware, p8::try_from(res_hardware & 0xff).unwrap()); + assert_eq!(naive_wrapped, p8::try_from(naive_res & 0xff).unwrap()); + assert_eq!(wrapped_xmul, p8::try_from(res_xmul & 0xff).unwrap()); // overflow set if overflow occured? - assert_eq!(overflow_naive, (p16::from(wrapped_naive) != res_naive)); - assert_eq!(overflow_hardware, (p16::from(wrapped_hardware) != res_hardware)); + assert_eq!(naive_overflow, (p16::from(naive_wrapped) != naive_res)); + assert_eq!(overflow_xmul, (p16::from(wrapped_xmul) != res_xmul)); + } + } + } + + #[test] + fn widening_mul() { + for a in (0..=255).map(p8) { + for b in (0..=255).map(p8) { + let (naive_lo, naive_hi) = a.naive_widening_mul(b); + let (lo_xmul, hi_xmul ) = a.widening_mul(b); + let naive_res = p16::from(a).naive_mul(p16::from(b)); + let res_xmul = p16::from(a) * p16::from(b); + + // same results naive vs xmul? + assert_eq!(naive_lo, lo_xmul); + assert_eq!(naive_hi, hi_xmul); + assert_eq!(naive_res, res_xmul); + + // same lo results? 
+ assert_eq!(naive_lo, p8::try_from(naive_res & 0xff).unwrap()); + assert_eq!(lo_xmul, p8::try_from(res_xmul & 0xff).unwrap()); + + // same hi results? + assert_eq!(naive_hi, p8::try_from(naive_res >> 8).unwrap()); + assert_eq!(hi_xmul, p8::try_from(res_xmul >> 8).unwrap()); } } } diff --git a/src/xmul.rs b/src/xmul.rs index 36654d3..072a0dd 100644 --- a/src/xmul.rs +++ b/src/xmul.rs @@ -6,108 +6,337 @@ //! features unless the feature is enabled with #[feature!] at the crate //! level. //! -//! These functions may or may not exist depending on what target_features -//! are available, so they shouldn't be used directly. +//! These functions are intended to only be used by gf256's proc_macros, +//! these funcitons may or may not be available depending on target_features, +//! and may change behavior, so they shouldn't be used directly. //! +use cfg_if::cfg_if; -/// x86_64 provides 64-bit xmul via the pclmulqdq instruction -#[cfg(all( - target_arch="x86_64", - target_feature="pclmulqdq" + +/// True if carry-less multiplication instructions are available +/// +/// If this is false, any carry-less multiplication operations +/// will use a more expensive bitwise implementation. +/// +/// Some algorithms trade expensive division/remainder operations for +/// multiple multiplication operations, but this can backfire if +/// multiplication is also expensive. This flag allows algorithms +/// to choose the best strategy based on what's available. +/// +pub const HAS_XMUL: bool = { + cfg_if! { + if #[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) + ))] { + true + } else { + false + } + } +}; + + +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pclmulqdq(a: u64, b: u64) -> u128 { - use core::arch::x86_64::*; - unsafe { - let a = _mm_set_epi64x(0, a as i64); - let b = _mm_set_epi64x(0, b as i64); - let x = _mm_clmulepi64_si128::<0>(a, b); - let x0 = _mm_extract_epi64::<0>(x) as u64; - let x1 = _mm_extract_epi64::<1>(x) as u64; - ((x1 as u128) << 64) | (x0 as u128) +pub fn xmul8(a: u8, b: u8) -> (u8, u8) { + cfg_if! 
{ + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x(0, a as i64); + let b = _mm_set_epi64x(0, b as i64); + let x = _mm_clmulepi64_si128::<0>(a, b); + let lo = _mm_extract_epi64::<0>(x) as u64; + (lo as u8, (lo >> 8) as u8) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + (x as u8, (x >> 8) as u8) + } + } } } -/// x86_64 provides 64-bit xmul via the pclmulqdq instruction -#[cfg(all( - target_arch="x86_64", - target_feature="pclmulqdq" +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pclmulqdq_u64(a: u64, b: u64) -> u64 { - use core::arch::x86_64::*; - unsafe { - let a = _mm_set_epi64x(0, a as i64); - let b = _mm_set_epi64x(0, b as i64); - let x = _mm_clmulepi64_si128::<0>(a, b); - _mm_extract_epi64::<0>(x) as u64 +pub fn xmul16(a: u16, b: u16) -> (u16, u16) { + cfg_if! { + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x(0, a as i64); + let b = _mm_set_epi64x(0, b as i64); + let x = _mm_clmulepi64_si128::<0>(a, b); + let lo = _mm_extract_epi64::<0>(x) as u64; + (lo as u16, (lo >> 16) as u16) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + (x as u16, (x >> 16) as u16) + } + } } } -/// x86_64 provides 64-bit xmul via the pclmulqdq instruction -#[cfg(all( - target_arch="x86_64", - target_feature="pclmulqdq" +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pclmulqdq_u128(a: u128, b: u128) -> u128 { - use core::arch::x86_64::*; - unsafe { - let a = _mm_set_epi64x((a >> 64) as i64, a as i64); - let b = _mm_set_epi64x((b >> 64) as i64, b as i64); - let x = _mm_clmulepi64_si128::<0x00>(a, b); - let y = _mm_clmulepi64_si128::<0x01>(a, b); - let z = _mm_clmulepi64_si128::<0x10>(a, b); - let x0 = _mm_extract_epi64::<0>(x) as u64; - let x1 = (_mm_extract_epi64::<1>(x) as u64) - ^ (_mm_extract_epi64::<0>(y) as u64) - ^ (_mm_extract_epi64::<0>(z) as u64); - ((x1 as u128) << 64) | (x0 as u128) +pub fn xmul32(a: u32, b: u32) -> (u32, u32) { + cfg_if! 
{ + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x(0, a as i64); + let b = _mm_set_epi64x(0, b as i64); + let x = _mm_clmulepi64_si128::<0>(a, b); + let lo = _mm_extract_epi64::<0>(x) as u64; + (lo as u32, (lo >> 32) as u32) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + (x as u32, (x >> 32) as u32) + } + } } } -/// aarch64 provides 64-bit xmul via the pmull instruction -#[cfg(all( - feature="use-nightly-features", - target_arch="aarch64", - target_feature="neon" +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pmull(a: u64, b: u64) -> u128 { - use core::arch::aarch64::*; - unsafe { - vmull_p64(a, b) +pub fn xmul64(a: u64, b: u64) -> (u64, u64) { + cfg_if! { + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x(0, a as i64); + let b = _mm_set_epi64x(0, b as i64); + let x = _mm_clmulepi64_si128::<0>(a, b); + let lo = _mm_extract_epi64::<0>(x) as u64; + let hi = _mm_extract_epi64::<1>(x) as u64; + (lo, hi) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + (x as u64, (x >> 64) as u64) + } + } } } -/// aarch64 provides 64-bit xmul via the pmull instruction -#[cfg(all( - feature="use-nightly-features", - target_arch="aarch64", - target_feature="neon" +/// Widening carry-less multiplication, if hardware instructions are available +/// +/// Result is a tuple (lo, hi) +/// +#[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) ))] #[inline] -pub fn __pmull_u64(a: u64, b: u64) -> u64 { - use core::arch::aarch64::*; - unsafe { - vmull_p64(a, b) as u64 +pub fn xmul128(a: u128, b: u128) -> (u128, u128) { + cfg_if! 
{ + if #[cfg(all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ))] { + // x86_64 provides 64-bit xmul via the pclmulqdq instruction + use core::arch::x86_64::*; + unsafe { + let a = _mm_set_epi64x((a >> 64) as i64, a as i64); + let b = _mm_set_epi64x((b >> 64) as i64, b as i64); + let x = _mm_clmulepi64_si128::<0x00>(a, b); + let y = _mm_clmulepi64_si128::<0x01>(a, b); + let z = _mm_clmulepi64_si128::<0x10>(a, b); + let w = _mm_clmulepi64_si128::<0x11>(a, b); + let lolo = _mm_extract_epi64::<0>(x) as u64; + let lohi = (_mm_extract_epi64::<1>(x) as u64) + ^ (_mm_extract_epi64::<0>(y) as u64) + ^ (_mm_extract_epi64::<0>(z) as u64); + let hilo = (_mm_extract_epi64::<0>(w) as u64) + ^ (_mm_extract_epi64::<1>(y) as u64) + ^ (_mm_extract_epi64::<1>(z) as u64); + let hihi = _mm_extract_epi64::<1>(w) as u64; + let lo = ((lohi as u128) << 64) | (lolo as u128); + let hi = ((hihi as u128) << 64) | (hilo as u128); + (lo, hi) + } + } else if #[cfg(all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ))] { + // aarch64 provides 64-bit xmul via the pmull instruction + use core::arch::aarch64::*; + unsafe { + let x = vmull_p64(a as u64, b as u64); + let y = vmull_p64((a >> 64) as u64, (b >> 0) as u64); + let z = vmull_p64((a >> 0) as u64, (b >> 64) as u64); + let w = vmull_p64((a >> 64) as u64, (b >> 64) as u64); + (x ^ (y << 64) ^ (z << 64), w ^ (y >> 64) ^ (z >> 64)) + } + } } } -/// aarch64 provides 64-bit xmul via the pmull instruction -#[cfg(all( - feature="use-nightly-features", - target_arch="aarch64", - target_feature="neon" -))] -#[inline] -pub fn __pmull_u128(a: u128, b: u128) -> u128 { - use core::arch::aarch64::*; - unsafe { - let x = vmull_p64(a as u64, b as u64); - let y = vmull_p64((a >> 64) as u64, (b >> 0) as u64) << 64; - let z = vmull_p64((a >> 0) as u64, (b >> 64) as u64) << 64; - x ^ y ^ z + +#[cfg(test)] +mod test { + use super::*; + + #[cfg(any( + all( + not(feature="no-xmul"), + target_arch="x86_64", + target_feature="pclmulqdq" + ), + all( + not(feature="no-xmul"), + feature="nightly", + target_arch="aarch64", + target_feature="neon" + ) + ))] + #[test] + fn xmul() { + assert_eq!(xmul8(0x12, 0x12), (0x04, 0x01)); + assert_eq!(xmul16(0x1234, 0x1234), (0x0510, 0x0104)); + assert_eq!(xmul32(0x12345678, 0x12345678), (0x11141540, 0x01040510)); + assert_eq!(xmul64(0x123456789abcdef1, 0x123456789abcdef1), (0x4144455051545501, 0x0104051011141540)); + assert_eq!(xmul128(0x123456789abcdef123456789abcdef12, 0x123456789abcdef123456789abcdef12), (0x04051011141540414445505154550104, 0x01040510111415404144455051545501)); } }
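With xmul exposed this way, the re-exported gf256::HAS_XMUL flag is what lets callers make the strategy choice described in its doc comment. A short sketch combining it with the example CRCs from examples/crc.rs (crc32_auto is an illustrative name, not part of this patch):

    fn crc32_auto(data: &[u8]) -> u32 {
        if gf256::HAS_XMUL {
            // Barret reduction costs 2 xmuls per word but needs no tables,
            // which only pays off when xmul is a real instruction
            word_barret_crc(data)
        } else {
            // otherwise the precomputed-table implementation is the better trade
            table_crc(data)
        }
    }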