From 080f6d3d6cf9bf97b524ed475218540df1b92ebf Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Tue, 13 Oct 2020 00:06:27 -0500 Subject: [PATCH 1/8] Remove `aapcs_on_arm` mistake --- src/int/mul.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/int/mul.rs b/src/int/mul.rs index 42f13913..1e32560c 100644 --- a/src/int/mul.rs +++ b/src/int/mul.rs @@ -90,7 +90,6 @@ intrinsics! { a.mul(b) } - #[aapcs_on_arm] pub extern "C" fn __multi3(a: i128, b: i128) -> i128 { a.mul(b) } From b1a7a00e488c0c27eac0fa510238121bde5a1298 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Mon, 12 Oct 2020 17:58:02 -0500 Subject: [PATCH 2/8] Introduce the `DInt` and `HInt` traits and add various methods that will be used for improved fuzzing --- src/int/mod.rs | 203 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 202 insertions(+), 1 deletion(-) diff --git a/src/int/mod.rs b/src/int/mod.rs index 5e695d5f..da2263f6 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -12,13 +12,15 @@ pub mod udiv; pub use self::leading_zeros::__clzsi2; /// Trait for some basic operations on integers -pub(crate) trait Int: +#[doc(hidden)] +pub trait Int: Copy + PartialEq + PartialOrd + ops::AddAssign + ops::BitAndAssign + ops::BitOrAssign + + ops::BitXorAssign + ops::ShlAssign + ops::ShrAssign + ops::Add @@ -41,6 +43,14 @@ pub(crate) trait Int: const ZERO: Self; const ONE: Self; + const MIN: Self; + + /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing + /// in `testcrate`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,111, + /// 112,119,120,125,126,127]. + const FUZZ_LENGTHS: [u8; 20]; + /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128. + const FUZZ_NUM: usize; /// Extracts the sign from self and returns a tuple. /// @@ -59,17 +69,25 @@ pub(crate) trait Int: fn from_bool(b: bool) -> Self; + /// Prevents the need for excessive conversions between signed and unsigned + fn logical_shr(self, other: u32) -> Self; + // copied from primitive integers, but put in a trait + fn is_zero(self) -> bool; fn max_value() -> Self; fn min_value() -> Self; + fn wrapping_neg(self) -> Self; fn wrapping_add(self, other: Self) -> Self; fn wrapping_mul(self, other: Self) -> Self; fn wrapping_sub(self, other: Self) -> Self; fn wrapping_shl(self, other: u32) -> Self; + fn wrapping_shr(self, other: u32) -> Self; + fn rotate_left(self, other: u32) -> Self; fn overflowing_add(self, other: Self) -> (Self, bool); fn aborting_div(self, other: Self) -> Self; fn aborting_rem(self, other: Self) -> Self; fn leading_zeros(self) -> u32; + fn count_ones(self) -> u32; } fn unwrap(t: Option) -> T { @@ -85,11 +103,78 @@ macro_rules! int_impl_common { const ZERO: Self = 0; const ONE: Self = 1; + const MIN: Self = ::MIN; + + const FUZZ_LENGTHS: [u8; 20] = { + let bits = ::BITS; + let mut v = [0u8; 20]; + v[0] = 0; + v[1] = 1; + v[2] = 2; // important for parity and the iX::MIN case when reversed + let mut i = 3; + // No need for any more until the byte boundary, because there should be no algorithms + // that are sensitive to anything not next to byte boundaries after 2. We also scale + // in powers of two, which is important to prevent u128 corner tests from getting too + // big. 
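+            // For u128, the loop below records both sides of each power-of-two boundary
+            // (7,8, 15,16, 31,32); the lower middle boundary (63) and the mirrored lengths
+            // up to 127 are appended afterwards.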
+ let mut l = 8; + loop { + if l >= ((bits / 2) as u8) { + break; + } + // get both sides of the byte boundary + v[i] = l - 1; + i += 1; + v[i] = l; + i += 1; + l *= 2; + } + + if bits != 8 { + // add the lower side of the middle boundary + v[i] = ((bits / 2) - 1) as u8; + i += 1; + } + + // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS + // boundary because of algorithms that split the high part up. We reverse the scaling + // as we go to Self::BITS. + let mid = i; + let mut j = 1; + loop { + v[i] = (bits as u8) - (v[mid - j]) - 1; + if j == mid { + break; + } + i += 1; + j += 1; + } + v + }; + + const FUZZ_NUM: usize = { + let log2 = (::BITS - 1).count_ones() as usize; + if log2 == 3 { + // case for u8 + 6 + } else { + // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate + // boundaries. + 8 + (4 * (log2 - 4)) + } + }; fn from_bool(b: bool) -> Self { b as $ty } + fn logical_shr(self, other: u32) -> Self { + Self::from_unsigned(self.unsigned().wrapping_shr(other)) + } + + fn is_zero(self) -> bool { + self == Self::ZERO + } + fn max_value() -> Self { ::max_value() } @@ -98,6 +183,10 @@ macro_rules! int_impl_common { ::min_value() } + fn wrapping_neg(self) -> Self { + ::wrapping_neg(self) + } + fn wrapping_add(self, other: Self) -> Self { ::wrapping_add(self, other) } @@ -114,6 +203,14 @@ macro_rules! int_impl_common { ::wrapping_shl(self, other) } + fn wrapping_shr(self, other: u32) -> Self { + ::wrapping_shr(self, other) + } + + fn rotate_left(self, other: u32) -> Self { + ::rotate_left(self, other) + } + fn overflowing_add(self, other: Self) -> (Self, bool) { ::overflowing_add(self, other) } @@ -129,6 +226,10 @@ macro_rules! int_impl_common { fn leading_zeros(self) -> u32 { ::leading_zeros(self) } + + fn count_ones(self) -> u32 { + ::count_ones(self) + } }; } @@ -178,11 +279,111 @@ macro_rules! int_impl { }; } +int_impl!(isize, usize, usize::MAX.count_ones()); +int_impl!(i8, u8, 8); int_impl!(i16, u16, 16); int_impl!(i32, u32, 32); int_impl!(i64, u64, 64); int_impl!(i128, u128, 128); +/// Trait for integers twice the bit width of another integer. This is implemented for all +/// primitives except for `u8`, because there is not a smaller primitive. +#[doc(hidden)] +pub trait DInt: Int { + /// Integer that is half the bit width of the integer this trait is implemented for + type H: HInt + Int; + + /// Returns the low half of `self` + fn lo(self) -> Self::H; + /// Returns the high half of `self` + fn hi(self) -> Self::H; + /// Returns the low and high halves of `self` as a tuple + fn lo_hi(self) -> (Self::H, Self::H); + /// Constructs an integer using lower and higher half parts + fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self; +} + +/// Trait for integers half the bit width of another integer. This is implemented for all +/// primitives except for `u128`, because it there is not a larger primitive. +#[doc(hidden)] +pub trait HInt: Int { + /// Integer that is double the bit width of the integer this trait is implemented for + type D: DInt + Int; + + /// Widens (using default extension) the integer to have double bit width + fn widen(self) -> Self::D; + /// Widens (zero extension only) the integer to have double bit width. 
This is needed to get + /// around problems with associated type bounds (such as `Int`) being unstable + fn zero_widen(self) -> Self::D; + /// Widens the integer to have double bit width and shifts the integer into the higher bits + fn widen_hi(self) -> Self::D; + /// Widening multiplication with zero widening. This cannot overflow. + fn zero_widen_mul(self, rhs: Self) -> Self::D; + /// Widening multiplication. This cannot overflow. + fn widen_mul(self, rhs: Self) -> Self::D; +} + +macro_rules! impl_d_int { + ($($X:ident $D:ident),*) => { + $( + impl DInt for $D { + type H = $X; + + fn lo(self) -> Self::H { + self as $X + } + fn hi(self) -> Self::H { + (self >> <$X as Int>::BITS) as $X + } + fn lo_hi(self) -> (Self::H, Self::H) { + (self.lo(), self.hi()) + } + fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self { + lo.zero_widen() | hi.widen_hi() + } + } + )* + }; +} + +macro_rules! impl_h_int { + ($($H:ident $uH:ident $X:ident),*) => { + $( + impl HInt for $H { + type D = $X; + + fn widen(self) -> Self::D { + self as $X + } + fn zero_widen(self) -> Self::D { + (self as $uH) as $X + } + fn widen_hi(self) -> Self::D { + (self as $X) << <$H as Int>::BITS + } + fn zero_widen_mul(self, rhs: Self) -> Self::D { + self.zero_widen().wrapping_mul(rhs.zero_widen()) + } + fn widen_mul(self, rhs: Self) -> Self::D { + self.widen().wrapping_mul(rhs.widen()) + } + } + )* + }; +} + +impl_d_int!(u8 u16, u16 u32, u32 u64, u64 u128, i8 i16, i16 i32, i32 i64, i64 i128); +impl_h_int!( + u8 u8 u16, + u16 u16 u32, + u32 u32 u64, + u64 u64 u128, + i8 u8 i16, + i16 u16 i32, + i32 u32 i64, + i64 u64 i128 +); + /// Trait to convert an integer to/from smaller parts pub(crate) trait LargeInt: Int { type LowHalf: Int; From d1960ecb0cd698e28812899cb8461d80f4898b03 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Mon, 12 Oct 2020 17:59:34 -0500 Subject: [PATCH 3/8] Overhaul overflowing multiplication impls --- src/float/conv.rs | 6 +-- src/int/mul.rs | 123 +++++++++++++++++++++++++--------------------- 2 files changed, 71 insertions(+), 58 deletions(-) diff --git a/src/float/conv.rs b/src/float/conv.rs index 8a0fc6cb..e9ca0f13 100644 --- a/src/float/conv.rs +++ b/src/float/conv.rs @@ -11,7 +11,7 @@ macro_rules! int_to_float { let mant_dig = <$fty>::SIGNIFICAND_BITS + 1; let exponent_bias = <$fty>::EXPONENT_BIAS; - let n = <$ity>::BITS; + let n = <$ity as Int>::BITS; let (s, a) = i.extract_sign(); let mut a = a; @@ -21,7 +21,7 @@ macro_rules! int_to_float { // exponent let mut e = sd - 1; - if <$ity>::BITS < mant_dig { + if <$ity as Int>::BITS < mant_dig { return <$fty>::from_parts( s, (e + exponent_bias) as <$fty as Float>::Int, @@ -165,7 +165,7 @@ macro_rules! 
float_to_int { let f = $f; let fixint_min = <$ity>::min_value(); let fixint_max = <$ity>::max_value(); - let fixint_bits = <$ity>::BITS as usize; + let fixint_bits = <$ity as Int>::BITS as usize; let fixint_unsigned = fixint_min == 0; let sign_bit = <$fty>::SIGN_MASK; diff --git a/src/int/mul.rs b/src/int/mul.rs index 1e32560c..e5c0afc1 100644 --- a/src/int/mul.rs +++ b/src/int/mul.rs @@ -1,7 +1,5 @@ -use core::ops; - -use int::Int; use int::LargeInt; +use int::{DInt, HInt, Int}; trait Mul: LargeInt { fn mul(self, other: Self) -> Self { @@ -29,59 +27,72 @@ trait Mul: LargeInt { impl Mul for u64 {} impl Mul for i128 {} -trait Mulo: Int + ops::Neg { - fn mulo(self, other: Self, overflow: &mut i32) -> Self { - *overflow = 0; - let result = self.wrapping_mul(other); - if self == Self::min_value() { - if other != Self::ZERO && other != Self::ONE { - *overflow = 1; +pub(crate) trait UMulo: Int + DInt { + fn mulo(self, rhs: Self) -> (Self, bool) { + match (self.hi().is_zero(), rhs.hi().is_zero()) { + // overflow is guaranteed + (false, false) => (self.wrapping_mul(rhs), true), + (true, false) => { + let mul_lo = self.lo().widen_mul(rhs.lo()); + let mul_hi = self.lo().widen_mul(rhs.hi()); + let (mul, o) = mul_lo.overflowing_add(mul_hi.lo().widen_hi()); + (mul, o || !mul_hi.hi().is_zero()) } - return result; - } - if other == Self::min_value() { - if self != Self::ZERO && self != Self::ONE { - *overflow = 1; + (false, true) => { + let mul_lo = rhs.lo().widen_mul(self.lo()); + let mul_hi = rhs.lo().widen_mul(self.hi()); + let (mul, o) = mul_lo.overflowing_add(mul_hi.lo().widen_hi()); + (mul, o || !mul_hi.hi().is_zero()) } - return result; + // overflow is guaranteed to not happen, and use a smaller widening multiplication + (true, true) => (self.lo().widen_mul(rhs.lo()), false), } + } +} - let sa = self >> (Self::BITS - 1); - let abs_a = (self ^ sa) - sa; - let sb = other >> (Self::BITS - 1); - let abs_b = (other ^ sb) - sb; - let two = Self::ONE + Self::ONE; - if abs_a < two || abs_b < two { - return result; - } - if sa == sb { - if abs_a > Self::max_value().aborting_div(abs_b) { - *overflow = 1; +impl UMulo for u32 {} +impl UMulo for u64 {} +impl UMulo for u128 {} + +macro_rules! impl_signed_mulo { + ($fn:ident, $iD:ident, $uD:ident) => { + fn $fn(lhs: $iD, rhs: $iD) -> ($iD, bool) { + let mut lhs = lhs; + let mut rhs = rhs; + // the test against `mul_neg` below fails without this early return + if lhs == 0 || rhs == 0 { + return (0, false); } - } else { - if abs_a > Self::min_value().aborting_div(-abs_b) { - *overflow = 1; + + let lhs_neg = lhs < 0; + let rhs_neg = rhs < 0; + if lhs_neg { + lhs = lhs.wrapping_neg(); } - } - result - } -} + if rhs_neg { + rhs = rhs.wrapping_neg(); + } + let mul_neg = lhs_neg != rhs_neg; -impl Mulo for i32 {} -impl Mulo for i64 {} -impl Mulo for i128 {} + let (mul, o) = (lhs as $uD).mulo(rhs as $uD); + let mut mul = mul as $iD; -trait UMulo: Int { - fn mulo(self, other: Self, overflow: &mut i32) -> Self { - *overflow = 0; - let result = self.wrapping_mul(other); - if self > Self::max_value().aborting_div(other) { - *overflow = 1; + if mul_neg { + mul = mul.wrapping_neg(); + } + if (mul < 0) != mul_neg { + // this one check happens to catch all edge cases related to `$iD::MIN` + (mul, true) + } else { + (mul, o) + } } - result - } + }; } -impl UMulo for u128 {} + +impl_signed_mulo!(i32_overflowing_mul, i32, u32); +impl_signed_mulo!(i64_overflowing_mul, i64, u64); +impl_signed_mulo!(i128_overflowing_mul, i128, u128); intrinsics! 
{ #[maybe_use_optimized_c_shim] @@ -95,27 +106,29 @@ intrinsics! { } pub extern "C" fn __mulosi4(a: i32, b: i32, oflow: &mut i32) -> i32 { - a.mulo(b, oflow) + let (mul, o) = i32_overflowing_mul(a, b); + *oflow = o as i32; + mul } pub extern "C" fn __mulodi4(a: i64, b: i64, oflow: &mut i32) -> i64 { - a.mulo(b, oflow) + let (mul, o) = i64_overflowing_mul(a, b); + *oflow = o as i32; + mul } #[unadjusted_on_win64] pub extern "C" fn __muloti4(a: i128, b: i128, oflow: &mut i32) -> i128 { - a.mulo(b, oflow) + let (mul, o) = i128_overflowing_mul(a, b); + *oflow = o as i32; + mul } pub extern "C" fn __rust_i128_mulo(a: i128, b: i128) -> (i128, bool) { - let mut oflow = 0; - let r = __muloti4(a, b, &mut oflow); - (r, oflow != 0) + i128_overflowing_mul(a, b) } pub extern "C" fn __rust_u128_mulo(a: u128, b: u128) -> (u128, bool) { - let mut oflow = 0; - let r = a.mulo(b, &mut oflow); - (r, oflow != 0) + a.mulo(b) } } From 400c5042d8fcc951e1c27252615cb4b9557ad084 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Mon, 12 Oct 2020 23:58:55 -0500 Subject: [PATCH 4/8] Completely replace `LargeInt` --- src/int/addsub.rs | 84 +++++++++++++---------------------------------- src/int/mod.rs | 44 ------------------------- src/int/mul.rs | 45 +++++++++++++------------ src/int/shift.rs | 72 ++++++++++++++++++---------------------- src/macros.rs | 8 ++--- 5 files changed, 83 insertions(+), 170 deletions(-) diff --git a/src/int/addsub.rs b/src/int/addsub.rs index 0a88e2fc..f4841e90 100644 --- a/src/int/addsub.rs +++ b/src/int/addsub.rs @@ -1,25 +1,16 @@ -use int::Int; -use int::LargeInt; +use int::{DInt, Int}; -trait UAddSub: LargeInt { +trait UAddSub: DInt { fn uadd(self, other: Self) -> Self { - let (low, carry) = self.low().overflowing_add(other.low()); - let high = self.high().wrapping_add(other.high()); - let carry = if carry { - Self::HighHalf::ONE - } else { - Self::HighHalf::ZERO - }; - Self::from_parts(low, high.wrapping_add(carry)) + let (lo, carry) = self.lo().overflowing_add(other.lo()); + let hi = self.hi().wrapping_add(other.hi()); + let carry = if carry { Self::H::ONE } else { Self::H::ZERO }; + Self::from_lo_hi(lo, hi.wrapping_add(carry)) } fn uadd_one(self) -> Self { - let (low, carry) = self.low().overflowing_add(Self::LowHalf::ONE); - let carry = if carry { - Self::HighHalf::ONE - } else { - Self::HighHalf::ZERO - }; - Self::from_parts(low, self.high().wrapping_add(carry)) + let (lo, carry) = self.lo().overflowing_add(Self::H::ONE); + let carry = if carry { Self::H::ONE } else { Self::H::ZERO }; + Self::from_lo_hi(lo, self.hi().wrapping_add(carry)) } fn usub(self, other: Self) -> Self { let uneg = (!other).uadd_one(); @@ -48,19 +39,9 @@ trait Addo: AddSub where ::UnsignedInt: UAddSub, { - fn addo(self, other: Self, overflow: &mut i32) -> Self { - *overflow = 0; - let result = AddSub::add(self, other); - if other >= Self::ZERO { - if result < self { - *overflow = 1; - } - } else { - if result >= self { - *overflow = 1; - } - } - result + fn addo(self, other: Self) -> (Self, bool) { + let sum = AddSub::add(self, other); + (sum, (other < Self::ZERO) != (sum < self)) } } @@ -71,19 +52,9 @@ trait Subo: AddSub where ::UnsignedInt: UAddSub, { - fn subo(self, other: Self, overflow: &mut i32) -> Self { - *overflow = 0; - let result = AddSub::sub(self, other); - if other >= Self::ZERO { - if result > self { - *overflow = 1; - } - } else { - if result <= self { - *overflow = 1; - } - } - result + fn subo(self, other: Self) -> (Self, bool) { + let sum = AddSub::sub(self, other); + (sum, (other < Self::ZERO) 
!= (self < sum)) } } @@ -92,43 +63,34 @@ impl Subo for u128 {} intrinsics! { pub extern "C" fn __rust_i128_add(a: i128, b: i128) -> i128 { - __rust_u128_add(a as _, b as _) as _ + AddSub::add(a,b) } pub extern "C" fn __rust_i128_addo(a: i128, b: i128) -> (i128, bool) { - let mut oflow = 0; - let r = a.addo(b, &mut oflow); - (r, oflow != 0) + a.addo(b) } pub extern "C" fn __rust_u128_add(a: u128, b: u128) -> u128 { - a.add(b) + AddSub::add(a,b) } pub extern "C" fn __rust_u128_addo(a: u128, b: u128) -> (u128, bool) { - let mut oflow = 0; - let r = a.addo(b, &mut oflow); - (r, oflow != 0) + a.addo(b) } - pub extern "C" fn __rust_i128_sub(a: i128, b: i128) -> i128 { - __rust_u128_sub(a as _, b as _) as _ + AddSub::sub(a,b) } pub extern "C" fn __rust_i128_subo(a: i128, b: i128) -> (i128, bool) { - let mut oflow = 0; - let r = a.subo(b, &mut oflow); - (r, oflow != 0) + a.subo(b) } pub extern "C" fn __rust_u128_sub(a: u128, b: u128) -> u128 { - a.sub(b) + AddSub::sub(a,b) } pub extern "C" fn __rust_u128_subo(a: u128, b: u128) -> (u128, bool) { - let mut oflow = 0; - let r = a.subo(b, &mut oflow); - (r, oflow != 0) + a.subo(b) } } diff --git a/src/int/mod.rs b/src/int/mod.rs index da2263f6..1ce3d92d 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -384,50 +384,6 @@ impl_h_int!( i64 u64 i128 ); -/// Trait to convert an integer to/from smaller parts -pub(crate) trait LargeInt: Int { - type LowHalf: Int; - type HighHalf: Int; - - fn low(self) -> Self::LowHalf; - fn low_as_high(low: Self::LowHalf) -> Self::HighHalf; - fn high(self) -> Self::HighHalf; - fn high_as_low(low: Self::HighHalf) -> Self::LowHalf; - fn from_parts(low: Self::LowHalf, high: Self::HighHalf) -> Self; -} - -macro_rules! large_int { - ($ty:ty, $tylow:ty, $tyhigh:ty, $halfbits:expr) => { - impl LargeInt for $ty { - type LowHalf = $tylow; - type HighHalf = $tyhigh; - - fn low(self) -> $tylow { - self as $tylow - } - fn low_as_high(low: $tylow) -> $tyhigh { - low as $tyhigh - } - fn high(self) -> $tyhigh { - (self >> $halfbits) as $tyhigh - } - fn high_as_low(high: $tyhigh) -> $tylow { - high as $tylow - } - fn from_parts(low: $tylow, high: $tyhigh) -> $ty { - low as $ty | ((high as $ty) << $halfbits) - } - } - }; -} - -large_int!(u32, u16, u16, 16); -large_int!(i32, u16, i16, 16); -large_int!(u64, u32, u32, 32); -large_int!(i64, u32, i32, 32); -large_int!(u128, u64, u64, 64); -large_int!(i128, u64, i64, 64); - /// Trait to express (possibly lossy) casting of integers pub(crate) trait CastInto: Copy { fn cast(self) -> T; diff --git a/src/int/mul.rs b/src/int/mul.rs index e5c0afc1..a5238eea 100644 --- a/src/int/mul.rs +++ b/src/int/mul.rs @@ -1,26 +1,29 @@ -use int::LargeInt; use int::{DInt, HInt, Int}; -trait Mul: LargeInt { - fn mul(self, other: Self) -> Self { - let half_bits = Self::BITS / 4; - let lower_mask = !<::LowHalf>::ZERO >> half_bits; - let mut low = (self.low() & lower_mask).wrapping_mul(other.low() & lower_mask); - let mut t = low >> half_bits; - low &= lower_mask; - t += (self.low() >> half_bits).wrapping_mul(other.low() & lower_mask); - low += (t & lower_mask) << half_bits; - let mut high = Self::low_as_high(t >> half_bits); - t = low >> half_bits; - low &= lower_mask; - t += (other.low() >> half_bits).wrapping_mul(self.low() & lower_mask); - low += (t & lower_mask) << half_bits; - high += Self::low_as_high(t >> half_bits); - high += Self::low_as_high((self.low() >> half_bits).wrapping_mul(other.low() >> half_bits)); - high = high - .wrapping_add(self.high().wrapping_mul(Self::low_as_high(other.low()))) - 
.wrapping_add(Self::low_as_high(self.low()).wrapping_mul(other.high())); - Self::from_parts(low, high) +trait Mul: DInt +where + Self::H: DInt, +{ + fn mul(self, rhs: Self) -> Self { + // In order to prevent infinite recursion, we cannot use the `widen_mul` in this: + //self.lo().widen_mul(rhs.lo()) + // .wrapping_add(self.lo().wrapping_mul(rhs.hi()).widen_hi()) + // .wrapping_add(self.hi().wrapping_mul(rhs.lo()).widen_hi()) + + let lhs_lo = self.lo(); + let rhs_lo = rhs.lo(); + // construct the widening multiplication using only `Self::H` sized multiplications + let tmp_0 = lhs_lo.lo().zero_widen_mul(rhs_lo.lo()); + let tmp_1 = lhs_lo.lo().zero_widen_mul(rhs_lo.hi()); + let tmp_2 = lhs_lo.hi().zero_widen_mul(rhs_lo.lo()); + let tmp_3 = lhs_lo.hi().zero_widen_mul(rhs_lo.hi()); + // sum up all widening partials + let mul = Self::from_lo_hi(tmp_0, tmp_3) + .wrapping_add(tmp_1.zero_widen() << (Self::BITS / 4)) + .wrapping_add(tmp_2.zero_widen() << (Self::BITS / 4)); + // add the higher partials + mul.wrapping_add(lhs_lo.wrapping_mul(rhs.hi()).widen_hi()) + .wrapping_add(self.hi().wrapping_mul(rhs_lo).widen_hi()) } } diff --git a/src/int/shift.rs b/src/int/shift.rs index 674c3ee8..20561786 100644 --- a/src/int/shift.rs +++ b/src/int/shift.rs @@ -1,20 +1,18 @@ -use int::{Int, LargeInt}; +use int::{DInt, HInt, Int}; -trait Ashl: Int + LargeInt { +trait Ashl: DInt { /// Returns `a << b`, requires `b < Self::BITS` - fn ashl(self, offset: u32) -> Self - where - Self: LargeInt::LowHalf>, - { - let half_bits = Self::BITS / 2; - if offset & half_bits != 0 { - Self::from_parts(Int::ZERO, self.low() << (offset - half_bits)) - } else if offset == 0 { + fn ashl(self, shl: u32) -> Self { + let n_h = Self::H::BITS; + if shl & n_h != 0 { + // we only need `self.lo()` because `self.hi()` will be shifted out entirely + (self.lo() << (shl - n_h)).widen_hi() + } else if shl == 0 { self } else { - Self::from_parts( - self.low() << offset, - (self.high() << offset) | (self.low() >> (half_bits - offset)), + Self::from_lo_hi( + self.lo() << shl, + self.lo().logical_shr(n_h - shl) | (self.hi() << shl), ) } } @@ -24,25 +22,22 @@ impl Ashl for u32 {} impl Ashl for u64 {} impl Ashl for u128 {} -trait Ashr: Int + LargeInt { +trait Ashr: DInt { /// Returns arithmetic `a >> b`, requires `b < Self::BITS` - fn ashr(self, offset: u32) -> Self - where - Self: LargeInt::HighHalf as Int>::UnsignedInt>, - { - let half_bits = Self::BITS / 2; - if offset & half_bits != 0 { - Self::from_parts( - (self.high() >> (offset - half_bits)).unsigned(), - self.high() >> (half_bits - 1), + fn ashr(self, shr: u32) -> Self { + let n_h = Self::H::BITS; + if shr & n_h != 0 { + Self::from_lo_hi( + self.hi() >> (shr - n_h), + // smear the sign bit + self.hi() >> (n_h - 1), ) - } else if offset == 0 { + } else if shr == 0 { self } else { - let high_unsigned = self.high().unsigned(); - Self::from_parts( - (high_unsigned << (half_bits - offset)) | (self.low() >> offset), - self.high() >> offset, + Self::from_lo_hi( + self.lo().logical_shr(shr) | (self.hi() << (n_h - shr)), + self.hi() >> shr, ) } } @@ -52,21 +47,18 @@ impl Ashr for i32 {} impl Ashr for i64 {} impl Ashr for i128 {} -trait Lshr: Int + LargeInt { +trait Lshr: DInt { /// Returns logical `a >> b`, requires `b < Self::BITS` - fn lshr(self, offset: u32) -> Self - where - Self: LargeInt::LowHalf>, - { - let half_bits = Self::BITS / 2; - if offset & half_bits != 0 { - Self::from_parts(self.high() >> (offset - half_bits), Int::ZERO) - } else if offset == 0 { + fn lshr(self, shr: u32) -> Self { 
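+        // Note: `Self::BITS` is `2 * Self::H::BITS` and `Self::H::BITS` is a power of two, so
+        // the `shr & n_h != 0` test below is exactly the case `n_h <= shr < 2 * n_h`, i.e. the
+        // shift crosses into the high half.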
+ let n_h = Self::H::BITS; + if shr & n_h != 0 { + self.hi().logical_shr(shr - n_h).zero_widen() + } else if shr == 0 { self } else { - Self::from_parts( - (self.high() << (half_bits - offset)) | (self.low() >> offset), - self.high() >> offset, + Self::from_lo_hi( + self.lo().logical_shr(shr) | (self.hi() << (n_h - shr)), + self.hi().logical_shr(shr), ) } } diff --git a/src/macros.rs b/src/macros.rs index b02f3ea5..56f27164 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -284,16 +284,16 @@ pub mod win64_128bit_abi_hack { impl From for U64x2 { fn from(i: i128) -> U64x2 { - use int::LargeInt; + use int::DInt; let j = i as u128; - U64x2(j.low(), j.high()) + U64x2(j.lo(), j.hi()) } } impl From for U64x2 { fn from(i: u128) -> U64x2 { - use int::LargeInt; - U64x2(i.low(), i.high()) + use int::DInt; + U64x2(i.lo(), i.hi()) } } } From 35e323aa0097208d4b7db75fcb8d73d0c83ff3d3 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Tue, 1 Dec 2020 23:16:26 -0600 Subject: [PATCH 5/8] Remove `WideInt` --- src/float/cmp.rs | 19 ++++--------------- src/float/div.rs | 10 +++++----- src/float/mod.rs | 11 ++++++++--- src/float/mul.rs | 30 +++++++++++++++++++----------- src/int/mod.rs | 45 --------------------------------------------- 5 files changed, 36 insertions(+), 79 deletions(-) diff --git a/src/float/cmp.rs b/src/float/cmp.rs index 20ab92e4..79c26b09 100644 --- a/src/float/cmp.rs +++ b/src/float/cmp.rs @@ -1,7 +1,7 @@ #![allow(unreachable_code)] use float::Float; -use int::{CastInto, Int}; +use int::Int; #[derive(Clone, Copy)] enum Result { @@ -31,13 +31,7 @@ impl Result { } } -fn cmp(a: F, b: F) -> Result -where - u32: CastInto, - F::Int: CastInto, - i32: CastInto, - F::Int: CastInto, -{ +fn cmp(a: F, b: F) -> Result { let one = F::Int::ONE; let zero = F::Int::ZERO; let szero = F::SignedInt::ZERO; @@ -90,13 +84,8 @@ where } } } -fn unord(a: F, b: F) -> bool -where - u32: CastInto, - F::Int: CastInto, - i32: CastInto, - F::Int: CastInto, -{ + +fn unord(a: F, b: F) -> bool { let one = F::Int::ONE; let sign_bit = F::SIGN_MASK as F::Int; diff --git a/src/float/div.rs b/src/float/div.rs index 7c582a44..dd6467f8 100644 --- a/src/float/div.rs +++ b/src/float/div.rs @@ -1,5 +1,5 @@ use float::Float; -use int::{CastInto, Int, WideInt}; +use int::{CastInto, DInt, HInt, Int}; fn div32(a: F, b: F) -> F where @@ -7,7 +7,7 @@ where F::Int: CastInto, i32: CastInto, F::Int: CastInto, - F::Int: WideInt, + F::Int: HInt, { let one = F::Int::ONE; let zero = F::Int::ZERO; @@ -156,7 +156,7 @@ where // is the error in the reciprocal of b scaled by the maximum // possible value of a. As a consequence of this error bound, // either q or nextafter(q) is the correctly rounded - let (mut quotient, _) = ::wide_mul(a_significand << 1, reciprocal.cast()); + let mut quotient = (a_significand << 1).widen_mul(reciprocal.cast()).hi(); // Two cases: quotient is in [0.5, 1.0) or quotient is in [1.0, 2.0). // In either case, we are going to compute a residual of the form @@ -211,7 +211,7 @@ where F::Int: CastInto, i64: CastInto, F::Int: CastInto, - F::Int: WideInt, + F::Int: HInt, { let one = F::Int::ONE; let zero = F::Int::ZERO; @@ -394,7 +394,7 @@ where // We need a 64 x 64 multiply high to compute q, which isn't a basic // operation in C, so we need to be a little bit fussy. - let (mut quotient, _) = ::wide_mul(a_significand << 2, reciprocal.cast()); + let mut quotient = (a_significand << 2).widen_mul(reciprocal.cast()).hi(); // Two cases: quotient is in [0.5, 1.0) or quotient is in [1.0, 2.0). 
// In either case, we are going to compute a residual of the form diff --git a/src/float/mod.rs b/src/float/mod.rs index 06e9aad4..34b3c6ac 100644 --- a/src/float/mod.rs +++ b/src/float/mod.rs @@ -13,7 +13,8 @@ pub mod pow; pub mod sub; /// Trait for some basic operations on floats -pub(crate) trait Float: +#[doc(hidden)] +pub trait Float: Copy + PartialEq + PartialOrd @@ -66,7 +67,6 @@ pub(crate) trait Float: /// Returns `self` transmuted to `Self::SignedInt` fn signed_repr(self) -> Self::SignedInt; - #[cfg(test)] /// Checks if two floats have the same bit representation. *Except* for NaNs! NaN can be /// represented in multiple different ways. This method returns `true` if two NaNs are /// compared. @@ -80,6 +80,9 @@ pub(crate) trait Float: /// Returns (normalized exponent, normalized significand) fn normalize(significand: Self::Int) -> (i32, Self::Int); + + /// Returns if `self` is subnormal + fn is_subnormal(&self) -> bool; } // FIXME: Some of this can be removed if RFC Issue #1424 is resolved @@ -106,7 +109,6 @@ macro_rules! float_impl { fn signed_repr(self) -> Self::SignedInt { unsafe { mem::transmute(self) } } - #[cfg(test)] fn eq_repr(self, rhs: Self) -> bool { if self.is_nan() && rhs.is_nan() { true @@ -133,6 +135,9 @@ macro_rules! float_impl { significand << shift as Self::Int, ) } + fn is_subnormal(&self) -> bool { + (self.repr() & Self::EXPONENT_MASK) == Self::Int::ZERO + } } }; } diff --git a/src/float/mul.rs b/src/float/mul.rs index 7b28793c..540e7bdc 100644 --- a/src/float/mul.rs +++ b/src/float/mul.rs @@ -1,5 +1,5 @@ use float::Float; -use int::{CastInto, Int, WideInt}; +use int::{CastInto, DInt, HInt, Int}; fn mul(a: F, b: F) -> F where @@ -7,7 +7,7 @@ where F::Int: CastInto, i32: CastInto, F::Int: CastInto, - F::Int: WideInt, + F::Int: HInt, { let one = F::Int::ONE; let zero = F::Int::ZERO; @@ -112,8 +112,9 @@ where // have (exponentBits + 2) integral digits, all but two of which must be // zero. Normalizing this result is just a conditional left-shift by one // and bumping the exponent accordingly. - let (mut product_high, mut product_low) = - ::wide_mul(a_significand, b_significand << exponent_bits); + let (mut product_low, mut product_high) = a_significand + .widen_mul(b_significand << exponent_bits) + .lo_hi(); let a_exponent_i32: i32 = a_exponent.cast(); let b_exponent_i32: i32 = b_exponent.cast(); @@ -126,7 +127,8 @@ where if (product_high & implicit_bit) != zero { product_exponent = product_exponent.wrapping_add(1); } else { - ::wide_shift_left(&mut product_high, &mut product_low, 1); + product_high = (product_high << 1) | (product_low >> (bits - 1)); + product_low <<= 1; } // If we have overflowed the type, return +/- infinity. @@ -142,17 +144,23 @@ where // handle this case separately, but we make it a special case to // simplify the shift logic. let shift = one.wrapping_sub(product_exponent.cast()).cast(); - if shift >= bits as i32 { + if shift >= bits { return F::from_repr(product_sign); } // Otherwise, shift the significand of the result so that the round // bit is the high bit of productLo. 
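+        // The old `wide_shift_right_with_sticky` helper is inlined here: any bits shifted out of
+        // the low word are OR'd back into it, so the rounding logic below still sees that they
+        // were set.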
- ::wide_shift_right_with_sticky( - &mut product_high, - &mut product_low, - shift, - ) + if shift < bits { + let sticky = product_low << (bits - shift); + product_low = product_high << (bits - shift) | product_low >> shift | sticky; + product_high >>= shift; + } else if shift < (2 * bits) { + let sticky = product_high << (2 * bits - shift) | product_low; + product_low = product_high >> (shift - bits) | sticky; + product_high = zero; + } else { + product_high = zero; + } } else { // Result is normal before rounding; insert the exponent. product_high &= significand_mask; diff --git a/src/int/mod.rs b/src/int/mod.rs index 1ce3d92d..d2302826 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -408,48 +408,3 @@ cast_into!(u64); cast_into!(i64); cast_into!(u128); cast_into!(i128); - -pub(crate) trait WideInt: Int { - type Output: Int; - - fn wide_mul(self, other: Self) -> (Self, Self); - fn wide_shift_left(&mut self, low: &mut Self, count: i32); - fn wide_shift_right_with_sticky(&mut self, low: &mut Self, count: i32); -} - -macro_rules! impl_wide_int { - ($ty:ty, $tywide:ty, $bits:expr) => { - impl WideInt for $ty { - type Output = $ty; - - fn wide_mul(self, other: Self) -> (Self, Self) { - let product = (self as $tywide).wrapping_mul(other as $tywide); - ((product >> ($bits as $ty)) as $ty, product as $ty) - } - - fn wide_shift_left(&mut self, low: &mut Self, count: i32) { - *self = (*self << count) | (*low >> ($bits - count)); - *low = *low << count; - } - - fn wide_shift_right_with_sticky(&mut self, low: &mut Self, count: i32) { - if count < $bits { - let sticky = *low << ($bits - count); - *low = *self << ($bits - count) | *low >> count | sticky; - *self = *self >> count; - } else if count < 2 * $bits { - let sticky = *self << (2 * $bits - count) | *low; - *low = *self >> (count - $bits) | sticky; - *self = 0; - } else { - let sticky = *self | *low; - *self = sticky; - *self = 0; - } - } - } - }; -} - -impl_wide_int!(u32, u64, 32); -impl_wide_int!(u64, u128, 64); From 96eaffff5a9bc300114d2ba135707e17ffb0ba92 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Tue, 1 Dec 2020 23:09:57 -0600 Subject: [PATCH 6/8] replace some transmutes --- src/float/mod.rs | 9 +++------ testcrate/build.rs | 8 ++++---- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/float/mod.rs b/src/float/mod.rs index 34b3c6ac..c4b69016 100644 --- a/src/float/mod.rs +++ b/src/float/mod.rs @@ -1,4 +1,3 @@ -use core::mem; use core::ops; use super::int::Int; @@ -85,8 +84,6 @@ pub trait Float: fn is_subnormal(&self) -> bool; } -// FIXME: Some of this can be removed if RFC Issue #1424 is resolved -// https://github.com/rust-lang/rfcs/issues/1424 macro_rules! float_impl { ($ty:ident, $ity:ident, $sity:ident, $bits:expr, $significand_bits:expr) => { impl Float for $ty { @@ -104,10 +101,10 @@ macro_rules! float_impl { const EXPONENT_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIGNIFICAND_MASK); fn repr(self) -> Self::Int { - unsafe { mem::transmute(self) } + self.to_bits() } fn signed_repr(self) -> Self::SignedInt { - unsafe { mem::transmute(self) } + self.to_bits() as Self::SignedInt } fn eq_repr(self, rhs: Self) -> bool { if self.is_nan() && rhs.is_nan() { @@ -117,7 +114,7 @@ macro_rules! 
float_impl { } } fn from_repr(a: Self::Int) -> Self { - unsafe { mem::transmute(a) } + Self::from_bits(a) } fn from_parts(sign: bool, exponent: Self::Int, significand: Self::Int) -> Self { Self::from_repr( diff --git a/testcrate/build.rs b/testcrate/build.rs index 1baa6a96..1ecd0179 100644 --- a/testcrate/build.rs +++ b/testcrate/build.rs @@ -633,7 +633,7 @@ fn main() { if a.0.is_nan() || b.0.is_nan() || c.is_nan() - || c.abs() <= unsafe { mem::transmute(4503599627370495u64) } + || c.abs() <= f64::from_bits(4503599627370495u64) { None } else { @@ -651,7 +651,7 @@ fn main() { if a.0.is_nan() || b.0.is_nan() || c.is_nan() - || c.abs() <= unsafe { mem::transmute(16777215u32) } + || c.abs() <= f32::from_bits(16777215u32) { None } else { @@ -671,7 +671,7 @@ fn main() { if a.0.is_nan() || b.0.is_nan() || c.is_nan() - || c.abs() <= unsafe { mem::transmute(4503599627370495u64) } + || c.abs() <= f64::from_bits(4503599627370495u64) { None } else { @@ -689,7 +689,7 @@ fn main() { if a.0.is_nan() || b.0.is_nan() || c.is_nan() - || c.abs() <= unsafe { mem::transmute(16777215u32) } + || c.abs() <= f32::from_bits(16777215u32) { None } else { From 430c0b41d08821cd81ec5afb748d86606fdc4d40 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Sat, 7 Nov 2020 13:09:51 -0600 Subject: [PATCH 7/8] fix some clippy warnings --- src/mem/impls.rs | 8 +++----- src/mem/mod.rs | 4 ++-- src/mem/x86_64.rs | 4 ++-- testcrate/build.rs | 5 +---- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/mem/impls.rs b/src/mem/impls.rs index 6bd1a7ba..b3eef990 100644 --- a/src/mem/impls.rs +++ b/src/mem/impls.rs @@ -1,10 +1,8 @@ -use super::c_int; - #[inline(always)] pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, n: usize) { let mut i = 0; while i < n { - *dest.offset(i as isize) = *src.offset(i as isize); + *dest.add(i) = *src.add(i); i += 1; } } @@ -15,7 +13,7 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) { let mut i = n; while i != 0 { i -= 1; - *dest.offset(i as isize) = *src.offset(i as isize); + *dest.add(i) = *src.add(i); } } @@ -23,7 +21,7 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, n: usize) { pub unsafe fn set_bytes(s: *mut u8, c: u8, n: usize) { let mut i = 0; while i < n { - *s.offset(i as isize) = c; + *s.add(i) = c; i += 1; } } diff --git a/src/mem/mod.rs b/src/mem/mod.rs index adb7c2c6..107762c4 100644 --- a/src/mem/mod.rs +++ b/src/mem/mod.rs @@ -45,8 +45,8 @@ pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 { pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { let mut i = 0; while i < n { - let a = *s1.offset(i as isize); - let b = *s2.offset(i as isize); + let a = *s1.add(i); + let b = *s2.add(i); if a != b { return a as i32 - b as i32; } diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs index 7eefd809..8cbbdf77 100644 --- a/src/mem/x86_64.rs +++ b/src/mem/x86_64.rs @@ -59,8 +59,8 @@ pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) { "cld", byte_count = in(reg) byte_count, inout("rcx") qword_count => _, - inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _, - inout("rsi") src.offset(count as isize).wrapping_sub(8) => _, + inout("rdi") dest.add(count).wrapping_sub(8) => _, + inout("rsi") src.add(count).wrapping_sub(8) => _, options(nostack) ); } diff --git a/testcrate/build.rs b/testcrate/build.rs index 1ecd0179..39c2486c 100644 --- a/testcrate/build.rs +++ b/testcrate/build.rs @@ -648,10 +648,7 @@ fn main() { return None; } let c = a.0 / b.0; - if a.0.is_nan() 
- || b.0.is_nan() - || c.is_nan() - || c.abs() <= f32::from_bits(16777215u32) + if a.0.is_nan() || b.0.is_nan() || c.is_nan() || c.abs() <= f32::from_bits(16777215u32) { None } else { From c2ff1b3119dafb4c56e8e9b8b75f20b9fd4ba3ed Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Mon, 7 Dec 2020 23:25:42 -0600 Subject: [PATCH 8/8] Completely overhaul fuzz testing adds testing for almost every numerical intrinsic --- src/int/mod.rs | 11 ++ testcrate/Cargo.toml | 2 +- testcrate/src/lib.rs | 258 +++++++++++++++++++++++++++++++ testcrate/tests/addsub.rs | 109 +++++++++++++ testcrate/tests/cmp.rs | 52 +++++++ testcrate/tests/conv.rs | 125 +++++++++++++++ testcrate/tests/div_rem.rs | 147 +++++++++--------- testcrate/tests/leading_zeros.rs | 54 ------- testcrate/tests/misc.rs | 134 ++++++++++++++++ testcrate/tests/mul.rs | 114 ++++++++++++++ testcrate/tests/shift.rs | 60 +++++++ 11 files changed, 937 insertions(+), 129 deletions(-) create mode 100644 testcrate/tests/addsub.rs create mode 100644 testcrate/tests/cmp.rs create mode 100644 testcrate/tests/conv.rs delete mode 100644 testcrate/tests/leading_zeros.rs create mode 100644 testcrate/tests/misc.rs create mode 100644 testcrate/tests/mul.rs create mode 100644 testcrate/tests/shift.rs diff --git a/src/int/mod.rs b/src/int/mod.rs index d2302826..cb94803a 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -72,6 +72,9 @@ pub trait Int: /// Prevents the need for excessive conversions between signed and unsigned fn logical_shr(self, other: u32) -> Self; + /// Absolute difference between two integers. + fn abs_diff(self, other: Self) -> Self::UnsignedInt; + // copied from primitive integers, but put in a trait fn is_zero(self) -> bool; fn max_value() -> Self; @@ -251,6 +254,10 @@ macro_rules! int_impl { me } + fn abs_diff(self, other: Self) -> Self { + (self.wrapping_sub(other) as $ity).wrapping_abs() as $uty + } + int_impl_common!($uty, $bits); } @@ -274,6 +281,10 @@ macro_rules! int_impl { me as $ity } + fn abs_diff(self, other: Self) -> $uty { + self.wrapping_sub(other).wrapping_abs() as $uty + } + int_impl_common!($ity, $bits); } }; diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml index ce8df2d1..ff9a6a45 100644 --- a/testcrate/Cargo.toml +++ b/testcrate/Cargo.toml @@ -11,7 +11,7 @@ doctest = false [build-dependencies] rand = "0.7" -[dev-dependencies] +[dependencies] # For fuzzing tests we want a deterministic seedable RNG. We also eliminate potential # problems with system RNGs on the variety of platforms this crate is tested on. # `xoshiro128**` is used for its quality, size, and speed at generating `u32` shift amounts. diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs index 0c9ac1ac..9bd155f6 100644 --- a/testcrate/src/lib.rs +++ b/testcrate/src/lib.rs @@ -1 +1,259 @@ +//! This crate is for integration testing and fuzz testing of functions in `compiler-builtins`. This +//! includes publicly documented intrinsics and some internal alternative implementation functions +//! such as `usize_leading_zeros_riscv` (which are tested because they are configured for +//! architectures not tested by the CI). +//! +//! The general idea is to use a combination of edge case testing and randomized fuzz testing. The +//! edge case testing is crucial for checking cases like where both inputs are equal or equal to +//! special values such as `i128::MIN`, which is unlikely for the random fuzzer by itself to +//! encounter. The randomized fuzz testing is specially designed to cover wide swaths of search +//! space in as few iterations as possible. 
See `fuzz_values` in `testcrate/tests/misc.rs` for an +//! example. +//! +//! Some floating point tests are disabled for specific architectures, because they do not have +//! correct rounding. #![no_std] + +use compiler_builtins::float::Float; +use compiler_builtins::int::Int; + +use rand_xoshiro::rand_core::{RngCore, SeedableRng}; +use rand_xoshiro::Xoshiro128StarStar; + +/// Sets the number of fuzz iterations run for most tests. In practice, the vast majority of bugs +/// are caught by the edge case testers. Most of the remaining bugs triggered by more complex +/// sequences are caught well within 10_000 fuzz iterations. For classes of algorithms like division +/// that are vulnerable to rare edge cases, we want 1_000_000 iterations to be more confident. In +/// practical CI, however, we only want to run the more strenuous test once to catch algorithmic +/// level bugs, and run the 10_000 iteration test on most targets. Target-dependent bugs are likely +/// to involve miscompilation and misconfiguration that is likely to break algorithms in quickly +/// caught ways. We choose to configure `N = 1_000_000` iterations for `x86_64` targets (and if +/// debug assertions are disabled. Tests without `--release` would take too long) which are likely +/// to have fast hardware, and run `N = 10_000` for all other targets. +pub const N: u32 = if cfg!(target_arch = "x86_64") && !cfg!(debug_assertions) { + 1_000_000 +} else { + 10_000 +}; + +/// Random fuzzing step. When run several times, it results in excellent fuzzing entropy such as: +/// 11110101010101011110111110011111 +/// 10110101010100001011101011001010 +/// 1000000000000000 +/// 10000000000000110111110000001010 +/// 1111011111111101010101111110101 +/// 101111111110100000000101000000 +/// 10000000110100000000100010101 +/// 1010101010101000 +fn fuzz_step(rng: &mut Xoshiro128StarStar, x: &mut I) { + let ones = !I::ZERO; + let bit_indexing_mask: u32 = I::BITS - 1; + // It happens that all the RNG we need can come from one call. 7 bits are needed to index a + // worst case 128 bit integer, and there are 4 indexes that need to be made plus 4 bits for + // selecting operations + let rng32 = rng.next_u32(); + + // Randomly OR, AND, and XOR randomly sized and shifted continuous strings of + // ones with `lhs` and `rhs`. + let r0 = bit_indexing_mask & rng32; + let r1 = bit_indexing_mask & (rng32 >> 7); + let mask = ones.wrapping_shl(r0).rotate_left(r1); + match (rng32 >> 14) % 4 { + 0 => *x |= mask, + 1 => *x &= mask, + // both 2 and 3 to make XORs as common as ORs and ANDs combined + _ => *x ^= mask, + } + + // Alternating ones and zeros (e.x. 0b1010101010101010). This catches second-order + // problems that might occur for algorithms with two modes of operation (potentially + // there is some invariant that can be broken and maintained via alternating between modes, + // breaking the algorithm when it reaches the end). + let mut alt_ones = I::ONE; + for _ in 0..(I::BITS / 2) { + alt_ones <<= 2; + alt_ones |= I::ONE; + } + let r0 = bit_indexing_mask & (rng32 >> 16); + let r1 = bit_indexing_mask & (rng32 >> 23); + let mask = alt_ones.wrapping_shl(r0).rotate_left(r1); + match rng32 >> 30 { + 0 => *x |= mask, + 1 => *x &= mask, + _ => *x ^= mask, + } +} + +// We need macros like this, because `#![no_std]` prevents us from using iterators +macro_rules! 
edge_cases { + ($I:ident, $case:ident, $inner:block) => { + for i0 in 0..$I::FUZZ_NUM { + let mask_lo = (!$I::UnsignedInt::ZERO).wrapping_shr($I::FUZZ_LENGTHS[i0] as u32); + for i1 in i0..I::FUZZ_NUM { + let mask_hi = + (!$I::UnsignedInt::ZERO).wrapping_shl($I::FUZZ_LENGTHS[i1 - i0] as u32); + let $case = I::from_unsigned(mask_lo & mask_hi); + $inner + } + } + }; +} + +/// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find +/// edge cases, followed by a more random fuzzer that runs `n` times. +pub fn fuzz(n: u32, mut f: F) { + // edge case tester. Calls `f` 210 times for u128. + // zero gets skipped by the loop + f(I::ZERO); + edge_cases!(I, case, { + f(case); + }); + + // random fuzzer + let mut rng = Xoshiro128StarStar::seed_from_u64(0); + let mut x: I = Int::ZERO; + for _ in 0..n { + fuzz_step(&mut rng, &mut x); + f(x) + } +} + +/// The same as `fuzz`, except `f` has two inputs. +pub fn fuzz_2(n: u32, f: F) { + // Check cases where the first and second inputs are zero. Both call `f` 210 times for `u128`. + edge_cases!(I, case, { + f(I::ZERO, case); + }); + edge_cases!(I, case, { + f(case, I::ZERO); + }); + // Nested edge tester. Calls `f` 44100 times for `u128`. + edge_cases!(I, case0, { + edge_cases!(I, case1, { + f(case0, case1); + }) + }); + + // random fuzzer + let mut rng = Xoshiro128StarStar::seed_from_u64(0); + let mut x: I = I::ZERO; + let mut y: I = I::ZERO; + for _ in 0..n { + fuzz_step(&mut rng, &mut x); + fuzz_step(&mut rng, &mut y); + f(x, y) + } +} + +/// Tester for shift functions +pub fn fuzz_shift(f: F) { + // Shift functions are very simple and do not need anything other than shifting a small + // set of random patterns for every fuzz length. + let mut rng = Xoshiro128StarStar::seed_from_u64(0); + let mut x: I = Int::ZERO; + for i in 0..I::FUZZ_NUM { + fuzz_step(&mut rng, &mut x); + f(x, Int::ZERO); + f(x, I::FUZZ_LENGTHS[i] as u32); + } +} + +fn fuzz_float_step(rng: &mut Xoshiro128StarStar, f: &mut F) { + let rng32 = rng.next_u32(); + // we need to fuzz the different parts of the float separately, because the masking on larger + // significands will tend to set the exponent to all ones or all zeros frequently + + // sign bit fuzzing + let sign = (rng32 & 1) != 0; + + // exponent fuzzing. Only 4 bits for the selector needed. + let ones = (F::Int::ONE << F::EXPONENT_BITS) - F::Int::ONE; + let r0 = (rng32 >> 1) % F::EXPONENT_BITS; + let r1 = (rng32 >> 5) % F::EXPONENT_BITS; + // custom rotate shift. Note that `F::Int` is unsigned, so we can shift right without smearing + // the sign bit. + let mask = if r1 == 0 { + ones.wrapping_shr(r0) + } else { + let tmp = ones.wrapping_shr(r0); + (tmp.wrapping_shl(r1) | tmp.wrapping_shr(F::EXPONENT_BITS - r1)) & ones + }; + let mut exp = (f.repr() & F::EXPONENT_MASK) >> F::SIGNIFICAND_BITS; + match (rng32 >> 9) % 4 { + 0 => exp |= mask, + 1 => exp &= mask, + _ => exp ^= mask, + } + + // significand fuzzing + let mut sig = f.repr() & F::SIGNIFICAND_MASK; + fuzz_step(rng, &mut sig); + sig &= F::SIGNIFICAND_MASK; + + *f = F::from_parts(sign, exp, sig); +} + +macro_rules! 
float_edge_cases { + ($F:ident, $case:ident, $inner:block) => { + for exponent in [ + F::Int::ZERO, + F::Int::ONE, + F::Int::ONE << (F::EXPONENT_BITS / 2), + (F::Int::ONE << (F::EXPONENT_BITS - 1)) - F::Int::ONE, + F::Int::ONE << (F::EXPONENT_BITS - 1), + (F::Int::ONE << (F::EXPONENT_BITS - 1)) + F::Int::ONE, + (F::Int::ONE << F::EXPONENT_BITS) - F::Int::ONE, + ] + .iter() + { + for significand in [ + F::Int::ZERO, + F::Int::ONE, + F::Int::ONE << (F::SIGNIFICAND_BITS / 2), + (F::Int::ONE << (F::SIGNIFICAND_BITS - 1)) - F::Int::ONE, + F::Int::ONE << (F::SIGNIFICAND_BITS - 1), + (F::Int::ONE << (F::SIGNIFICAND_BITS - 1)) + F::Int::ONE, + (F::Int::ONE << F::SIGNIFICAND_BITS) - F::Int::ONE, + ] + .iter() + { + for sign in [false, true].iter() { + let $case = F::from_parts(*sign, *exponent, *significand); + $inner + } + } + } + }; +} + +pub fn fuzz_float(n: u32, f: E) { + float_edge_cases!(F, case, { + f(case); + }); + + // random fuzzer + let mut rng = Xoshiro128StarStar::seed_from_u64(0); + let mut x = F::ZERO; + for _ in 0..n { + fuzz_float_step(&mut rng, &mut x); + f(x); + } +} + +pub fn fuzz_float_2(n: u32, f: E) { + float_edge_cases!(F, case0, { + float_edge_cases!(F, case1, { + f(case0, case1); + }); + }); + + // random fuzzer + let mut rng = Xoshiro128StarStar::seed_from_u64(0); + let mut x = F::ZERO; + let mut y = F::ZERO; + for _ in 0..n { + fuzz_float_step(&mut rng, &mut x); + fuzz_float_step(&mut rng, &mut y); + f(x, y) + } +} diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs new file mode 100644 index 00000000..ff56668b --- /dev/null +++ b/testcrate/tests/addsub.rs @@ -0,0 +1,109 @@ +use testcrate::*; + +macro_rules! sum { + ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => { + $( + fuzz_2(N, |x: $i, y: $i| { + let add0 = x.wrapping_add(y); + let sub0 = x.wrapping_sub(y); + let add1: $i = $fn_add(x, y); + let sub1: $i = $fn_sub(x, y); + if add0 != add1 { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn_add), x, y, add0, add1 + ); + } + if sub0 != sub1 { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn_sub), x, y, sub0, sub1 + ); + } + }); + )* + }; +} + +macro_rules! overflowing_sum { + ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => { + $( + fuzz_2(N, |x: $i, y: $i| { + let add0 = x.overflowing_add(y); + let sub0 = x.overflowing_sub(y); + let add1: ($i, bool) = $fn_add(x, y); + let sub1: ($i, bool) = $fn_sub(x, y); + if add0.0 != add1.0 || add0.1 != add1.1 { + panic!( + "{}({}, {}): std: {:?}, builtins: {:?}", + stringify!($fn_add), x, y, add0, add1 + ); + } + if sub0.0 != sub1.0 || sub0.1 != sub1.1 { + panic!( + "{}({}, {}): std: {:?}, builtins: {:?}", + stringify!($fn_sub), x, y, sub0, sub1 + ); + } + }); + )* + }; +} + +#[test] +fn addsub() { + use compiler_builtins::int::addsub::{ + __rust_i128_add, __rust_i128_addo, __rust_i128_sub, __rust_i128_subo, __rust_u128_add, + __rust_u128_addo, __rust_u128_sub, __rust_u128_subo, + }; + + // Integer addition and subtraction is very simple, so 100 fuzzing passes should be plenty. + sum!( + u128, __rust_u128_add, __rust_u128_sub; + i128, __rust_i128_add, __rust_i128_sub; + ); + overflowing_sum!( + u128, __rust_u128_addo, __rust_u128_subo; + i128, __rust_i128_addo, __rust_i128_subo; + ); +} + +macro_rules! 
float_sum { + ($($f:ty, $fn_add:ident, $fn_sub:ident);*;) => { + $( + fuzz_float_2(N, |x: $f, y: $f| { + let add0 = x + y; + let sub0 = x - y; + let add1: $f = $fn_add(x, y); + let sub1: $f = $fn_sub(x, y); + if !Float::eq_repr(add0, add1) { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn_add), x, y, add0, add1 + ); + } + if !Float::eq_repr(sub0, sub1) { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn_sub), x, y, sub0, sub1 + ); + } + }); + )* + }; +} + +#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))] +#[test] +fn float_addsub() { + use compiler_builtins::float::{ + add::{__adddf3, __addsf3}, + sub::{__subdf3, __subsf3}, + Float, + }; + + float_sum!( + f32, __addsf3, __subsf3; + f64, __adddf3, __subdf3; + ); +} diff --git a/testcrate/tests/cmp.rs b/testcrate/tests/cmp.rs new file mode 100644 index 00000000..d359b65d --- /dev/null +++ b/testcrate/tests/cmp.rs @@ -0,0 +1,52 @@ +use testcrate::*; + +macro_rules! cmp { + ($x:ident, $y:ident, $($unordered_val:expr, $fn:ident);*;) => { + $( + let cmp0 = if $x.is_nan() || $y.is_nan() { + $unordered_val + } else if $x < $y { + -1 + } else if $x == $y { + 0 + } else { + 1 + }; + let cmp1 = $fn($x, $y); + if cmp0 != cmp1 { + panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1); + } + )* + }; +} + +#[test] +fn float_comparisons() { + use compiler_builtins::float::cmp::{ + __eqdf2, __eqsf2, __gedf2, __gesf2, __gtdf2, __gtsf2, __ledf2, __lesf2, __ltdf2, __ltsf2, + __nedf2, __nesf2, __unorddf2, __unordsf2, + }; + + fuzz_float_2(N, |x: f32, y: f32| { + assert_eq!(__unordsf2(x, y) != 0, x.is_nan() || y.is_nan()); + cmp!(x, y, + 1, __ltsf2; + 1, __lesf2; + 1, __eqsf2; + -1, __gesf2; + -1, __gtsf2; + 1, __nesf2; + ); + }); + fuzz_float_2(N, |x: f64, y: f64| { + assert_eq!(__unorddf2(x, y) != 0, x.is_nan() || y.is_nan()); + cmp!(x, y, + 1, __ltdf2; + 1, __ledf2; + 1, __eqdf2; + -1, __gedf2; + -1, __gtdf2; + 1, __nedf2; + ); + }); +} diff --git a/testcrate/tests/conv.rs b/testcrate/tests/conv.rs new file mode 100644 index 00000000..7cdbf9fb --- /dev/null +++ b/testcrate/tests/conv.rs @@ -0,0 +1,125 @@ +use testcrate::*; + +macro_rules! i_to_f { + ($($from:ty, $into:ty, $fn:ident);*;) => { + $( + fuzz(N, |x: $from| { + let f0 = x as $into; + let f1: $into = $fn(x); + // This makes sure that the conversion produced the best rounding possible, and does + // this independent of `x as $into` rounding correctly. + // This assumes that float to integer conversion is correct. + let y_minus_ulp = <$into>::from_bits(f1.to_bits().wrapping_sub(1)) as $from; + let y = f1 as $from; + let y_plus_ulp = <$into>::from_bits(f1.to_bits().wrapping_add(1)) as $from; + let error_minus = <$from as Int>::abs_diff(y_minus_ulp, x); + let error = <$from as Int>::abs_diff(y, x); + let error_plus = <$from as Int>::abs_diff(y_plus_ulp, x); + // The first two conditions check that none of the two closest float values are + // strictly closer in representation to `x`. The second makes sure that rounding is + // towards even significand if two float values are equally close to the integer. + if error_minus < error + || error_plus < error + || ((error_minus == error || error_plus == error) + && ((f0.to_bits() & 1) != 0)) + { + panic!( + "incorrect rounding by {}({}): {}, ({}, {}, {}), errors ({}, {}, {})", + stringify!($fn), + x, + f1.to_bits(), + y_minus_ulp, + y, + y_plus_ulp, + error_minus, + error, + error_plus, + ); + } + // Test against native conversion. 
We disable testing on all `x86` because of + // rounding bugs with `i686`. `powerpc` also has the same rounding bug. + if f0 != f1 && !cfg!(any( + target_arch = "x86", + target_arch = "powerpc", + target_arch = "powerpc64" + )) { + panic!( + "{}({}): std: {}, builtins: {}", + stringify!($fn), + x, + f0, + f1, + ); + } + }); + )* + }; +} + +#[test] +fn int_to_float() { + use compiler_builtins::float::conv::{ + __floatdidf, __floatdisf, __floatsidf, __floatsisf, __floattidf, __floattisf, + __floatundidf, __floatundisf, __floatunsidf, __floatunsisf, __floatuntidf, __floatuntisf, + }; + use compiler_builtins::int::Int; + + i_to_f!( + u32, f32, __floatunsisf; + u32, f64, __floatunsidf; + i32, f32, __floatsisf; + i32, f64, __floatsidf; + u64, f32, __floatundisf; + u64, f64, __floatundidf; + i64, f32, __floatdisf; + i64, f64, __floatdidf; + u128, f32, __floatuntisf; + u128, f64, __floatuntidf; + i128, f32, __floattisf; + i128, f64, __floattidf; + ); +} + +macro_rules! f_to_i { + ($x:ident, $($f:ty, $fn:ident);*;) => { + $( + // it is undefined behavior in the first place to do conversions with NaNs + if !$x.is_nan() { + let conv0 = $x as $f; + let conv1: $f = $fn($x); + if conv0 != conv1 { + panic!("{}({}): std: {}, builtins: {}", stringify!($fn), $x, conv0, conv1); + } + } + )* + }; +} + +#[test] +fn float_to_int() { + use compiler_builtins::float::conv::{ + __fixdfdi, __fixdfsi, __fixdfti, __fixsfdi, __fixsfsi, __fixsfti, __fixunsdfdi, + __fixunsdfsi, __fixunsdfti, __fixunssfdi, __fixunssfsi, __fixunssfti, + }; + + fuzz_float(N, |x: f32| { + f_to_i!(x, + u32, __fixunssfsi; + u64, __fixunssfdi; + u128, __fixunssfti; + i32, __fixsfsi; + i64, __fixsfdi; + i128, __fixsfti; + ); + }); + fuzz_float(N, |x: f64| { + f_to_i!(x, + u32, __fixunsdfsi; + u64, __fixunsdfdi; + u128, __fixunsdfti; + i32, __fixdfsi; + i64, __fixdfdi; + i128, __fixdfti; + ); + }); +} diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs index 199fa9db..0007c15a 100644 --- a/testcrate/tests/div_rem.rs +++ b/testcrate/tests/div_rem.rs @@ -1,8 +1,9 @@ -use rand_xoshiro::rand_core::{RngCore, SeedableRng}; -use rand_xoshiro::Xoshiro128StarStar; - use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4}; -use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4}; +use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc}; +use testcrate::*; + +// Division algorithms have by far the nastiest and largest number of edge cases, and experience shows +// that sometimes 100_000 iterations of the random fuzzer is needed. /// Creates intensive test functions for division functions of a certain size macro_rules! test { @@ -16,14 +17,17 @@ macro_rules! test { ) => { #[test] fn $test_name() { - fn assert_invariants(lhs: $uX, rhs: $uX) { - let rem: &mut $uX = &mut 0; - let quo: $uX = $unsigned_name(lhs, rhs, Some(rem)); - let rem = *rem; + fuzz_2(N, |lhs, rhs| { + if rhs == 0 { + return; + } + + let mut rem: $uX = 0; + let quo: $uX = $unsigned_name(lhs, rhs, Some(&mut rem)); if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) { panic!( "unsigned division function failed with lhs:{} rhs:{} \ - expected:({}, {}) found:({}, {})", + std:({}, {}) builtins:({}, {})", lhs, rhs, lhs.wrapping_div(rhs), @@ -55,7 +59,7 @@ macro_rules! 
test { if incorrect_rem || lhs != rhs.wrapping_mul(quo).wrapping_add(rem) { panic!( "signed division function failed with lhs:{} rhs:{} \ - expected:({}, {}) found:({}, {})", + std:({}, {}) builtins:({}, {})", lhs, rhs, lhs.wrapping_div(rhs), @@ -64,70 +68,7 @@ macro_rules! test { rem ); } - } - - // Specially designed random fuzzer - let mut rng = Xoshiro128StarStar::seed_from_u64(0); - let mut lhs: $uX = 0; - let mut rhs: $uX = 0; - // all ones constant - let ones: $uX = !0; - // Alternating ones and zeros (e.x. 0b1010101010101010). This catches second-order - // problems that might occur for algorithms with two modes of operation (potentially - // there is some invariant that can be broken for large `duo` and maintained via - // alternating between modes, breaking the algorithm when it reaches the end). - let mut alt_ones: $uX = 1; - for _ in 0..($n / 2) { - alt_ones <<= 2; - alt_ones |= 1; - } - // creates a mask for indexing the bits of the type - let bit_indexing_mask = $n - 1; - for _ in 0..1_000_000 { - // Randomly OR, AND, and XOR randomly sized and shifted continuous strings of - // ones with `lhs` and `rhs`. This results in excellent fuzzing entropy such as: - // lhs:10101010111101000000000100101010 rhs: 1010101010000000000000001000001 - // lhs:10101010111101000000000101001010 rhs: 1010101010101010101010100010100 - // lhs:10101010111101000000000101001010 rhs:11101010110101010101010100001110 - // lhs:10101010000000000000000001001010 rhs:10100010100000000000000000001010 - // lhs:10101010000000000000000001001010 rhs: 10101010101010101000 - // lhs:10101010000000000000000001100000 rhs:11111111111101010101010101001111 - // lhs:10101010000000101010101011000000 rhs:11111111111101010101010100000111 - // lhs:10101010101010101010101011101010 rhs: 1010100000000000000 - // lhs:11111111110101101010101011010111 rhs: 1010100000000000000 - // The msb is set half of the time by the fuzzer, but `assert_invariants` tests - // both the signed and unsigned functions. - let r0: u32 = bit_indexing_mask & rng.next_u32(); - let r1: u32 = bit_indexing_mask & rng.next_u32(); - let mask = ones.wrapping_shr(r0).rotate_left(r1); - match rng.next_u32() % 8 { - 0 => lhs |= mask, - 1 => lhs &= mask, - // both 2 and 3 to make XORs as common as ORs and ANDs combined, otherwise - // the entropy gets destroyed too often - 2 | 3 => lhs ^= mask, - 4 => rhs |= mask, - 5 => rhs &= mask, - _ => rhs ^= mask, - } - // do the same for alternating ones and zeros - let r0: u32 = bit_indexing_mask & rng.next_u32(); - let r1: u32 = bit_indexing_mask & rng.next_u32(); - let mask = alt_ones.wrapping_shr(r0).rotate_left(r1); - match rng.next_u32() % 8 { - 0 => lhs |= mask, - 1 => lhs &= mask, - // both 2 and 3 to make XORs as common as ORs and ANDs combined, otherwise - // the entropy gets destroyed too often - 2 | 3 => lhs ^= mask, - 4 => rhs |= mask, - 5 => rhs &= mask, - _ => rhs ^= mask, - } - if rhs != 0 { - assert_invariants(lhs, rhs); - } - } + }); } }; } @@ -135,3 +76,61 @@ macro_rules! 
test {
 test!(32, u32, i32, div_rem_si4, __udivmodsi4, __divmodsi4);
 test!(64, u64, i64, div_rem_di4, __udivmoddi4, __divmoddi4);
 test!(128, u128, i128, div_rem_ti4, __udivmodti4, __divmodti4);
+
+#[test]
+fn divide_sparc() {
+    fuzz_2(N, |lhs, rhs| {
+        if rhs == 0 {
+            return;
+        }
+
+        let mut rem: u128 = 0;
+        let quo: u128 = u128_divide_sparc(lhs, rhs, &mut rem);
+        if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) {
+            panic!(
+                "u128_divide_sparc({}, {}): \
+                std:({}, {}), builtins:({}, {})",
+                lhs,
+                rhs,
+                lhs.wrapping_div(rhs),
+                lhs.wrapping_rem(rhs),
+                quo,
+                rem
+            );
+        }
+    });
+}
+
+macro_rules! float {
+    ($($i:ty, $fn:ident);*;) => {
+        $(
+            fuzz_float_2(N, |x: $i, y: $i| {
+                let quo0 = x / y;
+                let quo1: $i = $fn(x, y);
+                // division of subnormals is not currently handled
+                if !(Float::is_subnormal(&quo0) || Float::is_subnormal(&quo1)) {
+                    if !Float::eq_repr(quo0, quo1) {
+                        panic!(
+                            "{}({}, {}): std: {}, builtins: {}",
+                            stringify!($fn), x, y, quo0, quo1
+                        );
+                    }
+                }
+            });
+        )*
+    };
+}
+
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[test]
+fn float_div() {
+    use compiler_builtins::float::{
+        div::{__divdf3, __divsf3},
+        Float,
+    };
+
+    float!(
+        f32, __divsf3;
+        f64, __divdf3;
+    );
+}
diff --git a/testcrate/tests/leading_zeros.rs b/testcrate/tests/leading_zeros.rs
deleted file mode 100644
index b857d9e0..00000000
--- a/testcrate/tests/leading_zeros.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-use rand_xoshiro::rand_core::{RngCore, SeedableRng};
-use rand_xoshiro::Xoshiro128StarStar;
-
-use compiler_builtins::int::__clzsi2;
-use compiler_builtins::int::leading_zeros::{
-    usize_leading_zeros_default, usize_leading_zeros_riscv,
-};
-
-#[test]
-fn __clzsi2_test() {
-    // Binary fuzzer. We cannot just send a random number directly to `__clzsi2()`, because we need
-    // large sequences of zeros to test. This XORs, ANDs, and ORs random length strings of 1s to
-    // `x`. ORs insure sequences of ones, ANDs insures sequences of zeros, and XORs are not often
-    // destructive but add entropy.
-    let mut rng = Xoshiro128StarStar::seed_from_u64(0);
-    let mut x = 0usize;
-    // creates a mask for indexing the bits of the type
-    let bit_indexing_mask = usize::MAX.count_ones() - 1;
-    // 10000 iterations is enough to make sure edge cases like single set bits are tested and to go
-    // through many paths.
-    for _ in 0..10_000 {
-        let r0 = bit_indexing_mask & rng.next_u32();
-        // random length of ones
-        let ones: usize = !0 >> r0;
-        let r1 = bit_indexing_mask & rng.next_u32();
-        // random circular shift
-        let mask = ones.rotate_left(r1);
-        match rng.next_u32() % 4 {
-            0 => x |= mask,
-            1 => x &= mask,
-            // both 2 and 3 to make XORs as common as ORs and ANDs combined
-            _ => x ^= mask,
-        }
-        let lz = x.leading_zeros() as usize;
-        let lz0 = __clzsi2(x);
-        let lz1 = usize_leading_zeros_default(x);
-        let lz2 = usize_leading_zeros_riscv(x);
-        if lz0 != lz {
-            panic!("__clzsi2({}): expected: {}, found: {}", x, lz, lz0);
-        }
-        if lz1 != lz {
-            panic!(
-                "usize_leading_zeros_default({}): expected: {}, found: {}",
-                x, lz, lz1
-            );
-        }
-        if lz2 != lz {
-            panic!(
-                "usize_leading_zeros_riscv({}): expected: {}, found: {}",
-                x, lz, lz2
-            );
-        }
-    }
-}
diff --git a/testcrate/tests/misc.rs b/testcrate/tests/misc.rs
new file mode 100644
index 00000000..d31e3e6b
--- /dev/null
+++ b/testcrate/tests/misc.rs
@@ -0,0 +1,134 @@
+use testcrate::*;
+
+/// Make sure that the edge case tester and randomized tester don't break, and list examples of
+/// fuzz values for documentation purposes.
+#[test] +fn fuzz_values() { + const VALS: [u16; 47] = [ + 0b0, // edge cases + 0b1111111111111111, + 0b1111111111111110, + 0b1111111111111100, + 0b1111111110000000, + 0b1111111100000000, + 0b1110000000000000, + 0b1100000000000000, + 0b1000000000000000, + 0b111111111111111, + 0b111111111111110, + 0b111111111111100, + 0b111111110000000, + 0b111111100000000, + 0b110000000000000, + 0b100000000000000, + 0b11111111111111, + 0b11111111111110, + 0b11111111111100, + 0b11111110000000, + 0b11111100000000, + 0b10000000000000, + 0b111111111, + 0b111111110, + 0b111111100, + 0b110000000, + 0b100000000, + 0b11111111, + 0b11111110, + 0b11111100, + 0b10000000, + 0b111, + 0b110, + 0b100, + 0b11, + 0b10, + 0b1, + 0b1010110100000, // beginning of random fuzzing + 0b1100011001011010, + 0b1001100101001111, + 0b1101010100011010, + 0b100010001, + 0b1000000000000000, + 0b1100000000000101, + 0b1100111101010101, + 0b1100010111111111, + 0b1111110101111111, + ]; + let mut i = 0; + fuzz(10, |x: u16| { + assert_eq!(x, VALS[i]); + i += 1; + }); +} + +#[test] +fn leading_zeros() { + use compiler_builtins::int::__clzsi2; + use compiler_builtins::int::leading_zeros::{ + usize_leading_zeros_default, usize_leading_zeros_riscv, + }; + fuzz(N, |x: usize| { + let lz = x.leading_zeros() as usize; + let lz0 = __clzsi2(x); + let lz1 = usize_leading_zeros_default(x); + let lz2 = usize_leading_zeros_riscv(x); + if lz0 != lz { + panic!("__clzsi2({}): std: {}, builtins: {}", x, lz, lz0); + } + if lz1 != lz { + panic!( + "usize_leading_zeros_default({}): std: {}, builtins: {}", + x, lz, lz1 + ); + } + if lz2 != lz { + panic!( + "usize_leading_zeros_riscv({}): std: {}, builtins: {}", + x, lz, lz2 + ); + } + }) +} + +#[test] +fn float_extend() { + fuzz_float(N, |x: f32| { + let tmp0 = x as f64; + let tmp1: f64 = compiler_builtins::float::extend::__extendsfdf2(x); + if !compiler_builtins::float::Float::eq_repr(tmp0, tmp1) { + panic!("__extendsfdf2({}): std: {}, builtins: {}", x, tmp0, tmp1); + } + }); +} + +// This doesn't quite work because of issues related to +// https://github.com/rust-lang/rust/issues/73920. +// TODO how do we resolve this? +/* +macro_rules! pow { + ($($f:ty, $fn:ident);*;) => { + $( + fuzz_float_2(N, |x: $f, y: $f| { + let n = y as i32; + let tmp0: $f = x.powi(n); + let tmp1: $f = $fn(x, n); + if tmp0 != tmp1 { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn), x, y, tmp0, tmp1 + ); + } + }); + )* + }; +} + +#[test] +fn float_pow() { + use compiler_builtins::float::pow::{__powidf2, __powisf2}; + + pow!( + f32, __powisf2; + f64, __powidf2; + ); +} +*/ diff --git a/testcrate/tests/mul.rs b/testcrate/tests/mul.rs new file mode 100644 index 00000000..8b97ea46 --- /dev/null +++ b/testcrate/tests/mul.rs @@ -0,0 +1,114 @@ +use testcrate::*; + +macro_rules! mul { + ($($i:ty, $fn:ident);*;) => { + $( + fuzz_2(N, |x: $i, y: $i| { + let mul0 = x.wrapping_mul(y); + let mul1: $i = $fn(x, y); + if mul0 != mul1 { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn), x, y, mul0, mul1 + ); + } + }); + )* + }; +} + +#[test] +fn mul() { + use compiler_builtins::int::mul::{__muldi3, __multi3}; + + mul!( + u64, __muldi3; + i128, __multi3; + ); +} + +macro_rules! 
overflowing_mul { + ($($i:ty, $fn:ident);*;) => { + $( + fuzz_2(N, |x: $i, y: $i| { + let (mul0, o0) = x.overflowing_mul(y); + let mut o1 = 0i32; + let mul1: $i = $fn(x, y, &mut o1); + let o1 = o1 != 0; + if mul0 != mul1 || o0 != o1 { + panic!( + "{}({}, {}): std: ({}, {}), builtins: ({}, {})", + stringify!($fn), x, y, mul0, o0, mul1, o1 + ); + } + }); + )* + }; +} + +#[test] +fn overflowing_mul() { + use compiler_builtins::int::mul::{ + __mulodi4, __mulosi4, __muloti4, __rust_i128_mulo, __rust_u128_mulo, + }; + + overflowing_mul!( + i32, __mulosi4; + i64, __mulodi4; + i128, __muloti4; + ); + fuzz_2(N, |x: u128, y: u128| { + let (mul0, o0) = x.overflowing_mul(y); + let (mul1, o1) = __rust_u128_mulo(x, y); + if mul0 != mul1 || o0 != o1 { + panic!( + "__rust_u128_mulo({}, {}): std: ({}, {}), builtins: ({}, {})", + x, y, mul0, o0, mul1, o1 + ); + } + let x = x as i128; + let y = y as i128; + let (mul0, o0) = x.overflowing_mul(y); + let (mul1, o1) = __rust_i128_mulo(x, y); + if mul0 != mul1 || o0 != o1 { + panic!( + "__rust_i128_mulo({}, {}): std: ({}, {}), builtins: ({}, {})", + x, y, mul0, o0, mul1, o1 + ); + } + }); +} + +macro_rules! float_mul { + ($($f:ty, $fn:ident);*;) => { + $( + fuzz_float_2(N, |x: $f, y: $f| { + let mul0 = x * y; + let mul1: $f = $fn(x, y); + // multiplication of subnormals is not currently handled + if !(Float::is_subnormal(&mul0) || Float::is_subnormal(&mul1)) { + if !Float::eq_repr(mul0, mul1) { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn), x, y, mul0, mul1 + ); + } + } + }); + )* + }; +} + +#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))] +#[test] +fn float_mul() { + use compiler_builtins::float::{ + mul::{__muldf3, __mulsf3}, + Float, + }; + + float_mul!( + f32, __mulsf3; + f64, __muldf3; + ); +} diff --git a/testcrate/tests/shift.rs b/testcrate/tests/shift.rs new file mode 100644 index 00000000..ecb13a13 --- /dev/null +++ b/testcrate/tests/shift.rs @@ -0,0 +1,60 @@ +use testcrate::*; + +macro_rules! shift { + ($($i:ty, $fn_std:ident, $fn_builtins:ident);*;) => { + $( + fuzz_shift(|x: $i, s: u32| { + let tmp0: $i = x.$fn_std(s); + let tmp1: $i = $fn_builtins(x, s); + if tmp0 != tmp1 { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn_builtins), x, s, tmp0, tmp1 + ); + } + }); + )* + }; +} + +macro_rules! overflowing_shift { + ($($i:ty, $fn_std:ident, $fn_builtins:ident);*;) => { + $( + fuzz_shift(|x: $i, s: u32| { + let tmp0: $i = x.$fn_std(s); + let (tmp1, o1): ($i, bool) = $fn_builtins(x, s.into()); + if tmp0 != tmp1 || o1 { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn_builtins), x, s, tmp0, tmp1 + ); + } + }); + )* + }; +} + +#[test] +fn shift() { + use compiler_builtins::int::shift::{ + __ashldi3, __ashlsi3, __ashlti3, __ashrdi3, __ashrsi3, __ashrti3, __lshrdi3, __lshrsi3, + __lshrti3, __rust_i128_shlo, __rust_i128_shro, __rust_u128_shlo, __rust_u128_shro, + }; + shift!( + u32, wrapping_shl, __ashlsi3; + u64, wrapping_shl, __ashldi3; + u128, wrapping_shl, __ashlti3; + i32, wrapping_shr, __ashrsi3; + i64, wrapping_shr, __ashrdi3; + i128, wrapping_shr, __ashrti3; + u32, wrapping_shr, __lshrsi3; + u64, wrapping_shr, __lshrdi3; + u128, wrapping_shr, __lshrti3; + ); + overflowing_shift!( + u128, wrapping_shl, __rust_u128_shlo; + i128, wrapping_shl, __rust_i128_shlo; + u128, wrapping_shr, __rust_u128_shro; + i128, wrapping_shr, __rust_i128_shro; + ); +}
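As a rough, self-contained illustration of the std-vs-builtins comparison pattern the shift tests above rely on (this sketch is not part of the patch series; `naive_ashl`, the fixed operand list, and the exhaustive loop over shift amounts are assumptions made purely for illustration), the following standalone Rust program checks a naive shift routine against `wrapping_shl`:

// Sketch only: `naive_ashl` stands in for a shift intrinsic such as __ashlti3.
fn naive_ashl(x: u128, s: u32) -> u128 {
    // Shift one bit at a time so the check does not itself depend on `<<` with the
    // full shift amount.
    let mut r = x;
    for _ in 0..s {
        r <<= 1;
    }
    r
}

fn main() {
    // A few edge-case operands plus every in-range shift amount.
    let edge_cases: [u128; 5] = [0, 1, u64::MAX as u128, 1 << 127, u128::MAX];
    for &x in &edge_cases {
        for s in 0..128u32 {
            let std_result = x.wrapping_shl(s);
            let sketch_result = naive_ashl(x, s);
            assert_eq!(
                std_result, sketch_result,
                "naive_ashl({:#x}, {}): std: {:#x}, sketch: {:#x}",
                x, s, std_result, sketch_result
            );
        }
    }
    println!("all shift comparisons agree");
}

The real tests replace the naive routine with the intrinsics under test (for example `__ashlti3`) and draw operands from the edge-case-biased fuzzers in `testcrate` rather than from a fixed list.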