From 57fdf6a5eadcb5312aae0dfc3148aad225feba72 Mon Sep 17 00:00:00 2001
From: Elliott Mahler <join.together@gmail.com>
Date: Mon, 23 Mar 2020 22:03:22 -0700
Subject: [PATCH 1/7] Use the addcarry intrinsic when available

---
 src/algorithms.rs | 101 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 96 insertions(+), 5 deletions(-)

diff --git a/src/algorithms.rs b/src/algorithms.rs
index c65f3b4f..dd335eaf 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -17,6 +17,7 @@ use crate::big_digit::{self, BigDigit, DoubleBigDigit, SignedDoubleBigDigit};
 // Generic functions for add/subtract/multiply with carry/borrow:
 
 // Add with carry:
+#[allow(unused)]
 #[inline]
 fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
     *acc += DoubleBigDigit::from(a);
@@ -27,6 +28,7 @@ fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
 }
 
 // Subtract with borrow:
+#[allow(unused)]
 #[inline]
 fn sbb(a: BigDigit, b: BigDigit, acc: &mut SignedDoubleBigDigit) -> BigDigit {
     *acc += SignedDoubleBigDigit::from(a);
@@ -132,6 +134,41 @@ pub(crate) fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {
 /// the addition first hoping that it will fit.
 ///
 /// The caller _must_ ensure that `a` is at least as long as `b`.
+#[cfg(all(u64_digit, target_arch = "x86_64"))] // only run on x86_64, when we have u64 digits
+#[inline]
+pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
+    debug_assert!(a.len() >= b.len());
+
+    use std::arch::x86_64::_addcarry_u64;
+
+    let mut carry = 0;
+    let (a_lo, a_hi) = a.split_at_mut(b.len());
+
+    for (a, b) in a_lo.iter_mut().zip(b) {
+        // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+        carry = unsafe { _addcarry_u64(carry, *a, *b, a) };
+    }
+
+    if carry != 0 {
+        for a in a_hi {
+            // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+            carry = unsafe { _addcarry_u64(carry, *a, 0, a) };
+            if carry == 0 {
+                break;
+            }
+        }
+    }
+
+    carry as BigDigit
+}
+
+/// Two argument addition of raw slices, `a += b`, returning the carry.
+///
+/// This is used when the data `Vec` might need to resize to push a non-zero carry, so we perform
+/// the addition first hoping that it will fit.
+///
+/// The caller _must_ ensure that `a` is at least as long as `b`.
+#[cfg(not(all(u64_digit, target_arch = "x86_64")))] // run if we aren't using 64-bit digits, or if we're not running on x86_64
 #[inline]
 pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
     debug_assert!(a.len() >= b.len());
@@ -166,6 +203,39 @@ pub(crate) fn add2(a: &mut [BigDigit], b: &[BigDigit]) {
     debug_assert!(carry == 0);
 }
 
+#[cfg(all(u64_digit, target_arch = "x86_64"))] // only run on x86_64, when we have u64 digits
+pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
+    use std::arch::x86_64::_subborrow_u64;
+
+    let mut borrow = 0;
+
+    let len = cmp::min(a.len(), b.len());
+    let (a_lo, a_hi) = a.split_at_mut(len);
+    let (b_lo, b_hi) = b.split_at(len);
+
+    for (a, b) in a_lo.iter_mut().zip(b_lo) {
+        // Safety: There are absolutely no safety concerns with calling _subborrow_u64, it's just unsafe for API consistency with other intrinsics
+        borrow = unsafe { _subborrow_u64(borrow, *a, *b, a) };
+    }
+
+    if borrow != 0 {
+        for a in a_hi {
+            // Safety: There are absolutely no safety concerns with calling _subborrow_u64, it's just unsafe for API consistency with other intrinsics
+            borrow = unsafe { _subborrow_u64(borrow, *a, 0, a) };
+            if borrow == 0 {
+                break;
+            }
+        }
+    }
+
+    // note: we're _required_ to fail on underflow
+    assert!(
+        borrow == 0 && b_hi.iter().all(|x| *x == 0),
+        "Cannot subtract b from a because b is larger than a."
+    );
+}
+
+#[cfg(not(all(u64_digit, target_arch = "x86_64")))] // run if we aren't using 64-bit digits, or if we're not running on x86_64
 pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
     let mut borrow = 0;
 
@@ -194,6 +264,24 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
 }
 
 // Only for the Sub impl. `a` and `b` must have same length.
+#[cfg(all(u64_digit, target_arch = "x86_64"))] // only run on x86_64, when we have u64 digits
+#[inline]
+pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
+    use std::arch::x86_64::_subborrow_u64;
+    debug_assert!(b.len() == a.len());
+
+    let mut borrow = 0;
+
+    for (ai, bi) in a.iter().zip(b) {
+        // Safety: There are absolutely no safety concerns with calling _subborrow_u64, it's just unsafe for API consistency with other intrinsics
+        borrow = unsafe { _subborrow_u64(borrow, *ai, *bi, bi) };
+    }
+
+    borrow as BigDigit
+}
+
+// Only for the Sub impl. `a` and `b` must have same length.
+#[cfg(not(all(u64_digit, target_arch = "x86_64")))] // run if we aren't using 64-bit digits, or if we're not running on x86_64
 #[inline]
 pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
     debug_assert!(b.len() == a.len());
@@ -259,11 +347,14 @@ pub(crate) fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) {
         *a = mac_with_carry(*a, b, c, &mut carry);
     }
 
-    let mut a = a_hi.iter_mut();
-    while carry != 0 {
-        let a = a.next().expect("carry overflow during multiplication!");
-        *a = adc(*a, 0, &mut carry);
-    }
+    let (carry_hi, carry_lo) = big_digit::from_doublebigdigit(carry);
+
+    let final_carry = if carry_hi == 0 {
+        __add2(a_hi, &[carry_lo])
+    } else {
+        __add2(a_hi, &[carry_hi, carry_lo])
+    };
+    assert_eq!(final_carry, 0, "carry overflow during multiplication!");
 }
 
 fn bigint_from_slice(slice: &[BigDigit]) -> BigInt {

From 4e20fc3a13eedb9d3654b9f38c19050de3b7d098 Mon Sep 17 00:00:00 2001
From: Elliott Mahler <join.together@gmail.com>
Date: Mon, 23 Mar 2020 22:42:33 -0700
Subject: [PATCH 2/7] Include intrinsics from core::arch, not std::arch

---
 src/algorithms.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/algorithms.rs b/src/algorithms.rs
index dd335eaf..6b8f8a6d 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -139,7 +139,7 @@ pub(crate) fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {
 pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
     debug_assert!(a.len() >= b.len());
 
-    use std::arch::x86_64::_addcarry_u64;
+    use core::arch::x86_64::_addcarry_u64;
 
     let mut carry = 0;
     let (a_lo, a_hi) = a.split_at_mut(b.len());
@@ -205,7 +205,7 @@ pub(crate) fn add2(a: &mut [BigDigit], b: &[BigDigit]) {
 
 #[cfg(all(u64_digit, target_arch = "x86_64"))] // only run on x86_64, when we have u64 digits
 pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
-    use std::arch::x86_64::_subborrow_u64;
+    use core::arch::x86_64::_subborrow_u64;
 
     let mut borrow = 0;
 
@@ -267,7 +267,7 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
 #[cfg(all(u64_digit, target_arch = "x86_64"))] // only run on x86_64, when we have u64 digits
 #[inline]
 pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
-    use std::arch::x86_64::_subborrow_u64;
+    use core::arch::x86_64::_subborrow_u64;
     debug_assert!(b.len() == a.len());
 
     let mut borrow = 0;

From 0be00d81474127a0c0e801a995e12b6cf799b5f7 Mon Sep 17 00:00:00 2001
From: Elliott Mahler <join.together@gmail.com>
Date: Wed, 25 Mar 2020 22:32:15 -0700
Subject: [PATCH 3/7] Moved the platform-specific code to adc and sbb, added a
 build.rs entry

---
 build.rs          |   7 ++-
 src/algorithms.rs | 138 +++++++++++-----------------------------------
 2 files changed, 38 insertions(+), 107 deletions(-)

diff --git a/build.rs b/build.rs
index 93ee0998..4952abd8 100644
--- a/build.rs
+++ b/build.rs
@@ -6,7 +6,8 @@ use std::path::Path;
 
 fn main() {
     let pointer_width = env::var("CARGO_CFG_TARGET_POINTER_WIDTH");
-    if pointer_width.as_ref().map(String::as_str) == Ok("64") {
+    let u64_digit = pointer_width.as_ref().map(String::as_str) == Ok("64");
+    if u64_digit {
         autocfg::emit("u64_digit");
     }
     let ac = autocfg::new();
@@ -14,6 +15,10 @@ fn main() {
         autocfg::emit("has_try_from");
     }
 
+    if u64_digit && (ac.probe_path("core::arch::x86_64::_addcarry_u64") || ac.probe_path("std::arch::x86_64::_addcarry_u64")) {
+        autocfg::emit("use_addcarry_u64");
+    }
+
     autocfg::rerun_path("build.rs");
 
     write_radix_bases().unwrap();
diff --git a/src/algorithms.rs b/src/algorithms.rs
index 6b8f8a6d..38196549 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -12,30 +12,42 @@ use crate::bigint::BigInt;
 use crate::bigint::Sign;
 use crate::bigint::Sign::{Minus, NoSign, Plus};
 
-use crate::big_digit::{self, BigDigit, DoubleBigDigit, SignedDoubleBigDigit};
+use crate::big_digit::{self, BigDigit, DoubleBigDigit};
+
+#[cfg(not(use_addcarry_u64))] // only needed for the fallback implementation of `sbb`
+use crate::big_digit::SignedDoubleBigDigit;
 
 // Generic functions for add/subtract/multiply with carry/borrow:
 
 // Add with carry:
-#[allow(unused)]
+#[cfg(use_addcarry_u64)]
 #[inline]
-fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
-    *acc += DoubleBigDigit::from(a);
-    *acc += DoubleBigDigit::from(b);
-    let lo = *acc as BigDigit;
-    *acc >>= big_digit::BITS;
-    lo
+fn adc(carry: u8, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> u8 {
+    unsafe { core::arch::x86_64::_addcarry_u64(carry, a, b, out) }
+}
+
+#[cfg(not(use_addcarry_u64))] // fallback for environments where we don't have an addcarry intrinsic
+#[inline]
+fn adc(mut carry: DoubleBigDigit, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> DoubleBigDigit {
+    carry += DoubleBigDigit::from(a);
+    carry += DoubleBigDigit::from(b);
+    *out = carry as BigDigit;
+    carry >> big_digit::BITS
 }
 
 // Subtract with borrow:
-#[allow(unused)]
+#[cfg(use_addcarry_u64)]
 #[inline]
-fn sbb(a: BigDigit, b: BigDigit, acc: &mut SignedDoubleBigDigit) -> BigDigit {
-    *acc += SignedDoubleBigDigit::from(a);
-    *acc -= SignedDoubleBigDigit::from(b);
-    let lo = *acc as BigDigit;
-    *acc >>= big_digit::BITS;
-    lo
+fn sbb(carry: u8, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> u8 {
+    unsafe { core::arch::x86_64::_subborrow_u64(carry, a, b, out) }
+}
+#[cfg(not(use_addcarry_u64))] // fallback for environments where we don't have an addcarry intrinsic
+#[inline]
+fn sbb(mut carry: SignedDoubleBigDigit, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> SignedDoubleBigDigit {
+    carry += SignedDoubleBigDigit::from(a);
+    carry -= SignedDoubleBigDigit::from(b);
+    *out = carry as BigDigit;
+    carry >> big_digit::BITS
 }
 
 #[inline]
@@ -134,41 +146,6 @@ pub(crate) fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {
 /// the addition first hoping that it will fit.
 ///
 /// The caller _must_ ensure that `a` is at least as long as `b`.
-#[cfg(all(u64_digit, target_arch = "x86_64"))] // only run on x86_64, when we have u64 digits
-#[inline]
-pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
-    debug_assert!(a.len() >= b.len());
-
-    use core::arch::x86_64::_addcarry_u64;
-
-    let mut carry = 0;
-    let (a_lo, a_hi) = a.split_at_mut(b.len());
-
-    for (a, b) in a_lo.iter_mut().zip(b) {
-        // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
-        carry = unsafe { _addcarry_u64(carry, *a, *b, a) };
-    }
-
-    if carry != 0 {
-        for a in a_hi {
-            // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
-            carry = unsafe { _addcarry_u64(carry, *a, 0, a) };
-            if carry == 0 {
-                break;
-            }
-        }
-    }
-
-    carry as BigDigit
-}
-
-/// Two argument addition of raw slices, `a += b`, returning the carry.
-///
-/// This is used when the data `Vec` might need to resize to push a non-zero carry, so we perform
-/// the addition first hoping that it will fit.
-///
-/// The caller _must_ ensure that `a` is at least as long as `b`.
-#[cfg(not(all(u64_digit, target_arch = "x86_64")))] // run if we aren't using 64-bit digits, or if we're not running on x86_64
 #[inline]
 pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
     debug_assert!(a.len() >= b.len());
@@ -177,12 +154,12 @@ pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
     let (a_lo, a_hi) = a.split_at_mut(b.len());
 
     for (a, b) in a_lo.iter_mut().zip(b) {
-        *a = adc(*a, *b, &mut carry);
+        carry = adc(carry, *a, *b, a);
     }
 
     if carry != 0 {
         for a in a_hi {
-            *a = adc(*a, 0, &mut carry);
+            carry = adc(carry, *a, 0, a);
             if carry == 0 {
                 break;
             }
@@ -203,39 +180,6 @@ pub(crate) fn add2(a: &mut [BigDigit], b: &[BigDigit]) {
     debug_assert!(carry == 0);
 }
 
-#[cfg(all(u64_digit, target_arch = "x86_64"))] // only run on x86_64, when we have u64 digits
-pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
-    use core::arch::x86_64::_subborrow_u64;
-
-    let mut borrow = 0;
-
-    let len = cmp::min(a.len(), b.len());
-    let (a_lo, a_hi) = a.split_at_mut(len);
-    let (b_lo, b_hi) = b.split_at(len);
-
-    for (a, b) in a_lo.iter_mut().zip(b_lo) {
-        // Safety: There are absolutely no safety concerns with calling _subborrow_u64, it's just unsafe for API consistency with other intrinsics
-        borrow = unsafe { _subborrow_u64(borrow, *a, *b, a) };
-    }
-
-    if borrow != 0 {
-        for a in a_hi {
-            // Safety: There are absolutely no safety concerns with calling _subborrow_u64, it's just unsafe for API consistency with other intrinsics
-            borrow = unsafe { _subborrow_u64(borrow, *a, 0, a) };
-            if borrow == 0 {
-                break;
-            }
-        }
-    }
-
-    // note: we're _required_ to fail on underflow
-    assert!(
-        borrow == 0 && b_hi.iter().all(|x| *x == 0),
-        "Cannot subtract b from a because b is larger than a."
-    );
-}
-
-#[cfg(not(all(u64_digit, target_arch = "x86_64")))] // run if we aren't using 64-bit digits, or if we're not running on x86_64
 pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
     let mut borrow = 0;
 
@@ -244,12 +188,12 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
     let (b_lo, b_hi) = b.split_at(len);
 
     for (a, b) in a_lo.iter_mut().zip(b_lo) {
-        *a = sbb(*a, *b, &mut borrow);
+        borrow = sbb(borrow, *a, *b, a);
     }
 
     if borrow != 0 {
         for a in a_hi {
-            *a = sbb(*a, 0, &mut borrow);
+            borrow = sbb(borrow, *a, 0, a);
             if borrow == 0 {
                 break;
             }
@@ -264,24 +208,6 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
 }
 
 // Only for the Sub impl. `a` and `b` must have same length.
-#[cfg(all(u64_digit, target_arch = "x86_64"))] // only run on x86_64, when we have u64 digits
-#[inline]
-pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
-    use core::arch::x86_64::_subborrow_u64;
-    debug_assert!(b.len() == a.len());
-
-    let mut borrow = 0;
-
-    for (ai, bi) in a.iter().zip(b) {
-        // Safety: There are absolutely no safety concerns with calling _subborrow_u64, it's just unsafe for API consistency with other intrinsics
-        borrow = unsafe { _subborrow_u64(borrow, *ai, *bi, bi) };
-    }
-
-    borrow as BigDigit
-}
-
-// Only for the Sub impl. `a` and `b` must have same length.
-#[cfg(not(all(u64_digit, target_arch = "x86_64")))] // run if we aren't using 64-bit digits, or if we're not running on x86_64
 #[inline]
 pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
     debug_assert!(b.len() == a.len());
@@ -289,7 +215,7 @@ pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
     let mut borrow = 0;
 
     for (ai, bi) in a.iter().zip(b) {
-        *bi = sbb(*ai, *bi, &mut borrow);
+        borrow = sbb(borrow, *ai, *bi, bi);
     }
 
     borrow as BigDigit

From 49ff7b745c22d3c973876ba8ea6a211b724acc5b Mon Sep 17 00:00:00 2001
From: Elliott Mahler <join.together@gmail.com>
Date: Wed, 25 Mar 2020 23:16:08 -0700
Subject: [PATCH 4/7] Backed out adc and sbb parameter rearrangement,
 implemented u32 for x64, applied rustfmt

---
 build.rs          | 10 ++++++-
 src/algorithms.rs | 73 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/build.rs b/build.rs
index 4952abd8..0aea8629 100644
--- a/build.rs
+++ b/build.rs
@@ -15,8 +15,16 @@ fn main() {
         autocfg::emit("has_try_from");
     }
 
-    if u64_digit && (ac.probe_path("core::arch::x86_64::_addcarry_u64") || ac.probe_path("std::arch::x86_64::_addcarry_u64")) {
+    if u64_digit
+        && (ac.probe_path("core::arch::x86_64::_addcarry_u64")
+            || ac.probe_path("std::arch::x86_64::_addcarry_u64"))
+    {
         autocfg::emit("use_addcarry_u64");
+    } else if !u64_digit
+        && (ac.probe_path("core::arch::x86_64::_addcarry_u32")
+            || ac.probe_path("core::arch::x86::_addcarry_u32"))
+    {
+        autocfg::emit("use_addcarry_u32");
     }
 
     autocfg::rerun_path("build.rs");
diff --git a/src/algorithms.rs b/src/algorithms.rs
index 38196549..66f13d9e 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -14,40 +14,67 @@ use crate::bigint::Sign::{Minus, NoSign, Plus};
 
 use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 
-#[cfg(not(use_addcarry_u64))] // only needed for the fallback implementation of `sbb`
+// only needed for the fallback implementation of `sbb`
+#[cfg(not(any(use_addcarry_u64, use_addcarry_u32)))]
 use crate::big_digit::SignedDoubleBigDigit;
 
-// Generic functions for add/subtract/multiply with carry/borrow:
+// Generic functions for add/subtract/multiply with carry/borrow. These are specialized for some platforms to take advantage of intrinsics etc
 
 // Add with carry:
 #[cfg(use_addcarry_u64)]
 #[inline]
-fn adc(carry: u8, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> u8 {
-    unsafe { core::arch::x86_64::_addcarry_u64(carry, a, b, out) }
+fn adc(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
+    let mut out = 0;
+    // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+    *acc = unsafe { core::arch::x86_64::_addcarry_u64(*acc, a, b, &mut out) };
+    out
 }
 
-#[cfg(not(use_addcarry_u64))] // fallback for environments where we don't have an addcarry intrinsic
+#[cfg(use_addcarry_u32)]
 #[inline]
-fn adc(mut carry: DoubleBigDigit, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> DoubleBigDigit {
-    carry += DoubleBigDigit::from(a);
-    carry += DoubleBigDigit::from(b);
-    *out = carry as BigDigit;
-    carry >> big_digit::BITS
+fn adc(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
+    let mut out = 0;
+    // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+    *acc = unsafe { core::arch::x86_64::_addcarry_u32(*acc, a, b, &mut out) };
+    out
+}
+
+#[cfg(not(any(use_addcarry_u64, use_addcarry_u32)))] // fallback for environments where we don't have an addcarry intrinsic
+#[inline]
+fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
+    *acc += DoubleBigDigit::from(a);
+    *acc += DoubleBigDigit::from(b);
+    let lo = *acc as BigDigit;
+    *acc >>= big_digit::BITS;
+    lo
 }
 
 // Subtract with borrow:
 #[cfg(use_addcarry_u64)]
 #[inline]
-fn sbb(carry: u8, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> u8 {
-    unsafe { core::arch::x86_64::_subborrow_u64(carry, a, b, out) }
+fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
+    let mut out = 0;
+    // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+    *acc = unsafe { core::arch::x86_64::_subborrow_u64(*acc, a, b, &mut out) };
+    out
 }
-#[cfg(not(use_addcarry_u64))] // fallback for environments where we don't have an addcarry intrinsic
+#[cfg(use_addcarry_u32)]
 #[inline]
-fn sbb(mut carry: SignedDoubleBigDigit, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> SignedDoubleBigDigit {
-    carry += SignedDoubleBigDigit::from(a);
-    carry -= SignedDoubleBigDigit::from(b);
-    *out = carry as BigDigit;
-    carry >> big_digit::BITS
+fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
+    let mut out = 0;
+    // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+    *acc = unsafe { core::arch::x86_64::_subborrow_u32(*acc, a, b, &mut out) };
+    out
+}
+
+#[cfg(not(any(use_addcarry_u64, use_addcarry_u32)))] // fallback for environments where we don't have an addcarry intrinsic
+#[inline]
+fn sbb(a: BigDigit, b: BigDigit, acc: &mut SignedDoubleBigDigit) -> BigDigit {
+    *acc += SignedDoubleBigDigit::from(a);
+    *acc -= SignedDoubleBigDigit::from(b);
+    let lo = *acc as BigDigit;
+    *acc >>= big_digit::BITS;
+    lo
 }
 
 #[inline]
@@ -154,12 +181,12 @@ pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
     let (a_lo, a_hi) = a.split_at_mut(b.len());
 
     for (a, b) in a_lo.iter_mut().zip(b) {
-        carry = adc(carry, *a, *b, a);
+        *a = adc(*a, *b, &mut carry);
     }
 
     if carry != 0 {
         for a in a_hi {
-            carry = adc(carry, *a, 0, a);
+            *a = adc(*a, 0, &mut carry);
             if carry == 0 {
                 break;
             }
@@ -188,12 +215,12 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
     let (b_lo, b_hi) = b.split_at(len);
 
     for (a, b) in a_lo.iter_mut().zip(b_lo) {
-        borrow = sbb(borrow, *a, *b, a);
+        *a = sbb(*a, *b, &mut borrow);
     }
 
     if borrow != 0 {
         for a in a_hi {
-            borrow = sbb(borrow, *a, 0, a);
+            *a = sbb(*a, 0, &mut borrow);
             if borrow == 0 {
                 break;
             }
@@ -215,7 +242,7 @@ pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
     let mut borrow = 0;
 
     for (ai, bi) in a.iter().zip(b) {
-        borrow = sbb(borrow, *ai, *bi, bi);
+        *bi = sbb(*ai, *bi, &mut borrow);
     }
 
     borrow as BigDigit

From 0cc50c9b18dcc429f3b666d04b2093835909da55 Mon Sep 17 00:00:00 2001
From: Elliott Mahler <join.together@gmail.com>
Date: Wed, 25 Mar 2020 23:16:36 -0700
Subject: [PATCH 5/7] fixed copy/paste error

---
 src/algorithms.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/algorithms.rs b/src/algorithms.rs
index 66f13d9e..d441fb55 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -34,7 +34,7 @@ fn adc(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
 #[inline]
 fn adc(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
     let mut out = 0;
-    // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+    // Safety: There are absolutely no safety concerns with calling _addcarry_u32, it's just unsafe for API consistency with other intrinsics
     *acc = unsafe { core::arch::x86_64::_addcarry_u32(*acc, a, b, &mut out) };
     out
 }
@@ -54,7 +54,7 @@ fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
 #[inline]
 fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
     let mut out = 0;
-    // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+    // Safety: There are absolutely no safety concerns with calling _subborrow_u64, it's just unsafe for API consistency with other intrinsics
     *acc = unsafe { core::arch::x86_64::_subborrow_u64(*acc, a, b, &mut out) };
     out
 }
@@ -62,7 +62,7 @@ fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
 #[inline]
 fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
     let mut out = 0;
-    // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
+    // Safety: There are absolutely no safety concerns with calling _subborrow_u32, it's just unsafe for API consistency with other intrinsics
     *acc = unsafe { core::arch::x86_64::_subborrow_u32(*acc, a, b, &mut out) };
     out
 }

From e3971e6558c2140ea09da13b1726e573969c7950 Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Fri, 30 Oct 2020 13:04:50 -0700
Subject: [PATCH 6/7] Unify addcarry probing for x86_64/x86

---
 build.rs          | 26 +++++++++++++++-----------
 src/algorithms.rs | 46 ++++++++++++++++++++++++++++++----------------
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/build.rs b/build.rs
index 0aea8629..3daed5e8 100644
--- a/build.rs
+++ b/build.rs
@@ -11,20 +11,24 @@ fn main() {
         autocfg::emit("u64_digit");
     }
     let ac = autocfg::new();
-    if ac.probe_path("std::convert::TryFrom") || ac.probe_path("core::convert::TryFrom") {
+    let std = if ac.probe_sysroot_crate("std") {
+        "std"
+    } else {
+        "core"
+    };
+    if ac.probe_path(&format!("{}::convert::TryFrom", std)) {
         autocfg::emit("has_try_from");
     }
 
-    if u64_digit
-        && (ac.probe_path("core::arch::x86_64::_addcarry_u64")
-            || ac.probe_path("std::arch::x86_64::_addcarry_u64"))
-    {
-        autocfg::emit("use_addcarry_u64");
-    } else if !u64_digit
-        && (ac.probe_path("core::arch::x86_64::_addcarry_u32")
-            || ac.probe_path("core::arch::x86::_addcarry_u32"))
-    {
-        autocfg::emit("use_addcarry_u32");
+    if let Ok(target_arch) = env::var("CARGO_CFG_TARGET_ARCH") {
+        if target_arch == "x86_64" || target_arch == "x86" {
+            let digit = if u64_digit { "u64" } else { "u32" };
+
+            let addcarry = format!("{}::arch::{}::_addcarry_{}", std, target_arch, digit);
+            if ac.probe_path(&addcarry) {
+                autocfg::emit("use_addcarry");
+            }
+        }
     }
 
     autocfg::rerun_path("build.rs");
diff --git a/src/algorithms.rs b/src/algorithms.rs
index d441fb55..89fc35fe 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -5,6 +5,12 @@ use core::iter::repeat;
 use core::mem;
 use num_traits::{One, PrimInt, Zero};
 
+#[cfg(all(use_addcarry, target_arch = "x86_64"))]
+use core::arch::x86_64 as arch;
+
+#[cfg(all(use_addcarry, target_arch = "x86"))]
+use core::arch::x86 as arch;
+
 use crate::biguint::biguint_from_vec;
 use crate::biguint::BigUint;
 
@@ -15,31 +21,35 @@ use crate::bigint::Sign::{Minus, NoSign, Plus};
 use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 
 // only needed for the fallback implementation of `sbb`
-#[cfg(not(any(use_addcarry_u64, use_addcarry_u32)))]
+#[cfg(not(use_addcarry))]
 use crate::big_digit::SignedDoubleBigDigit;
 
-// Generic functions for add/subtract/multiply with carry/borrow. These are specialized for some platforms to take advantage of intrinsics etc
+// Generic functions for add/subtract/multiply with carry/borrow. These are specialized
+// for some platforms to take advantage of intrinsics, etc.
 
 // Add with carry:
-#[cfg(use_addcarry_u64)]
+#[cfg(all(use_addcarry, u64_digit))]
 #[inline]
 fn adc(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
     let mut out = 0;
-    // Safety: There are absolutely no safety concerns with calling _addcarry_u64, it's just unsafe for API consistency with other intrinsics
-    *acc = unsafe { core::arch::x86_64::_addcarry_u64(*acc, a, b, &mut out) };
+    // Safety: There are absolutely no safety concerns with calling `_addcarry_u64`.
+    // It's just unsafe for API consistency with other intrinsics.
+    *acc = unsafe { arch::_addcarry_u64(*acc, a, b, &mut out) };
     out
 }
 
-#[cfg(use_addcarry_u32)]
+#[cfg(all(use_addcarry, not(u64_digit)))]
 #[inline]
 fn adc(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
     let mut out = 0;
-    // Safety: There are absolutely no safety concerns with calling _addcarry_u32, it's just unsafe for API consistency with other intrinsics
-    *acc = unsafe { core::arch::x86_64::_addcarry_u32(*acc, a, b, &mut out) };
+    // Safety: There are absolutely no safety concerns with calling `_addcarry_u32`.
+    // It's just unsafe for API consistency with other intrinsics.
+    *acc = unsafe { arch::_addcarry_u32(*acc, a, b, &mut out) };
     out
 }
 
-#[cfg(not(any(use_addcarry_u64, use_addcarry_u32)))] // fallback for environments where we don't have an addcarry intrinsic
+// fallback for environments where we don't have an addcarry intrinsic
+#[cfg(not(use_addcarry))]
 #[inline]
 fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
     *acc += DoubleBigDigit::from(a);
@@ -50,24 +60,28 @@ fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
 }
 
 // Subtract with borrow:
-#[cfg(use_addcarry_u64)]
+#[cfg(all(use_addcarry, u64_digit))]
 #[inline]
 fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
     let mut out = 0;
-    // Safety: There are absolutely no safety concerns with calling _subborrow_u64, it's just unsafe for API consistency with other intrinsics
-    *acc = unsafe { core::arch::x86_64::_subborrow_u64(*acc, a, b, &mut out) };
+    // Safety: There are absolutely no safety concerns with calling `_subborrow_u64`.
+    // It's just unsafe for API consistency with other intrinsics.
+    *acc = unsafe { arch::_subborrow_u64(*acc, a, b, &mut out) };
     out
 }
-#[cfg(use_addcarry_u32)]
+
+#[cfg(all(use_addcarry, not(u64_digit)))]
 #[inline]
 fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
     let mut out = 0;
-    // Safety: There are absolutely no safety concerns with calling _subborrow_u32, it's just unsafe for API consistency with other intrinsics
-    *acc = unsafe { core::arch::x86_64::_subborrow_u32(*acc, a, b, &mut out) };
+    // Safety: There are absolutely no safety concerns with calling `_subborrow_u32`.
+    // It's just unsafe for API consistency with other intrinsics.
+    *acc = unsafe { arch::_subborrow_u32(*acc, a, b, &mut out) };
     out
 }
 
-#[cfg(not(any(use_addcarry_u64, use_addcarry_u32)))] // fallback for environments where we don't have an addcarry intrinsic
+// fallback for environments where we don't have an addcarry intrinsic
+#[cfg(not(use_addcarry))]
 #[inline]
 fn sbb(a: BigDigit, b: BigDigit, acc: &mut SignedDoubleBigDigit) -> BigDigit {
     *acc += SignedDoubleBigDigit::from(a);

From e03bbc14681407bc9818eb059963b7489c6fdd37 Mon Sep 17 00:00:00 2001
From: Josh Stone <cuviper@gmail.com>
Date: Fri, 30 Oct 2020 14:48:13 -0700
Subject: [PATCH 7/7] Restructure adc/sbb to match addcarry/subborrow

---
 src/algorithms.rs | 62 ++++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 36 deletions(-)

diff --git a/src/algorithms.rs b/src/algorithms.rs
index 89fc35fe..a2037086 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -30,65 +30,55 @@ use crate::big_digit::SignedDoubleBigDigit;
 // Add with carry:
 #[cfg(all(use_addcarry, u64_digit))]
 #[inline]
-fn adc(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
-    let mut out = 0;
+fn adc(carry: u8, a: u64, b: u64, out: &mut u64) -> u8 {
     // Safety: There are absolutely no safety concerns with calling `_addcarry_u64`.
     // It's just unsafe for API consistency with other intrinsics.
-    *acc = unsafe { arch::_addcarry_u64(*acc, a, b, &mut out) };
-    out
+    unsafe { arch::_addcarry_u64(carry, a, b, out) }
 }
 
 #[cfg(all(use_addcarry, not(u64_digit)))]
 #[inline]
-fn adc(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
-    let mut out = 0;
+fn adc(carry: u8, a: u32, b: u32, out: &mut u32) -> u8 {
     // Safety: There are absolutely no safety concerns with calling `_addcarry_u32`.
     // It's just unsafe for API consistency with other intrinsics.
-    *acc = unsafe { arch::_addcarry_u32(*acc, a, b, &mut out) };
-    out
+    unsafe { arch::_addcarry_u32(carry, a, b, out) }
 }
 
 // fallback for environments where we don't have an addcarry intrinsic
 #[cfg(not(use_addcarry))]
 #[inline]
-fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
-    *acc += DoubleBigDigit::from(a);
-    *acc += DoubleBigDigit::from(b);
-    let lo = *acc as BigDigit;
-    *acc >>= big_digit::BITS;
-    lo
+fn adc(carry: u8, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> u8 {
+    let sum = DoubleBigDigit::from(a) + DoubleBigDigit::from(b) + DoubleBigDigit::from(carry);
+    *out = sum as BigDigit;
+    (sum >> big_digit::BITS) as u8
 }
 
 // Subtract with borrow:
 #[cfg(all(use_addcarry, u64_digit))]
 #[inline]
-fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
-    let mut out = 0;
+fn sbb(borrow: u8, a: u64, b: u64, out: &mut u64) -> u8 {
     // Safety: There are absolutely no safety concerns with calling `_subborrow_u64`.
     // It's just unsafe for API consistency with other intrinsics.
-    *acc = unsafe { arch::_subborrow_u64(*acc, a, b, &mut out) };
-    out
+    unsafe { arch::_subborrow_u64(borrow, a, b, out) }
 }
 
 #[cfg(all(use_addcarry, not(u64_digit)))]
 #[inline]
-fn sbb(a: BigDigit, b: BigDigit, acc: &mut u8) -> BigDigit {
-    let mut out = 0;
+fn sbb(borrow: u8, a: u32, b: u32, out: &mut u32) -> u8 {
     // Safety: There are absolutely no safety concerns with calling `_subborrow_u32`.
     // It's just unsafe for API consistency with other intrinsics.
-    *acc = unsafe { arch::_subborrow_u32(*acc, a, b, &mut out) };
-    out
+    unsafe { arch::_subborrow_u32(borrow, a, b, out) }
 }
 
-// fallback for environments where we don't have an addcarry intrinsic
+// fallback for environments where we don't have a subborrow intrinsic
 #[cfg(not(use_addcarry))]
 #[inline]
-fn sbb(a: BigDigit, b: BigDigit, acc: &mut SignedDoubleBigDigit) -> BigDigit {
-    *acc += SignedDoubleBigDigit::from(a);
-    *acc -= SignedDoubleBigDigit::from(b);
-    let lo = *acc as BigDigit;
-    *acc >>= big_digit::BITS;
-    lo
+fn sbb(borrow: u8, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> u8 {
+    let difference = SignedDoubleBigDigit::from(a)
+        - SignedDoubleBigDigit::from(b)
+        - SignedDoubleBigDigit::from(borrow);
+    *out = difference as BigDigit;
+    u8::from(difference < 0)
 }
 
 #[inline]
@@ -195,12 +185,12 @@ pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
     let (a_lo, a_hi) = a.split_at_mut(b.len());
 
     for (a, b) in a_lo.iter_mut().zip(b) {
-        *a = adc(*a, *b, &mut carry);
+        carry = adc(carry, *a, *b, a);
     }
 
     if carry != 0 {
         for a in a_hi {
-            *a = adc(*a, 0, &mut carry);
+            carry = adc(carry, *a, 0, a);
             if carry == 0 {
                 break;
             }
@@ -229,12 +219,12 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
     let (b_lo, b_hi) = b.split_at(len);
 
     for (a, b) in a_lo.iter_mut().zip(b_lo) {
-        *a = sbb(*a, *b, &mut borrow);
+        borrow = sbb(borrow, *a, *b, a);
     }
 
     if borrow != 0 {
         for a in a_hi {
-            *a = sbb(*a, 0, &mut borrow);
+            borrow = sbb(borrow, *a, 0, a);
             if borrow == 0 {
                 break;
             }
@@ -250,16 +240,16 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
 
 // Only for the Sub impl. `a` and `b` must have same length.
 #[inline]
-pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
+pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> u8 {
     debug_assert!(b.len() == a.len());
 
     let mut borrow = 0;
 
     for (ai, bi) in a.iter().zip(b) {
-        *bi = sbb(*ai, *bi, &mut borrow);
+        borrow = sbb(borrow, *ai, *bi, bi);
     }
 
-    borrow as BigDigit
+    borrow
 }
 
 pub(crate) fn sub2rev(a: &[BigDigit], b: &mut [BigDigit]) {