diff --git a/.travis.yml b/.travis.yml
index 7276fe4a..79474e17 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,6 +13,12 @@ script:
   - ./ci/test_full.sh
 matrix:
   include:
+    - name: "Rust: stable-i686"
+      rust: stable-i686-unknown-linux-gnu
+      addons:
+        apt:
+          packages:
+            - gcc-multilib
     - name: "rustfmt"
       rust: 1.31.0
       before_script:
diff --git a/benches/bigint.rs b/benches/bigint.rs
index bc0875d8..90a7e856 100644
--- a/benches/bigint.rs
+++ b/benches/bigint.rs
@@ -112,6 +112,11 @@ fn divide_2(b: &mut Bencher) {
     divide_bench(b, 1 << 16, 1 << 12);
 }
 
+#[bench]
+fn divide_big_little(b: &mut Bencher) {
+    divide_bench(b, 1 << 16, 1 << 4);
+}
+
 #[bench]
 fn remainder_0(b: &mut Bencher) {
     remainder_bench(b, 1 << 8, 1 << 6);
@@ -127,6 +132,11 @@ fn remainder_2(b: &mut Bencher) {
     remainder_bench(b, 1 << 16, 1 << 12);
 }
 
+#[bench]
+fn remainder_big_little(b: &mut Bencher) {
+    remainder_bench(b, 1 << 16, 1 << 4);
+}
+
 #[bench]
 fn factorial_100(b: &mut Bencher) {
     b.iter(|| factorial(100));
diff --git a/build.rs b/build.rs
index e483c15f..1f55eb95 100644
--- a/build.rs
+++ b/build.rs
@@ -1,14 +1,78 @@
 extern crate autocfg;
 
 use std::env;
+use std::error::Error;
+use std::fs::File;
+use std::io::Write;
+use std::path::Path;
 
 fn main() {
     let ac = autocfg::new();
+
     if ac.probe_type("i128") {
-        println!("cargo:rustc-cfg=has_i128");
+        autocfg::emit("has_i128");
+
+        let pointer_width = env::var("CARGO_CFG_TARGET_POINTER_WIDTH");
+        if pointer_width.as_ref().map(String::as_str) == Ok("64") {
+            autocfg::emit("u64_digit");
+        }
     } else if env::var_os("CARGO_FEATURE_I128").is_some() {
         panic!("i128 support was not detected!");
     }
 
     autocfg::rerun_path("build.rs");
+
+    write_radix_bases().unwrap();
+}
+
+/// Write tables of the greatest power of each radix for the given bit size.  These are returned
+/// from `biguint::get_radix_base` to batch the multiplication/division of radix conversions on
+/// full `BigUint` values, operating on primitive integers as much as possible.
+///
+/// e.g. BASES_16[3] = (59049, 10) // 3¹⁰ fits in u16, but 3¹¹ is too big
+///      BASES_32[3] = (3486784401, 20)
+///      BASES_64[3] = (12157665459056928801, 40)
+///
+/// Powers of two are not included, just zeroed, as they're implemented with shifts.
+#[allow(unknown_lints, bare_trait_objects)]
+fn write_radix_bases() -> Result<(), Box<Error>> {
+    let out_dir = env::var("OUT_DIR")?;
+    let dest_path = Path::new(&out_dir).join("radix_bases.rs");
+    let mut f = File::create(&dest_path)?;
+
+    for &bits in &[16, 32, 64] {
+        let max = if bits < 64 {
+            (1 << bits) - 1
+        } else {
+            std::u64::MAX
+        };
+
+        writeln!(f, "#[deny(overflowing_literals)]")?;
+        writeln!(
+            f,
+            "pub static BASES_{bits}: [(u{bits}, usize); 257] = [",
+            bits = bits
+        )?;
+        for radix in 0u64..257 {
+            let (base, power) = if radix == 0 || radix.is_power_of_two() {
+                (0, 0)
+            } else {
+                let mut power = 1;
+                let mut base = radix;
+
+                while let Some(b) = base.checked_mul(radix) {
+                    if b > max {
+                        break;
+                    }
+                    base = b;
+                    power += 1;
+                }
+                (base, power)
+            };
+            writeln!(f, "    ({}, {}), // {}", base, power, radix)?;
+        }
+        writeln!(f, "];")?;
+    }
+
+    Ok(())
 }
diff --git a/ci/test_full.sh b/ci/test_full.sh
index 4e1b60e9..4915e880 100755
--- a/ci/test_full.sh
+++ b/ci/test_full.sh
@@ -5,13 +5,13 @@ set -ex
 echo Testing num-bigint on rustc ${TRAVIS_RUST_VERSION}
 
 FEATURES="serde"
-if [[ "$TRAVIS_RUST_VERSION" =~ ^(nightly|beta|stable|1.31.0|1.26.0|1.22.0)$ ]]; then
+if [[ "$TRAVIS_RUST_VERSION" =~ ^(nightly|beta|stable.*|1.31.0|1.26.0|1.22.0)$ ]]; then
   FEATURES="$FEATURES rand"
 fi
-if [[ "$TRAVIS_RUST_VERSION" =~ ^(nightly|beta|stable|1.31.0|1.26.0)$ ]]; then
+if [[ "$TRAVIS_RUST_VERSION" =~ ^(nightly|beta|stable.*|1.31.0|1.26.0)$ ]]; then
   FEATURES="$FEATURES i128"
 fi
-if [[ "$TRAVIS_RUST_VERSION" =~ ^(nightly|beta|stable|1.31.0)$ ]]; then
+if [[ "$TRAVIS_RUST_VERSION" =~ ^(nightly|beta|stable.*|1.31.0)$ ]]; then
   FEATURES="$FEATURES quickcheck quickcheck_macros"
 fi
 
diff --git a/src/algorithms.rs b/src/algorithms.rs
index 223f051a..aa42c6c8 100644
--- a/src/algorithms.rs
+++ b/src/algorithms.rs
@@ -6,6 +6,7 @@ use std::mem;
 use traits;
 use traits::{One, Zero};
 
+use biguint::biguint_from_vec;
 use biguint::BigUint;
 
 use bigint::BigInt;
@@ -68,29 +69,65 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi
     ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit)
 }
 
+/// For small divisors, we can divide without promoting to `DoubleBigDigit` by
+/// using half-size pieces of digit, like long-division.
+#[inline]
+fn div_half(rem: BigDigit, digit: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) {
+    use big_digit::{HALF, HALF_BITS};
+    use integer::Integer;
+
+    debug_assert!(rem < divisor && divisor <= HALF);
+    let (hi, rem) = ((rem << HALF_BITS) | (digit >> HALF_BITS)).div_rem(&divisor);
+    let (lo, rem) = ((rem << HALF_BITS) | (digit & HALF)).div_rem(&divisor);
+    ((hi << HALF_BITS) | lo, rem)
+}
+
+#[inline]
 pub fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit) {
     let mut rem = 0;
 
-    for d in a.data.iter_mut().rev() {
-        let (q, r) = div_wide(rem, *d, b);
-        *d = q;
-        rem = r;
+    if b <= big_digit::HALF {
+        for d in a.data.iter_mut().rev() {
+            let (q, r) = div_half(rem, *d, b);
+            *d = q;
+            rem = r;
+        }
+    } else {
+        for d in a.data.iter_mut().rev() {
+            let (q, r) = div_wide(rem, *d, b);
+            *d = q;
+            rem = r;
+        }
     }
 
     (a.normalized(), rem)
 }
 
+#[inline]
 pub fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit {
-    let mut rem: DoubleBigDigit = 0;
-    for &digit in a.data.iter().rev() {
-        rem = (rem << big_digit::BITS) + DoubleBigDigit::from(digit);
-        rem %= DoubleBigDigit::from(b);
+    let mut rem = 0;
+
+    if b <= big_digit::HALF {
+        for &digit in a.data.iter().rev() {
+            let (_, r) = div_half(rem, digit, b);
+            rem = r;
+        }
+    } else {
+        for &digit in a.data.iter().rev() {
+            let (_, r) = div_wide(rem, digit, b);
+            rem = r;
+        }
     }
 
-    rem as BigDigit
+    rem
 }
 
-// Only for the Add impl:
+/// Two argument addition of raw slices, `a += b`, returning the carry.
+///
+/// This is used when the data `Vec` might need to resize to push a non-zero carry, so we perform
+/// the addition first hoping that it will fit.
+///
+/// The caller _must_ ensure that `a` is at least as long as `b`.
 #[inline]
 pub fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
     debug_assert!(a.len() >= b.len());
@@ -193,12 +230,12 @@ pub fn sub_sign(a: &[BigDigit], b: &[BigDigit]) -> (Sign, BigUint) {
         Greater => {
             let mut a = a.to_vec();
             sub2(&mut a, b);
-            (Plus, BigUint::new(a))
+            (Plus, biguint_from_vec(a))
         }
         Less => {
             let mut b = b.to_vec();
             sub2(&mut b, a);
-            (Minus, BigUint::new(b))
+            (Minus, biguint_from_vec(b))
         }
         _ => (NoSign, Zero::zero()),
     }
@@ -225,6 +262,10 @@ pub fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) {
     }
 }
 
+fn bigint_from_slice(slice: &[BigDigit]) -> BigInt {
+    BigInt::from(biguint_from_vec(slice.to_vec()))
+}
+
 /// Three argument multiply accumulate:
 /// acc += b * c
 fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
@@ -387,14 +428,14 @@ fn mac3(acc: &mut [BigDigit], b: &[BigDigit], c: &[BigDigit]) {
         // in place of multiplications.
         //
         // x(t) = x2*t^2 + x1*t + x0
-        let x0 = BigInt::from_slice(Plus, &x[..x0_len]);
-        let x1 = BigInt::from_slice(Plus, &x[x0_len..x0_len + x1_len]);
-        let x2 = BigInt::from_slice(Plus, &x[x0_len + x1_len..]);
+        let x0 = bigint_from_slice(&x[..x0_len]);
+        let x1 = bigint_from_slice(&x[x0_len..x0_len + x1_len]);
+        let x2 = bigint_from_slice(&x[x0_len + x1_len..]);
 
         // y(t) = y2*t^2 + y1*t + y0
-        let y0 = BigInt::from_slice(Plus, &y[..y0_len]);
-        let y1 = BigInt::from_slice(Plus, &y[y0_len..y0_len + y1_len]);
-        let y2 = BigInt::from_slice(Plus, &y[y0_len + y1_len..]);
+        let y0 = bigint_from_slice(&y[..y0_len]);
+        let y1 = bigint_from_slice(&y[y0_len..y0_len + y1_len]);
+        let y2 = bigint_from_slice(&y[y0_len + y1_len..]);
 
         // Let w(t) = x(t) * y(t)
         //
@@ -538,6 +579,7 @@ pub fn div_rem(mut u: BigUint, mut d: BigUint) -> (BigUint, BigUint) {
     // want it to be the largest number we can efficiently divide by.
     //
     let shift = d.data.last().unwrap().leading_zeros() as usize;
+
     let (q, r) = if shift == 0 {
         // no need to clone d
         div_rem_core(u, &d)
@@ -655,8 +697,7 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
         let mut prod = b * &q0;
 
         while cmp_slice(&prod.data[..], &a.data[j..]) == Greater {
-            let one: BigUint = One::one();
-            q0 -= one;
+            q0 -= 1u32;
             prod -= b;
         }
 
@@ -709,7 +750,7 @@ pub fn biguint_shl(n: Cow<BigUint>, bits: usize) -> BigUint {
         }
     }
 
-    BigUint::new(data)
+    biguint_from_vec(data)
 }
 
 #[inline]
@@ -736,7 +777,7 @@ pub fn biguint_shr(n: Cow<BigUint>, bits: usize) -> BigUint {
         }
     }
 
-    BigUint::new(data)
+    biguint_from_vec(data)
 }
 
 pub fn cmp_slice(a: &[BigDigit], b: &[BigDigit]) -> Ordering {
@@ -766,7 +807,6 @@ pub fn cmp_slice(a: &[BigDigit], b: &[BigDigit]) -> Ordering {
 mod algorithm_tests {
     use big_digit::BigDigit;
     use traits::Num;
-    use Sign::Plus;
     use {BigInt, BigUint};
 
     #[test]
@@ -780,8 +820,8 @@ mod algorithm_tests {
 
         let a = BigUint::from_str_radix("265252859812191058636308480000000", 10).unwrap();
         let b = BigUint::from_str_radix("26525285981219105863630848000000", 10).unwrap();
-        let a_i = BigInt::from_biguint(Plus, a.clone());
-        let b_i = BigInt::from_biguint(Plus, b.clone());
+        let a_i = BigInt::from(a.clone());
+        let b_i = BigInt::from(b.clone());
 
         assert_eq!(sub_sign_i(&a.data[..], &b.data[..]), &a_i - &b_i);
         assert_eq!(sub_sign_i(&b.data[..], &a.data[..]), &b_i - &a_i);
diff --git a/src/bigint.rs b/src/bigint.rs
index bd74e7d3..d62767c2 100644
--- a/src/bigint.rs
+++ b/src/bigint.rs
@@ -111,12 +111,30 @@ impl<'de> serde::Deserialize<'de> for Sign {
 }
 
 /// A big signed integer type.
-#[derive(Clone, Debug, Hash)]
+#[derive(Debug, Hash)]
 pub struct BigInt {
     sign: Sign,
     data: BigUint,
 }
 
+// Note: derived `Clone` doesn't specialize `clone_from`,
+// but we want to keep the allocation in `data`.
+impl Clone for BigInt {
+    #[inline]
+    fn clone(&self) -> Self {
+        BigInt {
+            sign: self.sign,
+            data: self.data.clone(),
+        }
+    }
+
+    #[inline]
+    fn clone_from(&mut self, other: &Self) {
+        self.sign = other.sign;
+        self.data.clone_from(&other.data);
+    }
+}
+
 #[cfg(feature = "quickcheck")]
 impl Arbitrary for BigInt {
     fn arbitrary<G: Gen>(g: &mut G) -> Self {
@@ -271,8 +289,9 @@ impl<'a> Not for &'a BigInt {
 
     fn not(self) -> BigInt {
         match self.sign {
-            NoSign | Plus => BigInt::from_biguint(Minus, &self.data + 1u32),
-            Minus => BigInt::from_biguint(Plus, &self.data - 1u32),
+            NoSign => -BigInt::one(),
+            Plus => -BigInt::from(&self.data + 1u32),
+            Minus => BigInt::from(&self.data - 1u32),
         }
     }
 }
@@ -352,8 +371,8 @@ impl<'a, 'b> BitAnd<&'b BigInt> for &'a BigInt {
     #[inline]
     fn bitand(self, other: &BigInt) -> BigInt {
         match (self.sign, other.sign) {
-            (NoSign, _) | (_, NoSign) => BigInt::from_slice(NoSign, &[]),
-            (Plus, Plus) => BigInt::from_biguint(Plus, &self.data & &other.data),
+            (NoSign, _) | (_, NoSign) => BigInt::zero(),
+            (Plus, Plus) => BigInt::from(&self.data & &other.data),
             (Plus, Minus) => self.clone() & other,
             (Minus, Plus) => other.clone() & self,
             (Minus, Minus) => {
@@ -384,7 +403,7 @@ impl<'a> BitAndAssign<&'a BigInt> for BigInt {
     fn bitand_assign(&mut self, other: &BigInt) {
         match (self.sign, other.sign) {
             (NoSign, _) => {}
-            (_, NoSign) => self.assign_from_slice(NoSign, &[]),
+            (_, NoSign) => self.set_zero(),
             (Plus, Plus) => {
                 self.data &= &other.data;
                 if self.data.is_zero() {
@@ -489,7 +508,7 @@ impl<'a, 'b> BitOr<&'b BigInt> for &'a BigInt {
         match (self.sign, other.sign) {
             (NoSign, _) => other.clone(),
             (_, NoSign) => self.clone(),
-            (Plus, Plus) => BigInt::from_biguint(Plus, &self.data | &other.data),
+            (Plus, Plus) => BigInt::from(&self.data | &other.data),
             (Plus, Minus) => other.clone() | self,
             (Minus, Plus) => self.clone() | other,
             (Minus, Minus) => {
@@ -520,7 +539,7 @@ impl<'a> BitOrAssign<&'a BigInt> for BigInt {
     fn bitor_assign(&mut self, other: &BigInt) {
         match (self.sign, other.sign) {
             (_, NoSign) => {}
-            (NoSign, _) => self.assign_from_slice(other.sign, other.digits()),
+            (NoSign, _) => self.clone_from(other),
             (Plus, Plus) => self.data |= &other.data,
             (Plus, Minus) => {
                 bitor_pos_neg(self.digits_mut(), other.digits());
@@ -646,7 +665,7 @@ impl<'a> BitXorAssign<&'a BigInt> for BigInt {
     fn bitxor_assign(&mut self, other: &BigInt) {
         match (self.sign, other.sign) {
             (_, NoSign) => {}
-            (NoSign, _) => self.assign_from_slice(other.sign, other.digits()),
+            (NoSign, _) => self.clone_from(other),
             (Plus, Plus) => {
                 self.data ^= &other.data;
                 if self.data.is_zero() {
@@ -729,10 +748,7 @@ impl ShlAssign<usize> for BigInt {
 // Negative values need a rounding adjustment if there are any ones in the
 // bits that are getting shifted out.
 fn shr_round_down(i: &BigInt, rhs: usize) -> bool {
-    i.is_negative()
-        && biguint::trailing_zeros(&i.data)
-            .map(|n| n < rhs)
-            .unwrap_or(false)
+    i.is_negative() && i.trailing_zeros().map(|n| n < rhs).unwrap_or(false)
 }
 
 impl Shr<usize> for BigInt {
@@ -772,7 +788,10 @@ impl ShrAssign<usize> for BigInt {
 impl Zero for BigInt {
     #[inline]
     fn zero() -> BigInt {
-        BigInt::from_biguint(NoSign, Zero::zero())
+        BigInt {
+            sign: NoSign,
+            data: BigUint::zero(),
+        }
     }
 
     #[inline]
@@ -790,7 +809,10 @@ impl Zero for BigInt {
 impl One for BigInt {
     #[inline]
     fn one() -> BigInt {
-        BigInt::from_biguint(Plus, One::one())
+        BigInt {
+            sign: Plus,
+            data: BigUint::one(),
+        }
     }
 
     #[inline]
@@ -810,7 +832,7 @@ impl Signed for BigInt {
     fn abs(&self) -> BigInt {
         match self.sign {
             Plus | NoSign => self.clone(),
-            Minus => BigInt::from_biguint(Plus, self.data.clone()),
+            Minus => BigInt::from(self.data.clone()),
         }
     }
 
@@ -826,9 +848,9 @@ impl Signed for BigInt {
     #[inline]
     fn signum(&self) -> BigInt {
         match self.sign {
-            Plus => BigInt::from_biguint(Plus, One::one()),
-            Minus => BigInt::from_biguint(Minus, One::one()),
-            NoSign => Zero::zero(),
+            Plus => BigInt::one(),
+            Minus => -BigInt::one(),
+            NoSign => BigInt::zero(),
         }
     }
 
@@ -1007,15 +1029,16 @@ impl Add<u32> for BigInt {
     fn add(self, other: u32) -> BigInt {
         match self.sign {
             NoSign => From::from(other),
-            Plus => BigInt::from_biguint(Plus, self.data + other),
+            Plus => BigInt::from(self.data + other),
             Minus => match self.data.cmp(&From::from(other)) {
                 Equal => Zero::zero(),
-                Less => BigInt::from_biguint(Plus, other - self.data),
-                Greater => BigInt::from_biguint(Minus, self.data - other),
+                Less => BigInt::from(other - self.data),
+                Greater => -BigInt::from(self.data - other),
             },
         }
     }
 }
+
 impl AddAssign<u32> for BigInt {
     #[inline]
     fn add_assign(&mut self, other: u32) {
@@ -1031,15 +1054,16 @@ impl Add<u64> for BigInt {
     fn add(self, other: u64) -> BigInt {
         match self.sign {
             NoSign => From::from(other),
-            Plus => BigInt::from_biguint(Plus, self.data + other),
+            Plus => BigInt::from(self.data + other),
             Minus => match self.data.cmp(&From::from(other)) {
                 Equal => Zero::zero(),
-                Less => BigInt::from_biguint(Plus, other - self.data),
-                Greater => BigInt::from_biguint(Minus, self.data - other),
+                Less => BigInt::from(other - self.data),
+                Greater => -BigInt::from(self.data - other),
             },
         }
     }
 }
+
 impl AddAssign<u64> for BigInt {
     #[inline]
     fn add_assign(&mut self, other: u64) {
@@ -1055,12 +1079,12 @@ impl Add<u128> for BigInt {
     #[inline]
     fn add(self, other: u128) -> BigInt {
         match self.sign {
-            NoSign => From::from(other),
-            Plus => BigInt::from_biguint(Plus, self.data + other),
+            NoSign => BigInt::from(other),
+            Plus => BigInt::from(self.data + other),
             Minus => match self.data.cmp(&From::from(other)) {
-                Equal => Zero::zero(),
-                Less => BigInt::from_biguint(Plus, other - self.data),
-                Greater => BigInt::from_biguint(Minus, self.data - other),
+                Equal => BigInt::zero(),
+                Less => BigInt::from(other - self.data),
+                Greater => -BigInt::from(self.data - other),
             },
         }
     }
@@ -1235,12 +1259,12 @@ impl Sub<u32> for BigInt {
     #[inline]
     fn sub(self, other: u32) -> BigInt {
         match self.sign {
-            NoSign => BigInt::from_biguint(Minus, From::from(other)),
-            Minus => BigInt::from_biguint(Minus, self.data + other),
+            NoSign => -BigInt::from(other),
+            Minus => -BigInt::from(self.data + other),
             Plus => match self.data.cmp(&From::from(other)) {
                 Equal => Zero::zero(),
-                Greater => BigInt::from_biguint(Plus, self.data - other),
-                Less => BigInt::from_biguint(Minus, other - self.data),
+                Greater => BigInt::from(self.data - other),
+                Less => -BigInt::from(other - self.data),
             },
         }
     }
@@ -1286,12 +1310,12 @@ impl Sub<u64> for BigInt {
     #[inline]
     fn sub(self, other: u64) -> BigInt {
         match self.sign {
-            NoSign => BigInt::from_biguint(Minus, From::from(other)),
-            Minus => BigInt::from_biguint(Minus, self.data + other),
+            NoSign => -BigInt::from(other),
+            Minus => -BigInt::from(self.data + other),
             Plus => match self.data.cmp(&From::from(other)) {
                 Equal => Zero::zero(),
-                Greater => BigInt::from_biguint(Plus, self.data - other),
-                Less => BigInt::from_biguint(Minus, other - self.data),
+                Greater => BigInt::from(self.data - other),
+                Less => -BigInt::from(other - self.data),
             },
         }
     }
@@ -1311,12 +1335,12 @@ impl Sub<u128> for BigInt {
     #[inline]
     fn sub(self, other: u128) -> BigInt {
         match self.sign {
-            NoSign => BigInt::from_biguint(Minus, From::from(other)),
-            Minus => BigInt::from_biguint(Minus, self.data + other),
+            NoSign => -BigInt::from(other),
+            Minus => -BigInt::from(self.data + other),
             Plus => match self.data.cmp(&From::from(other)) {
                 Equal => Zero::zero(),
-                Greater => BigInt::from_biguint(Plus, self.data - other),
-                Less => BigInt::from_biguint(Minus, other - self.data),
+                Greater => BigInt::from(self.data - other),
+                Less => -BigInt::from(other - self.data),
             },
         }
     }
@@ -1851,8 +1875,14 @@ impl<'a, 'b> Rem<&'b BigInt> for &'a BigInt {
 
     #[inline]
     fn rem(self, other: &BigInt) -> BigInt {
-        let (_, r) = self.div_rem(other);
-        r
+        if let Some(other) = other.to_u32() {
+            self % other
+        } else if let Some(other) = other.to_i32() {
+            self % other
+        } else {
+            let (_, r) = self.div_rem(other);
+            r
+        }
     }
 }
 
@@ -1895,7 +1925,7 @@ impl Rem<BigInt> for u32 {
 
     #[inline]
     fn rem(self, other: BigInt) -> BigInt {
-        BigInt::from_biguint(Plus, self % other.data)
+        BigInt::from(self % other.data)
     }
 }
 
@@ -1923,7 +1953,7 @@ impl Rem<BigInt> for u64 {
 
     #[inline]
     fn rem(self, other: BigInt) -> BigInt {
-        BigInt::from_biguint(Plus, self % other.data)
+        BigInt::from(self % other.data)
     }
 }
 
@@ -1954,7 +1984,7 @@ impl Rem<BigInt> for u128 {
 
     #[inline]
     fn rem(self, other: BigInt) -> BigInt {
-        BigInt::from_biguint(Plus, self % other.data)
+        BigInt::from(self % other.data)
     }
 }
 
@@ -2154,8 +2184,8 @@ impl Integer for BigInt {
     fn div_mod_floor(&self, other: &BigInt) -> (BigInt, BigInt) {
         // m.sign == other.sign
         let (d_ui, m_ui) = self.data.div_rem(&other.data);
-        let d = BigInt::from_biguint(Plus, d_ui);
-        let m = BigInt::from_biguint(Plus, m_ui);
+        let d = BigInt::from(d_ui);
+        let m = BigInt::from(m_ui);
         let one: BigInt = One::one();
         match (self.sign, other.sign) {
             (_, NoSign) => panic!(),
@@ -2183,13 +2213,13 @@ impl Integer for BigInt {
     /// The result is always positive.
     #[inline]
     fn gcd(&self, other: &BigInt) -> BigInt {
-        BigInt::from_biguint(Plus, self.data.gcd(&other.data))
+        BigInt::from(self.data.gcd(&other.data))
     }
 
     /// Calculates the Lowest Common Multiple (LCM) of the number and `other`.
     #[inline]
     fn lcm(&self, other: &BigInt) -> BigInt {
-        BigInt::from_biguint(Plus, self.data.lcm(&other.data))
+        BigInt::from(self.data.lcm(&other.data))
     }
 
     /// Deprecated, use `is_multiple_of` instead.
@@ -2337,9 +2367,9 @@ impl FromPrimitive for BigInt {
     #[inline]
     fn from_f64(n: f64) -> Option<BigInt> {
         if n >= 0.0 {
-            BigUint::from_f64(n).map(|x| BigInt::from_biguint(Plus, x))
+            BigUint::from_f64(n).map(BigInt::from)
         } else {
-            BigUint::from_f64(-n).map(|x| BigInt::from_biguint(Minus, x))
+            BigUint::from_f64(-n).map(|x| -BigInt::from(x))
         }
     }
 }
@@ -2609,8 +2639,7 @@ impl BigInt {
     #[inline]
     pub fn assign_from_slice(&mut self, sign: Sign, slice: &[u32]) {
         if sign == NoSign {
-            self.data.assign_from_slice(&[]);
-            self.sign = NoSign;
+            self.set_zero();
         } else {
             self.data.assign_from_slice(slice);
             self.sign = match self.data.is_zero() {
@@ -3024,6 +3053,12 @@ impl BigInt {
     pub fn nth_root(&self, n: u32) -> Self {
         Roots::nth_root(self, n)
     }
+
+    /// Returns the number of least-significant bits that are zero,
+    /// or `None` if the entire number is zero.
+    pub fn trailing_zeros(&self) -> Option<usize> {
+        self.data.trailing_zeros()
+    }
 }
 
 impl_sum_iter_type!(BigInt);
diff --git a/src/bigrand.rs b/src/bigrand.rs
index 69564dd1..ab0e0ed8 100644
--- a/src/bigrand.rs
+++ b/src/bigrand.rs
@@ -2,14 +2,13 @@
 
 use rand::distributions::uniform::{SampleUniform, UniformSampler};
 use rand::prelude::*;
-use rand::AsByteSliceMut;
 
 use BigInt;
 use BigUint;
 use Sign::*;
 
-use big_digit::BigDigit;
 use bigint::{into_magnitude, magnitude};
+use biguint::biguint_from_vec;
 
 use integer::Integer;
 use traits::Zero;
@@ -39,21 +38,45 @@ pub trait RandBigInt {
     fn gen_bigint_range(&mut self, lbound: &BigInt, ubound: &BigInt) -> BigInt;
 }
 
+fn gen_bits<R: Rng + ?Sized>(rng: &mut R, data: &mut [u32], rem: usize) {
+    // `fill` is faster than many `gen::<u32>` calls
+    rng.fill(data);
+    if rem > 0 {
+        let last = data.len() - 1;
+        data[last] >>= 32 - rem;
+    }
+}
+
 impl<R: Rng + ?Sized> RandBigInt for R {
+    #[cfg(not(u64_digit))]
     fn gen_biguint(&mut self, bit_size: usize) -> BigUint {
-        use super::big_digit::BITS;
-        let (digits, rem) = bit_size.div_rem(&BITS);
-        let mut data = vec![BigDigit::default(); digits + (rem > 0) as usize];
-        // `fill_bytes` is faster than many `gen::<u32>` calls
-        self.fill_bytes(data[..].as_byte_slice_mut());
-        // Swap bytes per the `Rng::fill` source. This might be
-        // unnecessary if reproducibility across architectures is not
-        // desired.
-        data.to_le();
-        if rem > 0 {
-            data[digits] >>= BITS - rem;
+        let (digits, rem) = bit_size.div_rem(&32);
+        let mut data = vec![0u32; digits + (rem > 0) as usize];
+        gen_bits(self, &mut data, rem);
+        biguint_from_vec(data)
+    }
+
+    #[cfg(u64_digit)]
+    fn gen_biguint(&mut self, bit_size: usize) -> BigUint {
+        use std::slice;
+
+        let (digits, rem) = bit_size.div_rem(&32);
+        let native_digits = bit_size.div_ceil(&64);
+        let mut data = vec![0u64; native_digits];
+        unsafe {
+            // Generate bits in a `&mut [u32]` slice for value stability
+            let ptr = data.as_mut_ptr() as *mut u32;
+            let len = digits + (rem > 0) as usize;
+            debug_assert!(native_digits * 2 >= len);
+            let data = slice::from_raw_parts_mut(ptr, len);
+            gen_bits(self, data, rem);
+        }
+        #[cfg(target_endian = "big")]
+        for digit in &mut data {
+            // swap u32 digits into u64 endianness
+            *digit = (*digit << 32) | (*digit >> 32);
         }
-        BigUint::new(data)
+        biguint_from_vec(data)
     }
 
     fn gen_bigint(&mut self, bit_size: usize) -> BigInt {
diff --git a/src/biguint.rs b/src/biguint.rs
index 68363424..ff30ac24 100644
--- a/src/biguint.rs
+++ b/src/biguint.rs
@@ -13,7 +13,7 @@ use std::ops::{
 };
 use std::str::{self, FromStr};
 use std::{f32, f64};
-use std::{u64, u8};
+use std::{u32, u64, u8};
 
 #[cfg(feature = "serde")]
 use serde;
@@ -46,22 +46,38 @@ use ParseBigIntError;
 use quickcheck::{Arbitrary, Gen};
 
 /// A big unsigned integer type.
-#[derive(Clone, Debug, Hash)]
+#[derive(Debug, Hash)]
 pub struct BigUint {
     data: Vec<BigDigit>,
 }
 
+// Note: derived `Clone` doesn't specialize `clone_from`,
+// but we want to keep the allocation in `data`.
+impl Clone for BigUint {
+    #[inline]
+    fn clone(&self) -> Self {
+        BigUint {
+            data: self.data.clone(),
+        }
+    }
+
+    #[inline]
+    fn clone_from(&mut self, other: &Self) {
+        self.data.clone_from(&other.data);
+    }
+}
+
 #[cfg(feature = "quickcheck")]
 impl Arbitrary for BigUint {
     fn arbitrary<G: Gen>(g: &mut G) -> Self {
         // Use arbitrary from Vec
-        Self::new(Vec::<u32>::arbitrary(g))
+        biguint_from_vec(Vec::<BigDigit>::arbitrary(g))
     }
 
     #[allow(bare_trait_objects)] // `dyn` needs Rust 1.27 to parse, even when cfg-disabled
     fn shrink(&self) -> Box<Iterator<Item = Self>> {
         // Use shrinker from Vec
-        Box::new(self.data.shrink().map(BigUint::new))
+        Box::new(self.data.shrink().map(biguint_from_vec))
     }
 }
 
@@ -156,7 +172,7 @@ fn from_bitwise_digits_le(v: &[u8], bits: usize) -> BigUint {
         })
         .collect();
 
-    BigUint::new(data)
+    biguint_from_vec(data)
 }
 
 // Convert from a power of two radix (bits == ilog2(radix)) where bits doesn't evenly divide
@@ -191,7 +207,7 @@ fn from_inexact_bitwise_digits_le(v: &[u8], bits: usize) -> BigUint {
         data.push(d as BigDigit);
     }
 
-    BigUint::new(data)
+    biguint_from_vec(data)
 }
 
 // Read little-endian radix digits
@@ -204,7 +220,7 @@ fn from_radix_digits_be(v: &[u8], radix: u32) -> BigUint {
     let big_digits = (bits / big_digit::BITS as f64).ceil();
     let mut data = Vec::with_capacity(big_digits as usize);
 
-    let (base, power) = get_radix_base(radix);
+    let (base, power) = get_radix_base(radix, big_digit::BITS);
     let radix = radix as BigDigit;
 
     let r = v.len() % power;
@@ -234,7 +250,7 @@ fn from_radix_digits_be(v: &[u8], radix: u32) -> BigUint {
         add2(&mut data, &[n]);
     }
 
-    BigUint::new(data)
+    biguint_from_vec(data)
 }
 
 impl Num for BigUint {
@@ -437,7 +453,7 @@ impl ShrAssign<usize> for BigUint {
 impl Zero for BigUint {
     #[inline]
     fn zero() -> BigUint {
-        BigUint::new(Vec::new())
+        BigUint { data: Vec::new() }
     }
 
     #[inline]
@@ -454,7 +470,7 @@ impl Zero for BigUint {
 impl One for BigUint {
     #[inline]
     fn one() -> BigUint {
-        BigUint::new(vec![1])
+        BigUint { data: vec![1] }
     }
 
     #[inline]
@@ -623,6 +639,7 @@ impl Add<u64> for BigUint {
 }
 
 impl AddAssign<u64> for BigUint {
+    #[cfg(not(u64_digit))]
     #[inline]
     fn add_assign(&mut self, other: u64) {
         let (hi, lo) = big_digit::from_doublebigdigit(other);
@@ -639,6 +656,21 @@ impl AddAssign<u64> for BigUint {
             }
         }
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn add_assign(&mut self, other: u64) {
+        if other != 0 {
+            if self.data.is_empty() {
+                self.data.push(0);
+            }
+
+            let carry = __add2(&mut self.data, &[other as BigDigit]);
+            if carry != 0 {
+                self.data.push(carry);
+            }
+        }
+    }
 }
 
 #[cfg(has_i128)]
@@ -654,6 +686,7 @@ impl Add<u128> for BigUint {
 
 #[cfg(has_i128)]
 impl AddAssign<u128> for BigUint {
+    #[cfg(not(u64_digit))]
     #[inline]
     fn add_assign(&mut self, other: u128) {
         if other <= u128::from(u64::max_value()) {
@@ -678,6 +711,24 @@ impl AddAssign<u128> for BigUint {
             }
         }
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn add_assign(&mut self, other: u128) {
+        let (hi, lo) = big_digit::from_doublebigdigit(other);
+        if hi == 0 {
+            *self += lo;
+        } else {
+            while self.data.len() < 2 {
+                self.data.push(0);
+            }
+
+            let carry = __add2(&mut self.data, &[lo, hi]);
+            if carry != 0 {
+                self.data.push(carry);
+            }
+        }
+    }
 }
 
 forward_val_val_binop!(impl Sub for BigUint, sub);
@@ -743,6 +794,18 @@ impl SubAssign<u32> for BigUint {
 impl Sub<BigUint> for u32 {
     type Output = BigUint;
 
+    #[cfg(not(u64_digit))]
+    #[inline]
+    fn sub(self, mut other: BigUint) -> BigUint {
+        if other.data.len() == 0 {
+            other.data.push(self);
+        } else {
+            sub2rev(&[self], &mut other.data[..]);
+        }
+        other.normalized()
+    }
+
+    #[cfg(u64_digit)]
     #[inline]
     fn sub(self, mut other: BigUint) -> BigUint {
         if other.data.is_empty() {
@@ -765,17 +828,26 @@ impl Sub<u64> for BigUint {
 }
 
 impl SubAssign<u64> for BigUint {
+    #[cfg(not(u64_digit))]
     #[inline]
     fn sub_assign(&mut self, other: u64) {
         let (hi, lo) = big_digit::from_doublebigdigit(other);
         sub2(&mut self.data[..], &[lo, hi]);
         self.normalize();
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn sub_assign(&mut self, other: u64) {
+        sub2(&mut self.data[..], &[other as BigDigit]);
+        self.normalize();
+    }
 }
 
 impl Sub<BigUint> for u64 {
     type Output = BigUint;
 
+    #[cfg(not(u64_digit))]
     #[inline]
     fn sub(self, mut other: BigUint) -> BigUint {
         while other.data.len() < 2 {
@@ -786,6 +858,17 @@ impl Sub<BigUint> for u64 {
         sub2rev(&[lo, hi], &mut other.data[..]);
         other.normalized()
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn sub(self, mut other: BigUint) -> BigUint {
+        if other.data.is_empty() {
+            other.data.push(self);
+        } else {
+            sub2rev(&[self], &mut other.data[..]);
+        }
+        other.normalized()
+    }
 }
 
 #[cfg(has_i128)]
@@ -800,17 +883,28 @@ impl Sub<u128> for BigUint {
 }
 #[cfg(has_i128)]
 impl SubAssign<u128> for BigUint {
+    #[cfg(not(u64_digit))]
+    #[inline]
     fn sub_assign(&mut self, other: u128) {
         let (a, b, c, d) = u32_from_u128(other);
         sub2(&mut self.data[..], &[d, c, b, a]);
         self.normalize();
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn sub_assign(&mut self, other: u128) {
+        let (hi, lo) = big_digit::from_doublebigdigit(other);
+        sub2(&mut self.data[..], &[lo, hi]);
+        self.normalize();
+    }
 }
 
 #[cfg(has_i128)]
 impl Sub<BigUint> for u128 {
     type Output = BigUint;
 
+    #[cfg(not(u64_digit))]
     #[inline]
     fn sub(self, mut other: BigUint) -> BigUint {
         while other.data.len() < 4 {
@@ -821,6 +915,18 @@ impl Sub<BigUint> for u128 {
         sub2rev(&[d, c, b, a], &mut other.data[..]);
         other.normalized()
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn sub(self, mut other: BigUint) -> BigUint {
+        while other.data.len() < 2 {
+            other.data.push(0);
+        }
+
+        let (hi, lo) = big_digit::from_doublebigdigit(self);
+        sub2rev(&[lo, hi], &mut other.data[..]);
+        other.normalized()
+    }
 }
 
 forward_all_binop_to_ref_ref!(impl Mul for BigUint, mul);
@@ -881,6 +987,7 @@ impl Mul<u64> for BigUint {
     }
 }
 impl MulAssign<u64> for BigUint {
+    #[cfg(not(u64_digit))]
     #[inline]
     fn mul_assign(&mut self, other: u64) {
         if other == 0 {
@@ -892,6 +999,19 @@ impl MulAssign<u64> for BigUint {
             *self = mul3(&self.data[..], &[lo, hi])
         }
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn mul_assign(&mut self, other: u64) {
+        if other == 0 {
+            self.data.clear();
+        } else {
+            let carry = scalar_mul(&mut self.data[..], other as BigDigit);
+            if carry != 0 {
+                self.data.push(carry);
+            }
+        }
+    }
 }
 
 #[cfg(has_i128)]
@@ -906,6 +1026,7 @@ impl Mul<u128> for BigUint {
 }
 #[cfg(has_i128)]
 impl MulAssign<u128> for BigUint {
+    #[cfg(not(u64_digit))]
     #[inline]
     fn mul_assign(&mut self, other: u128) {
         if other == 0 {
@@ -917,6 +1038,19 @@ impl MulAssign<u128> for BigUint {
             *self = mul3(&self.data[..], &[d, c, b, a])
         }
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn mul_assign(&mut self, other: u128) {
+        if other == 0 {
+            self.data.clear();
+        } else if other <= BigDigit::max_value() as u128 {
+            *self *= other as BigDigit
+        } else {
+            let (hi, lo) = big_digit::from_doublebigdigit(other);
+            *self = mul3(&self.data[..], &[lo, hi])
+        }
+    }
 }
 
 forward_val_ref_binop!(impl Div for BigUint, div);
@@ -1006,6 +1140,7 @@ impl DivAssign<u64> for BigUint {
 impl Div<BigUint> for u64 {
     type Output = BigUint;
 
+    #[cfg(not(u64_digit))]
     #[inline]
     fn div(self, other: BigUint) -> BigUint {
         match other.data.len() {
@@ -1015,6 +1150,16 @@ impl Div<BigUint> for u64 {
             _ => Zero::zero(),
         }
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn div(self, other: BigUint) -> BigUint {
+        match other.data.len() {
+            0 => panic!(),
+            1 => From::from(self / other.data[0]),
+            _ => Zero::zero(),
+        }
+    }
 }
 
 #[cfg(has_i128)]
@@ -1027,6 +1172,7 @@ impl Div<u128> for BigUint {
         q
     }
 }
+
 #[cfg(has_i128)]
 impl DivAssign<u128> for BigUint {
     #[inline]
@@ -1039,6 +1185,7 @@ impl DivAssign<u128> for BigUint {
 impl Div<BigUint> for u128 {
     type Output = BigUint;
 
+    #[cfg(not(u64_digit))]
     #[inline]
     fn div(self, other: BigUint) -> BigUint {
         match other.data.len() {
@@ -1054,6 +1201,17 @@ impl Div<BigUint> for u128 {
             _ => Zero::zero(),
         }
     }
+
+    #[cfg(u64_digit)]
+    #[inline]
+    fn div(self, other: BigUint) -> BigUint {
+        match other.data.len() {
+            0 => panic!(),
+            1 => From::from(self / other.data[0] as u128),
+            2 => From::from(self / big_digit::to_doublebigdigit(other.data[1], other.data[0])),
+            _ => Zero::zero(),
+        }
+    }
 }
 
 forward_val_ref_binop!(impl Rem for BigUint, rem);
@@ -1065,8 +1223,12 @@ impl Rem<BigUint> for BigUint {
 
     #[inline]
     fn rem(self, other: BigUint) -> BigUint {
-        let (_, r) = div_rem(self, other);
-        r
+        if let Some(other) = other.to_u32() {
+            &self % other
+        } else {
+            let (_, r) = div_rem(self, other);
+            r
+        }
     }
 }
 
@@ -1075,8 +1237,12 @@ impl<'a, 'b> Rem<&'b BigUint> for &'a BigUint {
 
     #[inline]
     fn rem(self, other: &BigUint) -> BigUint {
-        let (_, r) = self.div_rem(other);
-        r
+        if let Some(other) = other.to_u32() {
+            self % other
+        } else {
+            let (_, r) = self.div_rem(other);
+            r
+        }
     }
 }
 impl<'a> RemAssign<&'a BigUint> for BigUint {
@@ -1098,7 +1264,7 @@ impl<'a> Rem<u32> for &'a BigUint {
 
     #[inline]
     fn rem(self, other: u32) -> BigUint {
-        From::from(rem_digit(self, other as BigDigit))
+        rem_digit(self, other as BigDigit).into()
     }
 }
 impl RemAssign<u32> for BigUint {
@@ -1287,7 +1453,7 @@ impl Integer for BigUint {
     fn gcd(&self, other: &Self) -> Self {
         #[inline]
         fn twos(x: &BigUint) -> usize {
-            trailing_zeros(x).unwrap_or(0)
+            x.trailing_zeros().unwrap_or(0)
         }
 
         // Stein's algorithm
@@ -1850,7 +2016,7 @@ fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
     let mut res = Vec::with_capacity(radix_digits as usize);
     let mut digits = u.clone();
 
-    let (base, power) = get_radix_base(radix);
+    let (base, power) = get_radix_base(radix, big_digit::HALF_BITS);
     let radix = radix as BigDigit;
 
     while digits.data.len() > 1 {
@@ -1912,13 +2078,34 @@ pub fn to_str_radix_reversed(u: &BigUint, radix: u32) -> Vec<u8> {
     res
 }
 
+/// Creates and initializes a `BigUint`.
+///
+/// The digits are in little-endian base matching `BigDigit`.
+///
+/// This is an internal `pub(crate)`-ish API only!
+#[inline]
+pub fn biguint_from_vec(digits: Vec<BigDigit>) -> BigUint {
+    BigUint { data: digits }.normalized()
+}
+
 impl BigUint {
     /// Creates and initializes a `BigUint`.
     ///
     /// The base 2<sup>32</sup> digits are ordered least significant digit first.
     #[inline]
     pub fn new(digits: Vec<u32>) -> BigUint {
-        BigUint { data: digits }.normalized()
+        let mut big = BigUint::zero();
+
+        #[cfg(not(u64_digit))]
+        {
+            big.data = digits;
+            big.normalize();
+        }
+
+        #[cfg(u64_digit)]
+        big.assign_from_slice(&digits);
+
+        big
     }
 
     /// Creates and initializes a `BigUint`.
@@ -1926,7 +2113,9 @@ impl BigUint {
     /// The base 2<sup>32</sup> digits are ordered least significant digit first.
     #[inline]
     pub fn from_slice(slice: &[u32]) -> BigUint {
-        BigUint::new(slice.to_vec())
+        let mut big = BigUint::zero();
+        big.assign_from_slice(slice);
+        big
     }
 
     /// Assign a value to a `BigUint`.
@@ -1934,8 +2123,21 @@ impl BigUint {
     /// The base 2<sup>32</sup> digits are ordered least significant digit first.
     #[inline]
     pub fn assign_from_slice(&mut self, slice: &[u32]) {
-        self.data.resize(slice.len(), 0);
-        self.data.clone_from_slice(slice);
+        self.data.clear();
+
+        #[cfg(not(u64_digit))]
+        self.data.extend_from_slice(slice);
+
+        #[cfg(u64_digit)]
+        self.data.extend(slice.chunks(2).map(|chunk| {
+            // raw could have odd length
+            let mut digit = BigDigit::from(chunk[0]);
+            if let Some(&hi) = chunk.get(1) {
+                digit |= BigDigit::from(hi) << 32;
+            }
+            digit
+        }));
+
         self.normalize();
     }
 
@@ -2140,7 +2342,30 @@ impl BigUint {
     /// ```
     #[inline]
     pub fn to_u32_digits(&self) -> Vec<u32> {
-        self.data.clone()
+        let mut digits = Vec::new();
+
+        #[cfg(not(u64_digit))]
+        digits.clone_from(&self.data);
+
+        #[cfg(u64_digit)]
+        {
+            if let Some((&last, data)) = self.data.split_last() {
+                let last_lo = last as u32;
+                let last_hi = (last >> 32) as u32;
+                let u32_len = data.len() * 2 + 1 + (last_hi != 0) as usize;
+                digits.reserve_exact(u32_len);
+                for &x in data {
+                    digits.push(x as u32);
+                    digits.push((x >> 32) as u32);
+                }
+                digits.push(last_lo);
+                if last_hi != 0 {
+                    digits.push(last_hi);
+                }
+            }
+        }
+
+        digits
     }
 
     /// Returns the integer formatted as a string in the given radix.
@@ -2259,6 +2484,16 @@ impl BigUint {
     pub fn nth_root(&self, n: u32) -> Self {
         Roots::nth_root(self, n)
     }
+
+    /// Returns the number of least-significant bits that are zero,
+    /// or `None` if the entire number is zero.
+    pub fn trailing_zeros(&self) -> Option<usize> {
+        self.data
+            .iter()
+            .enumerate()
+            .find(|&(_, &digit)| digit != 0)
+            .map(|(i, digit)| i * big_digit::BITS + digit.trailing_zeros() as usize)
+    }
 }
 
 fn plain_modpow(base: &BigUint, exp_data: &[BigDigit], modulus: &BigUint) -> BigUint {
@@ -2360,16 +2595,6 @@ fn test_plain_modpow() {
     );
 }
 
-/// Returns the number of least-significant bits that are zero,
-/// or `None` if the entire number is zero.
-pub fn trailing_zeros(u: &BigUint) -> Option<usize> {
-    u.data
-        .iter()
-        .enumerate()
-        .find(|&(_, &digit)| digit != 0)
-        .map(|(i, digit)| i * big_digit::BITS + digit.trailing_zeros() as usize)
-}
-
 impl_sum_iter_type!(BigUint);
 impl_product_iter_type!(BigUint);
 
@@ -2406,6 +2631,7 @@ impl IntDigits for BigUint {
 
 /// Combine four `u32`s into a single `u128`.
 #[cfg(has_i128)]
+#[cfg(any(test, not(u64_digit)))]
 #[inline]
 fn u32_to_u128(a: u32, b: u32, c: u32, d: u32) -> u128 {
     u128::from(d) | (u128::from(c) << 32) | (u128::from(b) << 64) | (u128::from(a) << 96)
@@ -2413,6 +2639,7 @@ fn u32_to_u128(a: u32, b: u32, c: u32, d: u32) -> u128 {
 
 /// Split a single `u128` into four `u32`.
 #[cfg(has_i128)]
+#[cfg(any(test, not(u64_digit)))]
 #[inline]
 fn u32_from_u128(n: u128) -> (u32, u32, u32, u32) {
     (
@@ -2424,6 +2651,7 @@ fn u32_from_u128(n: u128) -> (u32, u32, u32, u32) {
 }
 
 #[cfg(feature = "serde")]
+#[cfg(not(u64_digit))]
 impl serde::Serialize for BigUint {
     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
@@ -2438,611 +2666,152 @@ impl serde::Serialize for BigUint {
 }
 
 #[cfg(feature = "serde")]
+#[cfg(u64_digit)]
+impl serde::Serialize for BigUint {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeSeq;
+        if let Some((&last, data)) = self.data.split_last() {
+            let last_lo = last as u32;
+            let last_hi = (last >> 32) as u32;
+            let u32_len = data.len() * 2 + 1 + (last_hi != 0) as usize;
+            let mut seq = serializer.serialize_seq(Some(u32_len))?;
+            for &x in data {
+                seq.serialize_element(&(x as u32))?;
+                seq.serialize_element(&((x >> 32) as u32))?;
+            }
+            seq.serialize_element(&last_lo)?;
+            if last_hi != 0 {
+                seq.serialize_element(&last_hi)?;
+            }
+            seq.end()
+        } else {
+            let data: &[u32] = &[];
+            data.serialize(serializer)
+        }
+    }
+}
+
+#[cfg(feature = "serde")]
+#[cfg(not(u64_digit))]
 impl<'de> serde::Deserialize<'de> for BigUint {
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where
         D: serde::Deserializer<'de>,
     {
         let data: Vec<u32> = Vec::deserialize(deserializer)?;
-        Ok(BigUint::new(data))
+        Ok(biguint_from_vec(data))
     }
 }
 
-/// Returns the greatest power of the radix <= big_digit::BASE
+#[cfg(feature = "serde")]
+#[cfg(u64_digit)]
+impl<'de> serde::Deserialize<'de> for BigUint {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        use serde::de::{SeqAccess, Visitor};
+
+        struct U32Visitor;
+
+        impl<'de> Visitor<'de> for U32Visitor {
+            type Value = BigUint;
+
+            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+                formatter.write_str("a sequence of unsigned 32-bit numbers")
+            }
+
+            fn visit_seq<S>(self, mut seq: S) -> Result<Self::Value, S::Error>
+            where
+                S: SeqAccess<'de>,
+            {
+                let u32_len = seq.size_hint().unwrap_or(0);
+                let len = u32_len.div_ceil(&2);
+                let mut data = Vec::with_capacity(len);
+
+                while let Some(lo) = seq.next_element::<u32>()? {
+                    let mut value = BigDigit::from(lo);
+                    if let Some(hi) = seq.next_element::<u32>()? {
+                        value |= BigDigit::from(hi) << 32;
+                    }
+                    data.push(value);
+                }
+
+                Ok(biguint_from_vec(data))
+            }
+        }
+
+        deserializer.deserialize_seq(U32Visitor)
+    }
+}
+
+/// Returns the greatest power of the radix for the given bit size
 #[inline]
-fn get_radix_base(radix: u32) -> (BigDigit, usize) {
+fn get_radix_base(radix: u32, bits: usize) -> (BigDigit, usize) {
+    mod gen {
+        include! { concat!(env!("OUT_DIR"), "/radix_bases.rs") }
+    }
+
     debug_assert!(
         2 <= radix && radix <= 256,
         "The radix must be within 2...256"
     );
     debug_assert!(!radix.is_power_of_two());
+    debug_assert!(bits <= big_digit::BITS);
 
-    // To generate this table:
-    //    for radix in 2u64..257 {
-    //        let mut power = big_digit::BITS / fls(radix as u64);
-    //        let mut base = radix.pow(power as u32);
-    //
-    //        while let Some(b) = base.checked_mul(radix) {
-    //            if b > big_digit::MAX {
-    //                break;
-    //            }
-    //            base = b;
-    //            power += 1;
-    //        }
-    //
-    //        println!("({:10}, {:2}), // {:2}", base, power, radix);
-    //    }
-    // and
-    //    for radix in 2u64..257 {
-    //        let mut power = 64 / fls(radix as u64);
-    //        let mut base = radix.pow(power as u32);
-    //
-    //        while let Some(b) = base.checked_mul(radix) {
-    //            base = b;
-    //            power += 1;
-    //        }
-    //
-    //        println!("({:20}, {:2}), // {:2}", base, power, radix);
-    //    }
-    match big_digit::BITS {
+    match bits {
+        16 => {
+            let (base, power) = gen::BASES_16[radix as usize];
+            (base as BigDigit, power)
+        }
         32 => {
-            const BASES: [(u32, usize); 257] = [
-                (0, 0),
-                (0, 0),
-                (0, 0),           //  2
-                (3486784401, 20), //  3
-                (0, 0),           //  4
-                (1220703125, 13), //  5
-                (2176782336, 12), //  6
-                (1977326743, 11), //  7
-                (0, 0),           //  8
-                (3486784401, 10), //  9
-                (1000000000, 9),  // 10
-                (2357947691, 9),  // 11
-                (429981696, 8),   // 12
-                (815730721, 8),   // 13
-                (1475789056, 8),  // 14
-                (2562890625, 8),  // 15
-                (0, 0),           // 16
-                (410338673, 7),   // 17
-                (612220032, 7),   // 18
-                (893871739, 7),   // 19
-                (1280000000, 7),  // 20
-                (1801088541, 7),  // 21
-                (2494357888, 7),  // 22
-                (3404825447, 7),  // 23
-                (191102976, 6),   // 24
-                (244140625, 6),   // 25
-                (308915776, 6),   // 26
-                (387420489, 6),   // 27
-                (481890304, 6),   // 28
-                (594823321, 6),   // 29
-                (729000000, 6),   // 30
-                (887503681, 6),   // 31
-                (0, 0),           // 32
-                (1291467969, 6),  // 33
-                (1544804416, 6),  // 34
-                (1838265625, 6),  // 35
-                (2176782336, 6),  // 36
-                (2565726409, 6),  // 37
-                (3010936384, 6),  // 38
-                (3518743761, 6),  // 39
-                (4096000000, 6),  // 40
-                (115856201, 5),   // 41
-                (130691232, 5),   // 42
-                (147008443, 5),   // 43
-                (164916224, 5),   // 44
-                (184528125, 5),   // 45
-                (205962976, 5),   // 46
-                (229345007, 5),   // 47
-                (254803968, 5),   // 48
-                (282475249, 5),   // 49
-                (312500000, 5),   // 50
-                (345025251, 5),   // 51
-                (380204032, 5),   // 52
-                (418195493, 5),   // 53
-                (459165024, 5),   // 54
-                (503284375, 5),   // 55
-                (550731776, 5),   // 56
-                (601692057, 5),   // 57
-                (656356768, 5),   // 58
-                (714924299, 5),   // 59
-                (777600000, 5),   // 60
-                (844596301, 5),   // 61
-                (916132832, 5),   // 62
-                (992436543, 5),   // 63
-                (0, 0),           // 64
-                (1160290625, 5),  // 65
-                (1252332576, 5),  // 66
-                (1350125107, 5),  // 67
-                (1453933568, 5),  // 68
-                (1564031349, 5),  // 69
-                (1680700000, 5),  // 70
-                (1804229351, 5),  // 71
-                (1934917632, 5),  // 72
-                (2073071593, 5),  // 73
-                (2219006624, 5),  // 74
-                (2373046875, 5),  // 75
-                (2535525376, 5),  // 76
-                (2706784157, 5),  // 77
-                (2887174368, 5),  // 78
-                (3077056399, 5),  // 79
-                (3276800000, 5),  // 80
-                (3486784401, 5),  // 81
-                (3707398432, 5),  // 82
-                (3939040643, 5),  // 83
-                (4182119424, 5),  // 84
-                (52200625, 4),    // 85
-                (54700816, 4),    // 86
-                (57289761, 4),    // 87
-                (59969536, 4),    // 88
-                (62742241, 4),    // 89
-                (65610000, 4),    // 90
-                (68574961, 4),    // 91
-                (71639296, 4),    // 92
-                (74805201, 4),    // 93
-                (78074896, 4),    // 94
-                (81450625, 4),    // 95
-                (84934656, 4),    // 96
-                (88529281, 4),    // 97
-                (92236816, 4),    // 98
-                (96059601, 4),    // 99
-                (100000000, 4),   // 100
-                (104060401, 4),   // 101
-                (108243216, 4),   // 102
-                (112550881, 4),   // 103
-                (116985856, 4),   // 104
-                (121550625, 4),   // 105
-                (126247696, 4),   // 106
-                (131079601, 4),   // 107
-                (136048896, 4),   // 108
-                (141158161, 4),   // 109
-                (146410000, 4),   // 110
-                (151807041, 4),   // 111
-                (157351936, 4),   // 112
-                (163047361, 4),   // 113
-                (168896016, 4),   // 114
-                (174900625, 4),   // 115
-                (181063936, 4),   // 116
-                (187388721, 4),   // 117
-                (193877776, 4),   // 118
-                (200533921, 4),   // 119
-                (207360000, 4),   // 120
-                (214358881, 4),   // 121
-                (221533456, 4),   // 122
-                (228886641, 4),   // 123
-                (236421376, 4),   // 124
-                (244140625, 4),   // 125
-                (252047376, 4),   // 126
-                (260144641, 4),   // 127
-                (0, 0),           // 128
-                (276922881, 4),   // 129
-                (285610000, 4),   // 130
-                (294499921, 4),   // 131
-                (303595776, 4),   // 132
-                (312900721, 4),   // 133
-                (322417936, 4),   // 134
-                (332150625, 4),   // 135
-                (342102016, 4),   // 136
-                (352275361, 4),   // 137
-                (362673936, 4),   // 138
-                (373301041, 4),   // 139
-                (384160000, 4),   // 140
-                (395254161, 4),   // 141
-                (406586896, 4),   // 142
-                (418161601, 4),   // 143
-                (429981696, 4),   // 144
-                (442050625, 4),   // 145
-                (454371856, 4),   // 146
-                (466948881, 4),   // 147
-                (479785216, 4),   // 148
-                (492884401, 4),   // 149
-                (506250000, 4),   // 150
-                (519885601, 4),   // 151
-                (533794816, 4),   // 152
-                (547981281, 4),   // 153
-                (562448656, 4),   // 154
-                (577200625, 4),   // 155
-                (592240896, 4),   // 156
-                (607573201, 4),   // 157
-                (623201296, 4),   // 158
-                (639128961, 4),   // 159
-                (655360000, 4),   // 160
-                (671898241, 4),   // 161
-                (688747536, 4),   // 162
-                (705911761, 4),   // 163
-                (723394816, 4),   // 164
-                (741200625, 4),   // 165
-                (759333136, 4),   // 166
-                (777796321, 4),   // 167
-                (796594176, 4),   // 168
-                (815730721, 4),   // 169
-                (835210000, 4),   // 170
-                (855036081, 4),   // 171
-                (875213056, 4),   // 172
-                (895745041, 4),   // 173
-                (916636176, 4),   // 174
-                (937890625, 4),   // 175
-                (959512576, 4),   // 176
-                (981506241, 4),   // 177
-                (1003875856, 4),  // 178
-                (1026625681, 4),  // 179
-                (1049760000, 4),  // 180
-                (1073283121, 4),  // 181
-                (1097199376, 4),  // 182
-                (1121513121, 4),  // 183
-                (1146228736, 4),  // 184
-                (1171350625, 4),  // 185
-                (1196883216, 4),  // 186
-                (1222830961, 4),  // 187
-                (1249198336, 4),  // 188
-                (1275989841, 4),  // 189
-                (1303210000, 4),  // 190
-                (1330863361, 4),  // 191
-                (1358954496, 4),  // 192
-                (1387488001, 4),  // 193
-                (1416468496, 4),  // 194
-                (1445900625, 4),  // 195
-                (1475789056, 4),  // 196
-                (1506138481, 4),  // 197
-                (1536953616, 4),  // 198
-                (1568239201, 4),  // 199
-                (1600000000, 4),  // 200
-                (1632240801, 4),  // 201
-                (1664966416, 4),  // 202
-                (1698181681, 4),  // 203
-                (1731891456, 4),  // 204
-                (1766100625, 4),  // 205
-                (1800814096, 4),  // 206
-                (1836036801, 4),  // 207
-                (1871773696, 4),  // 208
-                (1908029761, 4),  // 209
-                (1944810000, 4),  // 210
-                (1982119441, 4),  // 211
-                (2019963136, 4),  // 212
-                (2058346161, 4),  // 213
-                (2097273616, 4),  // 214
-                (2136750625, 4),  // 215
-                (2176782336, 4),  // 216
-                (2217373921, 4),  // 217
-                (2258530576, 4),  // 218
-                (2300257521, 4),  // 219
-                (2342560000, 4),  // 220
-                (2385443281, 4),  // 221
-                (2428912656, 4),  // 222
-                (2472973441, 4),  // 223
-                (2517630976, 4),  // 224
-                (2562890625, 4),  // 225
-                (2608757776, 4),  // 226
-                (2655237841, 4),  // 227
-                (2702336256, 4),  // 228
-                (2750058481, 4),  // 229
-                (2798410000, 4),  // 230
-                (2847396321, 4),  // 231
-                (2897022976, 4),  // 232
-                (2947295521, 4),  // 233
-                (2998219536, 4),  // 234
-                (3049800625, 4),  // 235
-                (3102044416, 4),  // 236
-                (3154956561, 4),  // 237
-                (3208542736, 4),  // 238
-                (3262808641, 4),  // 239
-                (3317760000, 4),  // 240
-                (3373402561, 4),  // 241
-                (3429742096, 4),  // 242
-                (3486784401, 4),  // 243
-                (3544535296, 4),  // 244
-                (3603000625, 4),  // 245
-                (3662186256, 4),  // 246
-                (3722098081, 4),  // 247
-                (3782742016, 4),  // 248
-                (3844124001, 4),  // 249
-                (3906250000, 4),  // 250
-                (3969126001, 4),  // 251
-                (4032758016, 4),  // 252
-                (4097152081, 4),  // 253
-                (4162314256, 4),  // 254
-                (4228250625, 4),  // 255
-                (0, 0),           // 256
-            ];
-
-            let (base, power) = BASES[radix as usize];
+            let (base, power) = gen::BASES_32[radix as usize];
             (base as BigDigit, power)
         }
         64 => {
-            const BASES: [(u64, usize); 257] = [
-                (0, 0),
-                (0, 0),
-                (9223372036854775808, 63),  //  2
-                (12157665459056928801, 40), //  3
-                (4611686018427387904, 31),  //  4
-                (7450580596923828125, 27),  //  5
-                (4738381338321616896, 24),  //  6
-                (3909821048582988049, 22),  //  7
-                (9223372036854775808, 21),  //  8
-                (12157665459056928801, 20), //  9
-                (10000000000000000000, 19), // 10
-                (5559917313492231481, 18),  // 11
-                (2218611106740436992, 17),  // 12
-                (8650415919381337933, 17),  // 13
-                (2177953337809371136, 16),  // 14
-                (6568408355712890625, 16),  // 15
-                (1152921504606846976, 15),  // 16
-                (2862423051509815793, 15),  // 17
-                (6746640616477458432, 15),  // 18
-                (15181127029874798299, 15), // 19
-                (1638400000000000000, 14),  // 20
-                (3243919932521508681, 14),  // 21
-                (6221821273427820544, 14),  // 22
-                (11592836324538749809, 14), // 23
-                (876488338465357824, 13),   // 24
-                (1490116119384765625, 13),  // 25
-                (2481152873203736576, 13),  // 26
-                (4052555153018976267, 13),  // 27
-                (6502111422497947648, 13),  // 28
-                (10260628712958602189, 13), // 29
-                (15943230000000000000, 13), // 30
-                (787662783788549761, 12),   // 31
-                (1152921504606846976, 12),  // 32
-                (1667889514952984961, 12),  // 33
-                (2386420683693101056, 12),  // 34
-                (3379220508056640625, 12),  // 35
-                (4738381338321616896, 12),  // 36
-                (6582952005840035281, 12),  // 37
-                (9065737908494995456, 12),  // 38
-                (12381557655576425121, 12), // 39
-                (16777216000000000000, 12), // 40
-                (550329031716248441, 11),   // 41
-                (717368321110468608, 11),   // 42
-                (929293739471222707, 11),   // 43
-                (1196683881290399744, 11),  // 44
-                (1532278301220703125, 11),  // 45
-                (1951354384207722496, 11),  // 46
-                (2472159215084012303, 11),  // 47
-                (3116402981210161152, 11),  // 48
-                (3909821048582988049, 11),  // 49
-                (4882812500000000000, 11),  // 50
-                (6071163615208263051, 11),  // 51
-                (7516865509350965248, 11),  // 52
-                (9269035929372191597, 11),  // 53
-                (11384956040305711104, 11), // 54
-                (13931233916552734375, 11), // 55
-                (16985107389382393856, 11), // 56
-                (362033331456891249, 10),   // 57
-                (430804206899405824, 10),   // 58
-                (511116753300641401, 10),   // 59
-                (604661760000000000, 10),   // 60
-                (713342911662882601, 10),   // 61
-                (839299365868340224, 10),   // 62
-                (984930291881790849, 10),   // 63
-                (1152921504606846976, 10),  // 64
-                (1346274334462890625, 10),  // 65
-                (1568336880910795776, 10),  // 66
-                (1822837804551761449, 10),  // 67
-                (2113922820157210624, 10),  // 68
-                (2446194060654759801, 10),  // 69
-                (2824752490000000000, 10),  // 70
-                (3255243551009881201, 10),  // 71
-                (3743906242624487424, 10),  // 72
-                (4297625829703557649, 10),  // 73
-                (4923990397355877376, 10),  // 74
-                (5631351470947265625, 10),  // 75
-                (6428888932339941376, 10),  // 76
-                (7326680472586200649, 10),  // 77
-                (8335775831236199424, 10),  // 78
-                (9468276082626847201, 10),  // 79
-                (10737418240000000000, 10), // 80
-                (12157665459056928801, 10), // 81
-                (13744803133596058624, 10), // 82
-                (15516041187205853449, 10), // 83
-                (17490122876598091776, 10), // 84
-                (231616946283203125, 9),    // 85
-                (257327417311663616, 9),    // 86
-                (285544154243029527, 9),    // 87
-                (316478381828866048, 9),    // 88
-                (350356403707485209, 9),    // 89
-                (387420489000000000, 9),    // 90
-                (427929800129788411, 9),    // 91
-                (472161363286556672, 9),    // 92
-                (520411082988487293, 9),    // 93
-                (572994802228616704, 9),    // 94
-                (630249409724609375, 9),    // 95
-                (692533995824480256, 9),    // 96
-                (760231058654565217, 9),    // 97
-                (833747762130149888, 9),    // 98
-                (913517247483640899, 9),    // 99
-                (1000000000000000000, 9),   // 100
-                (1093685272684360901, 9),   // 101
-                (1195092568622310912, 9),   // 102
-                (1304773183829244583, 9),   // 103
-                (1423311812421484544, 9),   // 104
-                (1551328215978515625, 9),   // 105
-                (1689478959002692096, 9),   // 106
-                (1838459212420154507, 9),   // 107
-                (1999004627104432128, 9),   // 108
-                (2171893279442309389, 9),   // 109
-                (2357947691000000000, 9),   // 110
-                (2558036924386500591, 9),   // 111
-                (2773078757450186752, 9),   // 112
-                (3004041937984268273, 9),   // 113
-                (3251948521156637184, 9),   // 114
-                (3517876291919921875, 9),   // 115
-                (3802961274698203136, 9),   // 116
-                (4108400332687853397, 9),   // 117
-                (4435453859151328768, 9),   // 118
-                (4785448563124474679, 9),   // 119
-                (5159780352000000000, 9),   // 120
-                (5559917313492231481, 9),   // 121
-                (5987402799531080192, 9),   // 122
-                (6443858614676334363, 9),   // 123
-                (6930988311686938624, 9),   // 124
-                (7450580596923828125, 9),   // 125
-                (8004512848309157376, 9),   // 126
-                (8594754748609397887, 9),   // 127
-                (9223372036854775808, 9),   // 128
-                (9892530380752880769, 9),   // 129
-                (10604499373000000000, 9),  // 130
-                (11361656654439817571, 9),  // 131
-                (12166492167065567232, 9),  // 132
-                (13021612539908538853, 9),  // 133
-                (13929745610903012864, 9),  // 134
-                (14893745087865234375, 9),  // 135
-                (15916595351771938816, 9),  // 136
-                (17001416405572203977, 9),  // 137
-                (18151468971815029248, 9),  // 138
-                (139353667211683681, 8),    // 139
-                (147578905600000000, 8),    // 140
-                (156225851787813921, 8),    // 141
-                (165312903998914816, 8),    // 142
-                (174859124550883201, 8),    // 143
-                (184884258895036416, 8),    // 144
-                (195408755062890625, 8),    // 145
-                (206453783524884736, 8),    // 146
-                (218041257467152161, 8),    // 147
-                (230193853492166656, 8),    // 148
-                (242935032749128801, 8),    // 149
-                (256289062500000000, 8),    // 150
-                (270281038127131201, 8),    // 151
-                (284936905588473856, 8),    // 152
-                (300283484326400961, 8),    // 153
-                (316348490636206336, 8),    // 154
-                (333160561500390625, 8),    // 155
-                (350749278894882816, 8),    // 156
-                (369145194573386401, 8),    // 157
-                (388379855336079616, 8),    // 158
-                (408485828788939521, 8),    // 159
-                (429496729600000000, 8),    // 160
-                (451447246258894081, 8),    // 161
-                (474373168346071296, 8),    // 162
-                (498311414318121121, 8),    // 163
-                (523300059815673856, 8),    // 164
-                (549378366500390625, 8),    // 165
-                (576586811427594496, 8),    // 166
-                (604967116961135041, 8),    // 167
-                (634562281237118976, 8),    // 168
-                (665416609183179841, 8),    // 169
-                (697575744100000000, 8),    // 170
-                (731086699811838561, 8),    // 171
-                (765997893392859136, 8),    // 172
-                (802359178476091681, 8),    // 173
-                (840221879151902976, 8),    // 174
-                (879638824462890625, 8),    // 175
-                (920664383502155776, 8),    // 176
-                (963354501121950081, 8),    // 177
-                (1007766734259732736, 8),   // 178
-                (1053960288888713761, 8),   // 179
-                (1101996057600000000, 8),   // 180
-                (1151936657823500641, 8),   // 181
-                (1203846470694789376, 8),   // 182
-                (1257791680575160641, 8),   // 183
-                (1313840315232157696, 8),   // 184
-                (1372062286687890625, 8),   // 185
-                (1432529432742502656, 8),   // 186
-                (1495315559180183521, 8),   // 187
-                (1560496482665168896, 8),   // 188
-                (1628150074335205281, 8),   // 189
-                (1698356304100000000, 8),   // 190
-                (1771197285652216321, 8),   // 191
-                (1846757322198614016, 8),   // 192
-                (1925122952918976001, 8),   // 193
-                (2006383000160502016, 8),   // 194
-                (2090628617375390625, 8),   // 195
-                (2177953337809371136, 8),   // 196
-                (2268453123948987361, 8),   // 197
-                (2362226417735475456, 8),   // 198
-                (2459374191553118401, 8),   // 199
-                (2560000000000000000, 8),   // 200
-                (2664210032449121601, 8),   // 201
-                (2772113166407885056, 8),   // 202
-                (2883821021683985761, 8),   // 203
-                (2999448015365799936, 8),   // 204
-                (3119111417625390625, 8),   // 205
-                (3242931408352297216, 8),   // 206
-                (3371031134626313601, 8),   // 207
-                (3503536769037500416, 8),   // 208
-                (3640577568861717121, 8),   // 209
-                (3782285936100000000, 8),   // 210
-                (3928797478390152481, 8),   // 211
-                (4080251070798954496, 8),   // 212
-                (4236788918503437921, 8),   // 213
-                (4398556620369715456, 8),   // 214
-                (4565703233437890625, 8),   // 215
-                (4738381338321616896, 8),   // 216
-                (4916747105530914241, 8),   // 217
-                (5100960362726891776, 8),   // 218
-                (5291184662917065441, 8),   // 219
-                (5487587353600000000, 8),   // 220
-                (5690339646868044961, 8),   // 221
-                (5899616690476974336, 8),   // 222
-                (6115597639891380481, 8),   // 223
-                (6338465731314712576, 8),   // 224
-                (6568408355712890625, 8),   // 225
-                (6805617133840466176, 8),   // 226
-                (7050287992278341281, 8),   // 227
-                (7302621240492097536, 8),   // 228
-                (7562821648920027361, 8),   // 229
-                (7831098528100000000, 8),   // 230
-                (8107665808844335041, 8),   // 231
-                (8392742123471896576, 8),   // 232
-                (8686550888106661441, 8),   // 233
-                (8989320386052055296, 8),   // 234
-                (9301283852250390625, 8),   // 235
-                (9622679558836781056, 8),   // 236
-                (9953750901796946721, 8),   // 237
-                (10294746488738365696, 8),  // 238
-                (10645920227784266881, 8),  // 239
-                (11007531417600000000, 8),  // 240
-                (11379844838561358721, 8),  // 241
-                (11763130845074473216, 8),  // 242
-                (12157665459056928801, 8),  // 243
-                (12563730464589807616, 8),  // 244
-                (12981613503750390625, 8),  // 245
-                (13411608173635297536, 8),  // 246
-                (13854014124583882561, 8),  // 247
-                (14309137159611744256, 8),  // 248
-                (14777289335064248001, 8),  // 249
-                (15258789062500000000, 8),  // 250
-                (15753961211814252001, 8),  // 251
-                (16263137215612256256, 8),  // 252
-                (16786655174842630561, 8),  // 253
-                (17324859965700833536, 8),  // 254
-                (17878103347812890625, 8),  // 255
-                (72057594037927936, 7),     // 256
-            ];
-
-            let (base, power) = BASES[radix as usize];
+            let (base, power) = gen::BASES_64[radix as usize];
             (base as BigDigit, power)
         }
         _ => panic!("Invalid bigdigit size"),
     }
 }
 
+#[cfg(not(u64_digit))]
 #[test]
 fn test_from_slice() {
-    fn check(slice: &[BigDigit], data: &[BigDigit]) {
-        assert!(BigUint::from_slice(slice).data == data);
+    fn check(slice: &[u32], data: &[BigDigit]) {
+        assert_eq!(BigUint::from_slice(slice).data, data);
     }
     check(&[1], &[1]);
     check(&[0, 0, 0], &[]);
     check(&[1, 2, 0, 0], &[1, 2]);
     check(&[0, 0, 1, 2], &[0, 0, 1, 2]);
     check(&[0, 0, 1, 2, 0, 0], &[0, 0, 1, 2]);
-    check(&[-1i32 as BigDigit], &[-1i32 as BigDigit]);
+    check(&[-1i32 as u32], &[-1i32 as BigDigit]);
 }
 
+#[cfg(u64_digit)]
 #[test]
-fn test_assign_from_slice() {
-    fn check(slice: &[BigDigit], data: &[BigDigit]) {
-        let mut p = BigUint::from_slice(&[2627_u32, 0_u32, 9182_u32, 42_u32]);
-        p.assign_from_slice(slice);
-        assert!(p.data == data);
+fn test_from_slice() {
+    fn check(slice: &[u32], data: &[BigDigit]) {
+        assert_eq!(
+            BigUint::from_slice(slice).data,
+            data,
+            "from {:?}, to {:?}",
+            slice,
+            data
+        );
     }
     check(&[1], &[1]);
     check(&[0, 0, 0], &[]);
-    check(&[1, 2, 0, 0], &[1, 2]);
-    check(&[0, 0, 1, 2], &[0, 0, 1, 2]);
-    check(&[0, 0, 1, 2, 0, 0], &[0, 0, 1, 2]);
-    check(&[-1i32 as BigDigit], &[-1i32 as BigDigit]);
+    check(&[1, 2], &[8_589_934_593]);
+    check(&[1, 2, 0, 0], &[8_589_934_593]);
+    check(&[0, 0, 1, 2], &[0, 8_589_934_593]);
+    check(&[0, 0, 1, 2, 0, 0], &[0, 8_589_934_593]);
+    check(&[-1i32 as u32], &[(-1i32 as u32) as BigDigit]);
 }
 
 #[cfg(has_i128)]
diff --git a/src/lib.rs b/src/lib.rs
index 837870a6..236ff7a3 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -196,19 +196,34 @@ pub use bigrand::{RandBigInt, RandomBits, UniformBigInt, UniformBigUint};
 
 mod big_digit {
     /// A `BigDigit` is a `BigUint`'s composing element.
+    #[cfg(not(u64_digit))]
     pub type BigDigit = u32;
+    #[cfg(u64_digit)]
+    pub type BigDigit = u64;
 
     /// A `DoubleBigDigit` is the internal type used to do the computations.  Its
     /// size is the double of the size of `BigDigit`.
+    #[cfg(not(u64_digit))]
     pub type DoubleBigDigit = u64;
+    #[cfg(u64_digit)]
+    pub type DoubleBigDigit = u128;
 
     /// A `SignedDoubleBigDigit` is the signed version of `DoubleBigDigit`.
+    #[cfg(not(u64_digit))]
     pub type SignedDoubleBigDigit = i64;
+    #[cfg(u64_digit)]
+    pub type SignedDoubleBigDigit = i128;
 
     // `DoubleBigDigit` size dependent
+    #[cfg(not(u64_digit))]
     pub const BITS: usize = 32;
+    #[cfg(u64_digit)]
+    pub const BITS: usize = 64;
 
-    const LO_MASK: DoubleBigDigit = (-1i32 as DoubleBigDigit) >> BITS;
+    pub const HALF_BITS: usize = BITS / 2;
+    pub const HALF: BigDigit = (1 << HALF_BITS) - 1;
+
+    const LO_MASK: DoubleBigDigit = (1 << BITS) - 1;
 
     #[inline]
     fn get_hi(n: DoubleBigDigit) -> BigDigit {
diff --git a/src/monty.rs b/src/monty.rs
index 62f59b38..88b417aa 100644
--- a/src/monty.rs
+++ b/src/monty.rs
@@ -1,129 +1,222 @@
-use integer::Integer;
-use traits::Zero;
+use std::mem;
+use std::ops::Shl;
+use traits::{One, Zero};
 
+use big_digit::{self, BigDigit, DoubleBigDigit, SignedDoubleBigDigit};
 use biguint::BigUint;
 
-struct MontyReducer<'a> {
-    n: &'a BigUint,
-    n0inv: u32,
+struct MontyReducer {
+    n0inv: BigDigit,
 }
 
-// Calculate the modular inverse of `num`, using Extended GCD.
-//
-// Reference:
-// Brent & Zimmermann, Modern Computer Arithmetic, v0.5.9, Algorithm 1.20
-fn inv_mod_u32(num: u32) -> u32 {
-    // num needs to be relatively prime to 2**32 -- i.e. it must be odd.
-    assert!(num % 2 != 0);
-
-    let mut a: i64 = i64::from(num);
-    let mut b: i64 = i64::from(u32::max_value()) + 1;
-
-    // ExtendedGcd
-    // Input: positive integers a and b
-    // Output: integers (g, u, v) such that g = gcd(a, b) = ua + vb
-    // As we don't need v for modular inverse, we don't calculate it.
-
-    // 1: (u, w) <- (1, 0)
-    let mut u = 1;
-    let mut w = 0;
-    // 3: while b != 0
-    while b != 0 {
-        // 4: (q, r) <- DivRem(a, b)
-        let q = a / b;
-        let r = a % b;
-        // 5: (a, b) <- (b, r)
-        a = b;
-        b = r;
-        // 6: (u, w) <- (w, u - qw)
-        let m = u - w * q;
-        u = w;
-        w = m;
+// k0 = -m**-1 mod 2**BITS. Algorithm from: Dumas, J.G. "On Newton–Raphson
+// Iteration for Multiplicative Inverses Modulo Prime Powers".
+fn inv_mod_alt(b: BigDigit) -> BigDigit {
+    assert_ne!(b & 1, 0);
+
+    let mut k0 = 2 - b as SignedDoubleBigDigit;
+    let mut t = (b - 1) as SignedDoubleBigDigit;
+    let mut i = 1;
+    while i < big_digit::BITS {
+        t = t.wrapping_mul(t);
+        k0 = k0.wrapping_mul(t + 1);
+
+        i <<= 1;
     }
+    -k0 as BigDigit
+}
 
-    assert!(a == 1);
-    // Downcasting acts like a mod 2^32 too.
-    u as u32
+impl MontyReducer {
+    fn new(n: &BigUint) -> Self {
+        let n0inv = inv_mod_alt(n.data[0]);
+        MontyReducer { n0inv: n0inv }
+    }
 }
 
-impl<'a> MontyReducer<'a> {
-    fn new(n: &'a BigUint) -> Self {
-        let n0inv = inv_mod_u32(n.data[0]);
-        MontyReducer { n: n, n0inv: n0inv }
+/// Computes z mod m = x * y * 2 ** (-n*_W) mod m
+/// assuming k = -1/m mod 2**_W
+/// See Gueron, "Efficient Software Implementations of Modular Exponentiation".
+/// https://eprint.iacr.org/2011/239.pdf
+/// In the terminology of that paper, this is an "Almost Montgomery Multiplication":
+/// x and y are required to satisfy 0 <= z < 2**(n*_W) and then the result
+/// z is guaranteed to satisfy 0 <= z < 2**(n*_W), but it may not be < m.
+fn montgomery(x: &BigUint, y: &BigUint, m: &BigUint, k: BigDigit, n: usize) -> BigUint {
+    // This code assumes x, y, m are all the same length, n.
+    // (required by addMulVVW and the for loop).
+    // It also assumes that x, y are already reduced mod m,
+    // or else the result will not be properly reduced.
+    assert!(
+        x.data.len() == n && y.data.len() == n && m.data.len() == n,
+        "{:?} {:?} {:?} {}",
+        x,
+        y,
+        m,
+        n
+    );
+
+    let mut z = BigUint::zero();
+    z.data.resize(n * 2, 0);
+
+    let mut c: BigDigit = 0;
+    for i in 0..n {
+        let c2 = add_mul_vvw(&mut z.data[i..n + i], &x.data, y.data[i]);
+        let t = z.data[i].wrapping_mul(k);
+        let c3 = add_mul_vvw(&mut z.data[i..n + i], &m.data, t);
+        let cx = c.wrapping_add(c2);
+        let cy = cx.wrapping_add(c3);
+        z.data[n + i] = cy;
+        if cx < c2 || cy < c3 {
+            c = 1;
+        } else {
+            c = 0;
+        }
     }
+
+    if c == 0 {
+        z.data = z.data[n..].to_vec();
+    } else {
+        {
+            let (mut first, second) = z.data.split_at_mut(n);
+            sub_vv(&mut first, &second, &m.data);
+        }
+        z.data = z.data[..n].to_vec();
+    }
+
+    z
 }
 
-// Montgomery Reduction
-//
-// Reference:
-// Brent & Zimmermann, Modern Computer Arithmetic, v0.5.9, Algorithm 2.6
-fn monty_redc(a: BigUint, mr: &MontyReducer) -> BigUint {
-    let mut c = a.data;
-    let n = &mr.n.data;
-    let n_size = n.len();
-
-    // Allocate sufficient work space
-    c.resize(2 * n_size + 2, 0);
-
-    // β is the size of a word, in this case 32 bits. So "a mod β" is
-    // equivalent to masking a to 32 bits.
-    // mu <- -N^(-1) mod β
-    let mu = 0u32.wrapping_sub(mr.n0inv);
-
-    // 1: for i = 0 to (n-1)
-    for i in 0..n_size {
-        // 2: q_i <- mu*c_i mod β
-        let q_i = c[i].wrapping_mul(mu);
-
-        // 3: C <- C + q_i * N * β^i
-        super::algorithms::mac_digit(&mut c[i..], n, q_i);
+#[inline(always)]
+fn add_mul_vvw(z: &mut [BigDigit], x: &[BigDigit], y: BigDigit) -> BigDigit {
+    let mut c = 0;
+    for (zi, xi) in z.iter_mut().zip(x.iter()) {
+        let (z1, z0) = mul_add_www(*xi, y, *zi);
+        let (c_, zi_) = add_ww(z0, c, 0);
+        *zi = zi_;
+        c = c_ + z1;
     }
 
-    // 4: R <- C * β^(-n)
-    // This is an n-word bitshift, equivalent to skipping n words.
-    let ret = BigUint::new(c[n_size..].to_vec());
+    c
+}
 
-    // 5: if R >= β^n then return R-N else return R.
-    if ret < *mr.n {
-        ret
-    } else {
-        ret - mr.n
+/// The resulting carry c is either 0 or 1.
+#[inline(always)]
+fn sub_vv(z: &mut [BigDigit], x: &[BigDigit], y: &[BigDigit]) -> BigDigit {
+    let mut c = 0;
+    for (i, (xi, yi)) in x.iter().zip(y.iter()).enumerate().take(z.len()) {
+        let zi = xi.wrapping_sub(*yi).wrapping_sub(c);
+        z[i] = zi;
+        // see "Hacker's Delight", section 2-12 (overflow detection)
+        c = ((yi & !xi) | ((yi | !xi) & zi)) >> (big_digit::BITS - 1)
     }
+
+    c
 }
 
-// Montgomery Multiplication
-fn monty_mult(a: BigUint, b: &BigUint, mr: &MontyReducer) -> BigUint {
-    monty_redc(a * b, mr)
+/// z1<<_W + z0 = x+y+c, with c == 0 or 1
+#[inline(always)]
+fn add_ww(x: BigDigit, y: BigDigit, c: BigDigit) -> (BigDigit, BigDigit) {
+    let yc = y.wrapping_add(c);
+    let z0 = x.wrapping_add(yc);
+    let z1 = if z0 < x || yc < y { 1 } else { 0 };
+
+    (z1, z0)
 }
 
-// Montgomery Squaring
-fn monty_sqr(a: BigUint, mr: &MontyReducer) -> BigUint {
-    // TODO: Replace with an optimised squaring function
-    monty_redc(&a * &a, mr)
+/// z1 << _W + z0 = x * y + c
+#[inline(always)]
+fn mul_add_www(x: BigDigit, y: BigDigit, c: BigDigit) -> (BigDigit, BigDigit) {
+    let z = x as DoubleBigDigit * y as DoubleBigDigit + c as DoubleBigDigit;
+    ((z >> big_digit::BITS) as BigDigit, z as BigDigit)
 }
 
-pub fn monty_modpow(a: &BigUint, exp: &BigUint, modulus: &BigUint) -> BigUint {
-    let mr = MontyReducer::new(modulus);
+/// Calculates x ** y mod m using a fixed, 4-bit window.
+pub fn monty_modpow(x: &BigUint, y: &BigUint, m: &BigUint) -> BigUint {
+    assert!(m.data[0] & 1 == 1);
+    let mr = MontyReducer::new(m);
+    let num_words = m.data.len();
 
-    // Calculate the Montgomery parameter
-    let mut v = vec![0; modulus.data.len()];
-    v.push(1);
-    let r = BigUint::new(v);
+    let mut x = x.clone();
+
+    // We want the lengths of x and m to be equal.
+    // It is OK if x >= m as long as len(x) == len(m).
+    if x.data.len() > num_words {
+        x %= m;
+        // Note: now len(x) <= numWords, not guaranteed ==.
+    }
+    if x.data.len() < num_words {
+        x.data.resize(num_words, 0);
+    }
 
-    // Map the base to the Montgomery domain
-    let mut apri = a * &r % modulus;
+    // rr = 2**(2*_W*len(m)) mod m
+    let mut rr = BigUint::one();
+    rr = (rr.shl(2 * num_words * big_digit::BITS)) % m;
+    if rr.data.len() < num_words {
+        rr.data.resize(num_words, 0);
+    }
+    // one = 1, with equal length to that of m
+    let mut one = BigUint::one();
+    one.data.resize(num_words, 0);
+
+    let n = 4;
+    // powers[i] contains x^i
+    let mut powers = Vec::with_capacity(1 << n);
+    powers.push(montgomery(&one, &rr, m, mr.n0inv, num_words));
+    powers.push(montgomery(&x, &rr, m, mr.n0inv, num_words));
+    for i in 2..1 << n {
+        let r = montgomery(&powers[i - 1], &powers[1], m, mr.n0inv, num_words);
+        powers.push(r);
+    }
+
+    // initialize z = 1 (Montgomery 1)
+    let mut z = powers[0].clone();
+    z.data.resize(num_words, 0);
+    let mut zz = BigUint::zero();
+    zz.data.resize(num_words, 0);
+
+    // same windowed exponent, but with Montgomery multiplications
+    for i in (0..y.data.len()).rev() {
+        let mut yi = y.data[i];
+        let mut j = 0;
+        while j < big_digit::BITS {
+            if i != y.data.len() - 1 || j != 0 {
+                zz = montgomery(&z, &z, m, mr.n0inv, num_words);
+                z = montgomery(&zz, &zz, m, mr.n0inv, num_words);
+                zz = montgomery(&z, &z, m, mr.n0inv, num_words);
+                z = montgomery(&zz, &zz, m, mr.n0inv, num_words);
+            }
+            zz = montgomery(
+                &z,
+                &powers[(yi >> (big_digit::BITS - n)) as usize],
+                m,
+                mr.n0inv,
+                num_words,
+            );
+            mem::swap(&mut z, &mut zz);
+            yi <<= n;
+            j += n;
+        }
+    }
 
-    // Binary exponentiation
-    let mut ans = &r % modulus;
-    let mut e = exp.clone();
-    while !e.is_zero() {
-        if e.is_odd() {
-            ans = monty_mult(ans, &apri, &mr);
+    // convert to regular number
+    zz = montgomery(&z, &one, m, mr.n0inv, num_words);
+
+    zz.normalize();
+    // One last reduction, just in case.
+    // See golang.org/issue/13907.
+    if zz >= *m {
+        // Common case is m has high bit set; in that case,
+        // since zz is the same length as m, there can be just
+        // one multiple of m to remove. Just subtract.
+        // We think that the subtract should be sufficient in general,
+        // so do that unconditionally, but double-check,
+        // in case our beliefs are wrong.
+        // The div is not expected to be reached.
+        zz -= m;
+        if zz >= *m {
+            zz %= m;
         }
-        apri = monty_sqr(apri, &mr);
-        e >>= 1;
     }
 
-    // Map the result back to the residues domain
-    monty_redc(ans, &mr)
+    zz.normalize();
+    zz
 }
diff --git a/tests/bigint_scalar.rs b/tests/bigint_scalar.rs
index ae9a6d7a..8b199726 100644
--- a/tests/bigint_scalar.rs
+++ b/tests/bigint_scalar.rs
@@ -98,15 +98,14 @@ fn test_scalar_div_rem() {
         assert!(q == *ans_q);
         assert!(r == *ans_r);
 
+        let b = BigInt::from(b);
         let (a, b, ans_q, ans_r) = (a.clone(), b.clone(), ans_q.clone(), ans_r.clone());
-        assert_op!(a / b == ans_q);
-        assert_op!(a % b == ans_r);
+        assert_signed_scalar_op!(a / b == ans_q);
+        assert_signed_scalar_op!(a % b == ans_r);
 
-        if b <= i32::max_value() as u32 {
-            let nb = -(b as i32);
-            assert_op!(a / nb == -ans_q.clone());
-            assert_op!(a % nb == ans_r);
-        }
+        let nb = -b;
+        assert_signed_scalar_op!(a / nb == -ans_q.clone());
+        assert_signed_scalar_op!(a % nb == ans_r);
     }
 
     fn check(a: &BigInt, b: u32, q: &BigInt, r: &BigInt) {
diff --git a/tests/modpow.rs b/tests/modpow.rs
index a5f8b928..e6784d84 100644
--- a/tests/modpow.rs
+++ b/tests/modpow.rs
@@ -76,7 +76,7 @@ mod biguint {
     }
 
     #[test]
-    fn test_modpow() {
+    fn test_modpow_single() {
         check_modpow::<u32>(1, 0, 11, 1);
         check_modpow::<u32>(0, 15, 11, 0);
         check_modpow::<u32>(3, 7, 11, 9);
@@ -119,7 +119,7 @@ mod bigint {
 
     fn check_modpow<T: Into<BigInt>>(b: T, e: T, m: T, r: T) {
         fn check(b: &BigInt, e: &BigInt, m: &BigInt, r: &BigInt) {
-            assert_eq!(&b.modpow(e, m), r);
+            assert_eq!(&b.modpow(e, m), r, "{} ** {} (mod {}) != {}", b, e, m, r);
 
             let even_m = m << 1;
             let even_modpow = b.modpow(e, m);
diff --git a/tests/serde.rs b/tests/serde.rs
index 0f3d4868..a0251b9c 100644
--- a/tests/serde.rs
+++ b/tests/serde.rs
@@ -101,3 +101,30 @@ fn bigint_factorial_100() {
 
     assert_tokens(&n, &tokens);
 }
+
+#[test]
+fn big_digits() {
+    // Try a few different lengths for u32/u64 digit coverage
+    for len in 1..10 {
+        let digits = 1u32..len + 1;
+        let n = BigUint::new(digits.clone().collect());
+
+        let mut tokens = vec![];
+        tokens.push(Token::Seq {
+            len: Some(len as usize),
+        });
+        tokens.extend(digits.map(Token::U32));
+        tokens.push(Token::SeqEnd);
+
+        assert_tokens(&n, &tokens);
+
+        let n = BigInt::from(n);
+        tokens.insert(0, Token::Tuple { len: 2 });
+        tokens.insert(1, Token::I8(1));
+        tokens.push(Token::TupleEnd);
+        assert_tokens(&n, &tokens);
+
+        tokens[1] = Token::I8(-1);
+        assert_tokens(&-n, &tokens);
+    }
+}