From 83e8d2b2fd70f02a73a36b4619ad4a28d500f411 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Wed, 23 Feb 2022 13:20:21 -0800 Subject: [PATCH] Use inline asm! for x86 DIV on Rust 1.59+ --- .github/workflows/ci.yaml | 1 + bors.toml | 1 + build.rs | 5 +++++ src/biguint/convert.rs | 13 ++++++++--- src/biguint/division.rs | 45 ++++++++++++++++++++++++++++++++++++--- 5 files changed, 59 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 28433681..b6143635 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -18,6 +18,7 @@ jobs: 1.36.0, # alloc, rand 1.40.0, # arbitrary 1.46.0, # quickcheck + 1.59.0, # asm! stable, beta, nightly diff --git a/bors.toml b/bors.toml index 163c03ab..d31295cf 100644 --- a/bors.toml +++ b/bors.toml @@ -4,6 +4,7 @@ status = [ "Test (1.36.0)", "Test (1.40.0)", "Test (1.46.0)", + "Test (1.59.0)", "Test (stable)", "Test (beta)", "Test (nightly)", diff --git a/build.rs b/build.rs index 3daed5e8..bc60d6db 100644 --- a/build.rs +++ b/build.rs @@ -10,6 +10,7 @@ fn main() { if u64_digit { autocfg::emit("u64_digit"); } + let ac = autocfg::new(); let std = if ac.probe_sysroot_crate("std") { "std" @@ -28,6 +29,10 @@ fn main() { if ac.probe_path(&addcarry) { autocfg::emit("use_addcarry"); } + + if ac.probe_path(&format!("{}::arch::asm", std)) { + autocfg::emit("use_x86_div"); + } } } diff --git a/src/biguint/convert.rs b/src/biguint/convert.rs index 5cf05cb6..3b58818a 100644 --- a/src/biguint/convert.rs +++ b/src/biguint/convert.rs @@ -657,7 +657,14 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec { let mut digits = u.clone(); - let (base, power) = get_radix_base(radix, big_digit::HALF_BITS); + // X86 DIV can quickly divide by a full digit, otherwise we choose a divisor + // that's suitable for `div_half` to avoid slow `DoubleBigDigit` division. + let bits = if cfg!(use_x86_div) { + big_digit::BITS + } else { + big_digit::HALF_BITS + }; + let (base, power) = get_radix_base(radix, bits); let radix = radix as BigDigit; // For very large numbers, the O(n²) loop of repeated `div_rem_digit` dominates the @@ -665,8 +672,8 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec { // The threshold for this was chosen by anecdotal performance measurements to // approximate where this starts to make a noticeable difference. if digits.data.len() >= 64 { - let mut big_base = BigUint::from(base * base); - let mut big_power = 2usize; + let mut big_base = BigUint::from(base); + let mut big_power = 1usize; // Choose a target base length near √n. let target_len = digits.data.len().sqrt(); diff --git a/src/biguint/division.rs b/src/biguint/division.rs index b5d4259c..a4870df9 100644 --- a/src/biguint/division.rs +++ b/src/biguint/division.rs @@ -18,6 +18,7 @@ use num_traits::{CheckedDiv, One, ToPrimitive, Zero}; /// This is _not_ true for an arbitrary numerator/denominator. /// /// (This function also matches what the x86 divide instruction does). +#[cfg(not(use_x86_div))] #[inline] fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) { debug_assert!(hi < divisor); @@ -27,6 +28,44 @@ fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigi ((lhs / rhs) as BigDigit, (lhs % rhs) as BigDigit) } +/// With Rust 1.59+ for stable `asm!`, x86 and x86_64 can use a real `div` instruction. +#[cfg(use_x86_div)] +#[inline] +fn div_wide(hi: BigDigit, lo: BigDigit, divisor: BigDigit) -> (BigDigit, BigDigit) { + // This debug assertion covers the potential #DE for divisor==0 or a quotient too large for one + // register, otherwise in release mode it will become a target-specific fault like SIGFPE. + // This should never occur with the inputs from our few `div_wide` callers. + debug_assert!(hi < divisor); + + // SAFETY: The `div` instruction only affects registers, reading the explicit operand as the + // divisor, and implicitly reading RDX:RAX or EDX:EAX as the dividend. The result is implicitly + // written back to RAX or EAX for the quotient and RDX or EDX for the remainder. No memory is + // used, and flags are not preserved. + unsafe { + let (div, rem); + + #[cfg(u64_digit)] + core::arch::asm!( + "div {:r}", + in(reg) divisor, + inout("rdx") hi => rem, + inout("rax") lo => div, + options(pure, nomem, nostack), + ); + + #[cfg(not(u64_digit))] + core::arch::asm!( + "div {:e}", + in(reg) divisor, + inout("edx") hi => rem, + inout("eax") lo => div, + options(pure, nomem, nostack), + ); + + (div, rem) + } +} + /// For small divisors, we can divide without promoting to `DoubleBigDigit` by /// using half-size pieces of digit, like long-division. #[inline] @@ -47,7 +86,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit) let mut rem = 0; - if b <= big_digit::HALF { + if !cfg!(use_x86_div) && b <= big_digit::HALF { for d in a.data.iter_mut().rev() { let (q, r) = div_half(rem, *d, b); *d = q; @@ -72,7 +111,7 @@ fn rem_digit(a: &BigUint, b: BigDigit) -> BigDigit { let mut rem = 0; - if b <= big_digit::HALF { + if !cfg!(use_x86_div) && b <= big_digit::HALF { for &digit in a.data.iter().rev() { let (_, r) = div_half(rem, digit, b); rem = r; @@ -232,7 +271,7 @@ fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) { let mut a0 = 0; // [b1, b0] are the two most significant digits of the divisor. They never change. - let b0 = *b.last().unwrap(); + let b0 = b[b.len() - 1]; let b1 = b[b.len() - 2]; let q_len = a.data.len() - b.len() + 1;