From 849e9b2c837949e6b124b2e56d01f27b95faa57e Mon Sep 17 00:00:00 2001 From: TheIronBorn <> Date: Tue, 19 Jun 2018 19:50:35 -0700 Subject: [PATCH] simd shuffle --- rand_core/src/lib.rs | 6 +-- src/lib.rs | 19 ++++++- src/prng/mod.rs | 9 ++-- src/rngs/jitter.rs | 4 +- src/rngs/os.rs | 4 +- src/simd_shuffle.rs | 120 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 150 insertions(+), 12 deletions(-) create mode 100644 src/simd_shuffle.rs diff --git a/rand_core/src/lib.rs b/rand_core/src/lib.rs index d098602fef8..3cd2a630ffd 100644 --- a/rand_core/src/lib.rs +++ b/rand_core/src/lib.rs @@ -92,7 +92,7 @@ pub mod simd_impls; /// to avoid platform differences, and avoid making any changes which affect /// output (except by communicating that the release has breaking changes). /// -/// Typically implementators will implement only one of the methods available +/// Typically implementors will implement only one of the methods available /// in this trait directly, then use the helper functions from the /// [`rand_core::impls`] module to implement the other methods. /// @@ -209,7 +209,7 @@ pub trait RngCore { /// Some generators may satisfy an additional property, however this is not /// required by this trait: if the CSPRNG's state is revealed, it should not be /// computationally-feasible to reconstruct output prior to this. Some other -/// generators allow backwards-computation and are consided *reversible*. +/// generators allow backwards-computation and are considered *reversible*. /// /// Note that this trait is provided for guidance only and cannot guarantee /// suitability for cryptographic applications. 
In general it should only be @@ -293,7 +293,7 @@ pub trait SeedableRng: Sized { /// /// It is however not required that this function yield the same state as a /// reference implementation of the PRNG given equivalent seed; if necessary - /// another constructor replicating behaviour from a reference + /// another constructor replicating behavior from a reference /// implementation can be added. /// /// PRNG implementations should make sure `from_seed` never panics. In the diff --git a/src/lib.rs b/src/lib.rs index a74913ff8c1..95e39a67830 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -270,6 +270,11 @@ extern crate rand_core; #[cfg(feature="simd_support")] extern crate stdsimd; +#[cfg(feature="simd_support")] +mod simd_shuffle; +#[cfg(feature="simd_support")] +pub use simd_shuffle::SimdShuf; + // Re-exports from rand_core pub use rand_core::{RngCore, CryptoRng, SeedableRng}; pub use rand_core::{ErrorKind, Error}; @@ -318,7 +323,7 @@ pub mod isaac { #[cfg(feature="simd_support")] use stdsimd::simd::*; -use core::{marker, mem, slice}; +use core::{marker, mem, slice, ptr}; use distributions::{Distribution, Standard}; use distributions::uniform::{SampleUniform, UniformSampler}; @@ -622,7 +627,8 @@ pub trait Rng: RngCore { // invariant: elements with index >= i have been locked in place. i -= 1; // lock element i in place. - values.swap(i, self.gen_range(0, i + 1)); + let r = self.gen_range(0, i + 1); + unsafe { swap_unchecked(values, i, r); } } } @@ -688,6 +694,15 @@ pub trait Rng: RngCore { } } +/// Use to remove bound checks when the compiler isn't smart enough about +/// `gen_range` and related. 
+#[inline] +unsafe fn swap_unchecked<T>(values: &mut [T], a: usize, b: usize) { + let pa: *mut T = values.get_unchecked_mut(a); + let pb: *mut T = values.get_unchecked_mut(b); + ptr::swap(pa, pb); +} + impl<R: RngCore + ?Sized> Rng for R {} /// Trait for casting types to byte slices diff --git a/src/prng/mod.rs b/src/prng/mod.rs index bfba4885fbf..84ff94ed4ae 100644 --- a/src/prng/mod.rs +++ b/src/prng/mod.rs @@ -21,7 +21,7 @@ //! //! In simple terms, the basic PRNGs are often predictable; CSPRNGs should not //! be predictable *when used correctly*. -//! +//! //! Contents of this documentation: //! //! 1. [The generators](#the-generators) @@ -136,7 +136,7 @@ //! 256 bits would be approximately the minimum secure size. In practice, //! CSPRNGs tend to use quite a bit more, [`ChaChaRng`] is relatively small with //! 136 bytes of state. -//! +//! //! ## Initialization time //! //! The time required to initialize new generators varies significantly. Many @@ -320,6 +320,8 @@ pub mod hc128; pub mod isaac; pub mod isaac64; mod sfc32; +#[cfg(feature = "simd_support")] +mod sfc_alt; mod xorshift; mod isaac_array; @@ -328,6 +330,7 @@ pub use self::chacha::ChaChaRng; pub use self::hc128::Hc128Rng; pub use self::isaac::IsaacRng; pub use self::isaac64::Isaac64Rng; -pub use self::sfc32::Sfc32Rng; pub use self::sfc32::*; +#[cfg(feature = "simd_support")] +pub use self::sfc_alt::*; pub use self::xorshift::XorShiftRng; diff --git a/src/rngs/jitter.rs b/src/rngs/jitter.rs index a31a1df67e3..0dc308614eb 100644 --- a/src/rngs/jitter.rs +++ b/src/rngs/jitter.rs @@ -598,11 +598,11 @@ impl JitterRng { self.stir_pool(); self.data } - + /// Basic quality tests on the timer, by measuring CPU timing jitter a few /// hundred times. /// - /// If succesful, this will return the estimated number of rounds necessary + /// If successful, this will return the estimated number of rounds necessary /// to collect 64 bits of entropy. Otherwise a [`TimerError`] with the cause /// of the failure will be returned.
/// diff --git a/src/rngs/os.rs b/src/rngs/os.rs index 01fdfb0c1f5..8d9029ef8b7 100644 --- a/src/rngs/os.rs +++ b/src/rngs/os.rs @@ -87,7 +87,7 @@ use rand_core::{CryptoRng, RngCore, Error, impls}; /// # Panics /// /// `OsRng` is extremely unlikely to fail if `OsRng::new()`, and one read from -/// it, where succesfull. But in case it does fail, only [`try_fill_bytes`] is +/// it, were successful. But in case it does fail, only [`try_fill_bytes`] is /// able to report the cause. Depending on the error the other [`RngCore`] /// methods will retry several times, and panic in case the error remains. /// @@ -963,7 +963,7 @@ mod imp { #[cfg(windows)] mod imp { extern crate winapi; - + use {Error, ErrorKind}; use super::OsRngImpl; diff --git a/src/simd_shuffle.rs b/src/simd_shuffle.rs new file mode 100644 index 00000000000..12637b679fc --- /dev/null +++ b/src/simd_shuffle.rs @@ -0,0 +1,120 @@ +//! An SIMD shuffle implementation. +//! +//! Loosely based on Daniel Lemire's [SIMDxorshift]. +//! +//! [SIMDxorshift]: https://github.com/lemire/SIMDxorshift + +// use core::mem::size_of; +use stdsimd::simd::*; + +// use distributions::range::SampleSingleHigh; +use distributions::uniform::WideningMultiply; +use {swap_unchecked, Rng}; + +/// A trait for shuffling slices. +pub trait SimdShuf { + /// Shuffle a mutable slice in place, using an SIMD implementation. + /// + /// To be used in the form: + /// ```rust + /// u16x8::simd_shuffle(&mut rng, &mut list); + /// ``` + /// + /// Use a vector of size greater than or equal to the PRNG output. + /// Smaller lane widths will likely be faster for equal vector sizes. + /// + /// # Panics + /// + /// If `values.len()` is larger than the maximum value of the vector's + /// lanes. (If `values.len()` is unknown, use a `u32xN` or `u64xN` + /// vector depending on [`target_pointer_width`].)
+ /// + /// [`target_pointer_width`]: https://doc.rust-lang.org/reference/attributes.html#conditional-compilation + fn simd_shuffle<R: Rng, T>(rng: &mut R, values: &mut [T]); +} + +macro_rules! impl_simd_shuf { + ($vec:ident, $scalar:ident) => { + impl SimdShuf for $vec { + // TODO: make this adapt when too many/few elements + // could match on values.len and use an appropriate + // lane width, based on chosen vector width + #[inline(always)] + fn simd_shuffle<R: Rng, T>(rng: &mut R, values: &mut [T]) { + assert!( + values.len() <= $scalar::max_value() as usize, + "Slice length too long for the vector's lanes", + ); + + // Create a vector to hold `$vec::lanes()` range bounds at + // once. This should be evaluated at compile-time. + // TODO: consider making this a macro + let mut interval = $vec::default(); + for vec_idx in 0..$vec::lanes() { + // (len, len - 1, len - 2, len - 3, ..., len - $vec::lanes() + 1) + interval = interval.replace(vec_idx, (values.len() - vec_idx) as $scalar); + } + let mut slice_idx = values.len(); + + // shuffle a multiple of `$vec::lanes()` slice elements + for _ in 0..values.len() / $vec::lanes() { + let rand_indices = rng.gen_range($vec::splat(0), interval); + + // swap each `rand_idx` with the next `slice_idx` + // TODO: could probably be optimized + for vec_idx in 0..$vec::lanes() { + slice_idx -= 1; + let rand_idx = rand_indices.extract(vec_idx) as usize; + unsafe { swap_unchecked(values, slice_idx, rand_idx); } + } + + // move onto the next interval + interval -= $vec::lanes() as $scalar; + } + + // shuffle the remaining elements + // This is likely overzealous + let remainder = values.len() % $vec::lanes(); + if remainder > 1 { + match remainder - 1 { + 1...2 => rem_shuf!(u16x2, remainder, rng, values, slice_idx, u16), + 3...4 => rem_shuf!(u16x4, remainder, rng, values, slice_idx, u16), + 5...8 => rem_shuf!(u16x8, remainder, rng, values, slice_idx, u16), + 9...16 => rem_shuf!(u16x16, remainder, rng, values, slice_idx, u16), + 17...32 =>
rem_shuf!(u16x32, remainder, rng, values, slice_idx, u16), + 33...64 | _ => rem_shuf!(u8x64, remainder, rng, values, slice_idx, u8), + } + } + } + } + }; + + // bulk implementation for scalar types + ($($vec:ident,)+, $scalar:ident) => {$( + impl_simd_shuf!($vec, $scalar); + )+}; +} + +macro_rules! rem_shuf { + ($vec:ident, $rem:ident, $rng:ident, $values:ident, $slice_idx:ident, $scalar:ty) => {{ + // We can exit interval generation early, because we only need a few + // indices. We can't avoid generating unneeded random indices however, + // so we use a default value of 1 to speed up the uniform sampling. + let mut interval = $vec::splat(1); + for vec_idx in 0..$rem - 1 { + interval = interval.replace(vec_idx, ($rem - vec_idx) as $scalar); + } + + let rand_indices = $rng.gen_range($vec::splat(0), interval); + for vec_idx in 0..$rem - 1 { + $slice_idx -= 1; + let rand_idx = rand_indices.extract(vec_idx) as usize; + unsafe { swap_unchecked($values, $slice_idx, rand_idx); } + } + }}; +} + +impl_simd_shuf!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64,, u8); +impl_simd_shuf!(u16x2, u16x4, u16x8, u16x16, u16x32,, u16); +impl_simd_shuf!(u32x2, u32x4, u32x8, u32x16,, u32); +impl_simd_shuf!(u64x2, u64x4, u64x8,, u64);