simd shuffle

TheIronBorn · Jun 20, 2018 · 849e9b2 · 849e9b2
1 parent a418ea3
commit 849e9b2
Show file tree

Hide file tree

Showing 6 changed files with 150 additions and 12 deletions.
diff --git a/rand_core/src/lib.rs b/rand_core/src/lib.rs
@@ -92,7 +92,7 @@ pub mod simd_impls;
 /// to avoid platform differences, and avoid making any changes which affect
 /// output (except by communicating that the release has breaking changes).
 ///
-/// Typically implementators will implement only one of the methods available
+/// Typically implementors will implement only one of the methods available
 /// in this trait directly, then use the helper functions from the
 /// [`rand_core::impls`] module to implement the other methods.
 ///
@@ -209,7 +209,7 @@ pub trait RngCore {
 /// Some generators may satisfy an additional property, however this is not
 /// required by this trait: if the CSPRNG's state is revealed, it should not be
 /// computationally-feasible to reconstruct output prior to this. Some other
-/// generators allow backwards-computation and are consided *reversible*.
+/// generators allow backwards-computation and are considered *reversible*.
 ///
 /// Note that this trait is provided for guidance only and cannot guarantee
 /// suitability for cryptographic applications. In general it should only be
@@ -293,7 +293,7 @@ pub trait SeedableRng: Sized {
     ///
     /// It is however not required that this function yield the same state as a
     /// reference implementation of the PRNG given equivalent seed; if necessary
-    /// another constructor replicating behaviour from a reference
+    /// another constructor replicating behavior from a reference
     /// implementation can be added.
     ///
     /// PRNG implementations should make sure `from_seed` never panics. In the

diff --git a/src/lib.rs b/src/lib.rs
@@ -270,6 +270,11 @@ extern crate rand_core;
 #[cfg(feature="simd_support")]
 extern crate stdsimd;
 
+#[cfg(feature="simd_support")]
+mod simd_shuffle;
+#[cfg(feature="simd_support")]
+pub use simd_shuffle::SimdShuf;
+
 // Re-exports from rand_core
 pub use rand_core::{RngCore, CryptoRng, SeedableRng};
 pub use rand_core::{ErrorKind, Error};
@@ -318,7 +323,7 @@ pub mod isaac {
 
 #[cfg(feature="simd_support")]
 use stdsimd::simd::*;
-use core::{marker, mem, slice};
+use core::{marker, mem, slice, ptr};
 use distributions::{Distribution, Standard};
 use distributions::uniform::{SampleUniform, UniformSampler};
 
@@ -622,7 +627,8 @@ pub trait Rng: RngCore {
             // invariant: elements with index >= i have been locked in place.
             i -= 1;
             // lock element i in place.
-            values.swap(i, self.gen_range(0, i + 1));
+            let r = self.gen_range(0, i + 1);
+            unsafe { swap_unchecked(values, i, r); }
         }
     }
 
@@ -688,6 +694,15 @@ pub trait Rng: RngCore {
     }
 }
 
+/// Swaps `values[a]` and `values[b]` without bounds checks, for when the compiler isn't
+/// smart enough about `gen_range` and related. Unsafe: `a` and `b` must be `< values.len()`.
+#[inline]
+unsafe fn swap_unchecked<T>(values: &mut [T], a: usize, b: usize) {
+    let pa: *mut T = values.get_unchecked_mut(a);
+    let pb: *mut T = values.get_unchecked_mut(b);
+    ptr::swap(pa, pb); // `ptr::swap` permits overlapping pointers, so `a == b` is fine
+}
+
 impl<R: RngCore + ?Sized> Rng for R {}
 
 /// Trait for casting types to byte slices

diff --git a/src/prng/mod.rs b/src/prng/mod.rs
@@ -21,7 +21,7 @@
 //!
 //! In simple terms, the basic PRNGs are often predictable; CSPRNGs should not
 //! be predictable *when used correctly*.
-//! 
+//!
 //! Contents of this documentation:
 //!
 //! 1. [The generators](#the-generators)
@@ -136,7 +136,7 @@
 //! 256 bits would be approximately the minimum secure size. In practice,
 //! CSPRNGs tend to use quite a bit more, [`ChaChaRng`] is relatively small with
 //! 136 bytes of state.
-//! 
+//!
 //! ## Initialization time
 //!
 //! The time required to initialize new generators varies significantly. Many
@@ -320,6 +320,8 @@ pub mod hc128;
 pub mod isaac;
 pub mod isaac64;
 mod sfc32;
+#[cfg(feature = "simd_support")]
+mod sfc_alt;
 mod xorshift;
 
 mod isaac_array;
@@ -328,6 +330,7 @@ pub use self::chacha::ChaChaRng;
 pub use self::hc128::Hc128Rng;
 pub use self::isaac::IsaacRng;
 pub use self::isaac64::Isaac64Rng;
-pub use self::sfc32::Sfc32Rng;
 pub use self::sfc32::*;
+#[cfg(feature = "simd_support")]
+pub use self::sfc_alt::*;
 pub use self::xorshift::XorShiftRng;
diff --git a/src/rngs/jitter.rs b/src/rngs/jitter.rs
@@ -598,11 +598,11 @@ impl JitterRng {
         self.stir_pool();
         self.data
     }
-    
+
     /// Basic quality tests on the timer, by measuring CPU timing jitter a few
     /// hundred times.
     ///
-    /// If succesful, this will return the estimated number of rounds necessary
+    /// If successful, this will return the estimated number of rounds necessary
     /// to collect 64 bits of entropy. Otherwise a [`TimerError`] with the cause
     /// of the failure will be returned.
     ///

diff --git a/src/rngs/os.rs b/src/rngs/os.rs
@@ -87,7 +87,7 @@ use rand_core::{CryptoRng, RngCore, Error, impls};
 /// # Panics
 ///
 /// `OsRng` is extremely unlikely to fail if `OsRng::new()`, and one read from
-/// it, where succesfull. But in case it does fail, only [`try_fill_bytes`] is
+/// it, were successful. But in case it does fail, only [`try_fill_bytes`] is
 /// able to report the cause. Depending on the error the other [`RngCore`]
 /// methods will retry several times, and panic in case the error remains.
 ///
@@ -963,7 +963,7 @@ mod imp {
 #[cfg(windows)]
 mod imp {
     extern crate winapi;
-    
+
     use {Error, ErrorKind};
     use super::OsRngImpl;
 

diff --git a/src/simd_shuffle.rs b/src/simd_shuffle.rs
@@ -0,0 +1,120 @@
+//! An SIMD shuffle implementation.
+//!
+//! Loosely based on Daniel Lemire's [SIMDxorshift].
+//!
+//! [SIMDxorshift]: https://github.com/lemire/SIMDxorshift
+
+// use core::mem::size_of;
+use stdsimd::simd::*;
+
+// use distributions::range::SampleSingleHigh;
+use distributions::uniform::WideningMultiply;
+use {swap_unchecked, Rng};
+
+/// A trait for shuffling slices, implemented in this module for SIMD vector types.
+pub trait SimdShuf {
+    /// Shuffle a mutable slice in place, using an SIMD implementation.
+    ///
+    /// To be used in the form:
+    /// ```rust,ignore
+    /// u16x8::simd_shuffle(&mut rng, &mut list);
+    /// ```
+    ///
+    /// Use a vector of size greater than or equal to the PRNG output.
+    /// Smaller lane widths will likely be faster for equal vector sizes.
+    ///
+    /// # Panics
+    ///
+    /// If `values.len()` is larger than the maximum value of the vector's
+    /// lanes. (If `values.len()` is unknown, use a `u32xN` or `u64xN`
+    /// vector depending on [`target_pointer_width`].)
+    ///
+    /// [`target_pointer_width`]: https://doc.rust-lang.org/reference/attributes.html#conditional-compilation
+    fn simd_shuffle<R: Rng, T>(rng: &mut R, values: &mut [T]);
+}
+
+// Implements `SimdShuf` for vector type `$vec`, whose lanes are of type `$scalar`.
+macro_rules! impl_simd_shuf {
+    ($vec:ident, $scalar:ident) => {
+        impl SimdShuf for $vec {
+            // TODO: make this adapt when too many/few elements
+            //       could match on values.len and use an appropriate
+            //       lane width, based on chosen vector width
+            #[inline(always)]
+            fn simd_shuffle<R: Rng, T>(rng: &mut R, values: &mut [T]) {
+                assert!(
+                    values.len() <= $scalar::max_value() as usize,
+                    "Slice length too long for the vector's lanes",
+                );
+
+                // Create a vector to hold `$vec::lanes()` range bounds at once:
+                // (len, len - 1, ..., len - lanes + 1). This should be evaluated at compile-time.
+                // TODO: consider making this a macro
+                let mut interval = $vec::default();
+                for vec_idx in 0..$vec::lanes() {
+                    // wrapping: plain `len - vec_idx` underflows (debug panic) when len < lanes
+                    interval = interval.replace(vec_idx, values.len().wrapping_sub(vec_idx) as $scalar);
+                }
+                let mut slice_idx = values.len();
+
+                // shuffle a multiple of `$vec::lanes()` slice elements
+                for _ in 0..values.len() / $vec::lanes() {
+                    let rand_indices = rng.gen_range($vec::splat(0), interval);
+
+                    // swap each `rand_idx` with the next `slice_idx`
+                    // TODO: could probably be optimized
+                    for vec_idx in 0..$vec::lanes() {
+                        slice_idx -= 1;
+                        let rand_idx = rand_indices.extract(vec_idx) as usize;
+                        // lane bound == slice_idx + 1, so (assuming half-open `gen_range`) both are in bounds
+                        unsafe { swap_unchecked(values, slice_idx, rand_idx); }
+                    }
+
+                    interval -= $vec::lanes() as $scalar; // move onto the next interval
+                }
+
+                // shuffle the remaining elements (this is likely overzealous)
+                let remainder = values.len() % $vec::lanes();
+                if remainder > 1 {
+                    match remainder - 1 {
+                        1...2 => rem_shuf!(u16x2, remainder, rng, values, slice_idx, u16),
+                        3...4 => rem_shuf!(u16x4, remainder, rng, values, slice_idx, u16),
+                        5...8 => rem_shuf!(u16x8, remainder, rng, values, slice_idx, u16),
+                        9...16 => rem_shuf!(u16x16, remainder, rng, values, slice_idx, u16),
+                        17...32 => rem_shuf!(u16x32, remainder, rng, values, slice_idx, u16),
+                        33...64 | _ => rem_shuf!(u8x64, remainder, rng, values, slice_idx, u8),
+                    }
+                }
+            }
+        }
+    };
+
+    // bulk implementation for scalar types
+    ($($vec:ident,)+, $scalar:ident) => {$(
+        impl_simd_shuf!($vec, $scalar);
+    )+};
+}
+
+macro_rules! rem_shuf {
+    ($lanes:ident, $left:ident, $gen:ident, $slice:ident, $cursor:ident, $lane_ty:ty) => {{
+        // Only the first `$left - 1` lanes receive real bounds
+        // (`$left`, `$left - 1`, ..., 2). The unused lanes are still sampled,
+        // so they keep a default bound of 1 to speed up the uniform sampling.
+        let draws = $left - 1;
+        let bounds = (0..draws).fold($lanes::splat(1), |acc, lane| {
+            acc.replace(lane, ($left - lane) as $lane_ty)
+        });
+
+        let picks = $gen.gen_range($lanes::splat(0), bounds);
+        for lane in 0..draws {
+            $cursor -= 1;
+            let target = picks.extract(lane) as usize;
+            unsafe { swap_unchecked($slice, $cursor, target); }
+        }
+    }};
+}
+
+impl_simd_shuf!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64,, u8); // `,,` ends the vector list; the lane type follows
+impl_simd_shuf!(u16x2, u16x4, u16x8, u16x16, u16x32,, u16);
+impl_simd_shuf!(u32x2, u32x4, u32x8, u32x16,, u32);
+impl_simd_shuf!(u64x2, u64x4, u64x8,, u64);